From fa0f0834b021ae452a8109060c740e41b1b2994e Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Thu, 17 Jul 2025 12:07:07 -0700 Subject: [PATCH 001/366] DOC v25.10 Updates [skip ci] --- .../cuda12.9-conda/devcontainer.json | 8 +-- .devcontainer/cuda12.9-pip/devcontainer.json | 6 +- .github/workflows/build.yaml | 28 +++++----- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 56 +++++++++---------- .../workflows/pr_issue_status_automation.yml | 8 +-- .github/workflows/test.yaml | 30 +++++----- .../trigger-breaking-change-alert.yaml | 2 +- README.md | 2 +- VERSION | 2 +- .../all_cuda-129_arch-aarch64.yaml | 10 ++-- .../all_cuda-129_arch-x86_64.yaml | 10 ++-- cpp/examples/versions.cmake | 2 +- dependencies.yaml | 54 +++++++++--------- java/ci/README.md | 4 +- java/pom.xml | 2 +- python/cudf/cudf/VERSION | 2 +- .../dependencies.yaml | 6 +- python/cudf/pyproject.toml | 14 ++--- python/cudf_kafka/pyproject.toml | 2 +- python/cudf_polars/docs/overview.md | 2 +- python/cudf_polars/pyproject.toml | 6 +- python/custreamz/pyproject.toml | 4 +- python/dask_cudf/pyproject.toml | 6 +- python/libcudf/pyproject.toml | 8 +-- python/pylibcudf/pyproject.toml | 10 ++-- 26 files changed, 143 insertions(+), 143 deletions(-) diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 1e00021f0ed..5c010923260 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda" ], "hostRequirements": { "gpu": "optional" }, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:25.8": { + "ghcr.io/rapidsai/devcontainers/features/cuda:25.10": { "version": "12.9", "installCompilers": false, "installProfilers": true, @@ -38,7 +38,7 @@ "installnvJPEG": false, "pruneStaticLibs": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/cuda", diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 0debf91e159..666e6b872f6 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.08-cpp-cuda12.9-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip" ], "hostRequirements": { "gpu": "optional" }, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index caea3dad7c4..8576f6af66b 100644 --- 
a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -46,7 +46,7 @@ jobs: cpp-build: needs: [telemetry-setup] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: python-build: needs: [telemetry-setup, cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -90,7 +90,7 @@ jobs: wheel-build-libcudf: needs: [telemetry-setup] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -105,7 +105,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -116,7 +116,7 @@ jobs: wheel-build-pylibcudf: needs: [telemetry-setup, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -128,7 +128,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -139,7 +139,7 @@ jobs: wheel-build-cudf: needs: [telemetry-setup, wheel-build-pylibcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -151,7 +151,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -162,7 +162,7 @@ jobs: wheel-build-dask-cudf: needs: [telemetry-setup, wheel-build-cudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -177,7 +177,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -188,7 +188,7 @@ jobs: wheel-build-cudf-polars: needs: [telemetry-setup, wheel-build-pylibcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -203,7 +203,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index ef06159ab90..56fba1f6d8f 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -22,7 +22,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LINUX_VER": "ubuntu24.04", "GPU": "l4", "DRIVER": "latest", "DEPENDENCIES": "newest"}]' build_type: nightly diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 2976f30184d..883d005cc61 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -43,7 +43,7 @@ jobs: - telemetry-setup - third-party-integration-tests-cudf-pandas secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -68,7 +68,7 @@ jobs: changed-files: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 with: files_yaml: | test_cpp: @@ -130,14 +130,14 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.08 + uses: 
rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize spark-rapids-jni" conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: pull-request node_type: "cpu16" @@ -145,7 +145,7 @@ jobs: cpp-linters: secrets: inherit needs: checks - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request script: "ci/cpp_linters.sh" @@ -153,13 +153,13 @@ jobs: conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.10 with: build_type: pull-request conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -167,14 +167,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_python.sh conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -183,7 +183,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -191,7 +191,7 @@ jobs: conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request @@ -202,7 +202,7 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request @@ -213,7 +213,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request 
node_type: "gpu-l4-latest-1" @@ -223,7 +223,7 @@ jobs: wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -235,7 +235,7 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" @@ -244,7 +244,7 @@ jobs: wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" @@ -253,7 +253,7 @@ jobs: wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -261,7 +261,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -273,7 +273,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -283,7 +283,7 @@ jobs: cudf-polars-polars-tests: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -292,7 +292,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -304,7 +304,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -314,7 +314,7 @@ jobs: devcontainer: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: node_type: "cpu32" arch: '["amd64"]' @@ -326,7 +326,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -336,7 +336,7 @@ jobs: third-party-integration-tests-cudf-pandas: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request branch: ${{ inputs.branch }} @@ -351,7 +351,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LINUX_VER": "ubuntu24.04", "GPU": "l4", "DRIVER": "latest", "DEPENDENCIES": "newest"}]' @@ -362,7 +362,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: node_type: "cpu4" build_type: pull-request @@ -370,7 +370,7 @@ jobs: narwhals-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 46973456a90..148d83e73d6 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-25.08 + uses: 
rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-25.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-25.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -79,7 +79,7 @@ jobs: update-release: # This job sets the PR and its linked issues to the release they are targeting - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: [get-project-id, process-branch-name] with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 293488e6765..de86f3d5b50 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,7 +24,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -53,7 +53,7 @@ jobs: script: "ci/test_cpp_memcheck.sh" cpp-linters: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -63,7 +63,7 @@ jobs: file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -82,7 +82,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -94,7 +94,7 @@ jobs: script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -115,7 +115,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -124,7 +124,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -133,7 +133,7 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -145,7 +145,7 @@ jobs: ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml wheel-tests-cudf-polars: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -154,7 +154,7 @@ jobs: script: "ci/test_wheel_cudf_polars.sh" cudf-polars-polars-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -163,7 +163,7 @@ jobs: script: "ci/test_cudf_polars_polars_tests.sh" narwhals-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 593fcb1086a..48bf37afc40 100644 --- 
a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/README.md b/README.md index d5c5782882f..04980005846 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ pip install cudf-cu12 cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: ```bash -conda install -c rapidsai -c conda-forge cudf=25.08 +conda install -c rapidsai -c conda-forge cudf=25.10 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index 3af4bda0205..296e35288d1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.08.00 +25.10.00 diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index f7449e015f9..d53a7b0e731 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -27,7 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.8.*,>=0.0.0a0 +- dask-cuda==25.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -40,9 +40,9 @@ dependencies: - ipython - jupyter_client - libcurand-dev -- libkvikio==25.8.*,>=0.0.0a0 +- libkvikio==25.10.*,>=0.0.0a0 - librdkafka>=2.8.0,<2.9.0a0 -- librmm==25.8.*,>=0.0.0a0 +- librmm==25.10.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -81,10 +81,10 @@ dependencies: - python>=3.10,<3.14 - pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - rapids-logger==0.1.*,>=0.0.0a0 - rich -- rmm==25.8.*,>=0.0.0a0 +- rmm==25.10.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index d34654274db..96aac76cb86 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==25.8.*,>=0.0.0a0 +- dask-cuda==25.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -41,9 +41,9 @@ dependencies: - jupyter_client - libcufile-dev - libcurand-dev -- libkvikio==25.8.*,>=0.0.0a0 +- libkvikio==25.10.*,>=0.0.0a0 - librdkafka>=2.8.0,<2.9.0a0 -- librmm==25.8.*,>=0.0.0a0 +- librmm==25.10.*,>=0.0.0a0 - make - mmh3 - moto>=4.0.8 @@ -82,10 +82,10 @@ dependencies: - python>=3.10,<3.14 - pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - rapids-logger==0.1.*,>=0.0.0a0 - rich -- rmm==25.8.*,>=0.0.0a0 +- rmm==25.10.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 8d4a8335b47..4ef1f84f8c6 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -12,4 +12,4 @@ # the License. 
# ============================================================================= -set(CUDF_TAG branch-25.08) +set(CUDF_TAG branch-25.10) diff --git a/dependencies.yaml b/dependencies.yaml index 5123071d430..60fb3ab9ddb 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -588,7 +588,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==25.8.*,>=0.0.0a0 + - dask-cuda==25.10.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -718,14 +718,14 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==25.8.*,>=0.0.0a0 + - rapids-dask-dependency==25.10.*,>=0.0.0a0 - nvidia-ml-py run_dask_cudf: common: - output_types: [conda, requirements, pyproject] packages: - pynvml>=12.0.0,<13.0.0a0 - - rapids-dask-dependency==25.8.*,>=0.0.0a0 + - rapids-dask-dependency==25.10.*,>=0.0.0a0 run_custreamz: common: - output_types: conda @@ -757,9 +757,9 @@ dependencies: common: - output_types: conda packages: - - libcudf-example==25.8.*,>=0.0.0a0 - - libcudf_kafka==25.8.*,>=0.0.0a0 - - libcudf-tests==25.8.*,>=0.0.0a0 + - libcudf-example==25.10.*,>=0.0.0a0 + - libcudf_kafka==25.10.*,>=0.0.0a0 + - libcudf-tests==25.10.*,>=0.0.0a0 test_java: common: - output_types: conda @@ -874,7 +874,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==25.8.*,>=0.0.0a0 + - dask-cuda==25.10.*,>=0.0.0a0 specific: - output_types: [conda, requirements] matrices: @@ -891,7 +891,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==25.8.*,>=0.0.0a0 + - dask-cuda==25.10.*,>=0.0.0a0 - *numpy - rich test_python_narwhals: @@ -906,7 +906,7 @@ dependencies: common: - output_types: conda packages: - - &libcudf_unsuffixed libcudf==25.8.*,>=0.0.0a0 + - &libcudf_unsuffixed libcudf==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -919,13 +919,13 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libcudf-cu12==25.8.*,>=0.0.0a0 + - libcudf-cu12==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*libcudf_unsuffixed]} depends_on_pylibcudf: common: - output_types: conda packages: - - &pylibcudf_unsuffixed pylibcudf==25.8.*,>=0.0.0a0 + - &pylibcudf_unsuffixed pylibcudf==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -938,13 +938,13 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibcudf-cu12==25.8.*,>=0.0.0a0 + - pylibcudf-cu12==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcudf_unsuffixed]} depends_on_pylibcudf_pyarrow: common: - output_types: conda packages: - - &plc_unsuffixed pylibcudf==25.8.*,>=0.0.0a0 + - &plc_unsuffixed pylibcudf==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -957,13 +957,13 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibcudf-cu12[pyarrow]==25.8.*,>=0.0.0a0 + - pylibcudf-cu12[pyarrow]==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*plc_unsuffixed]} depends_on_cudf: common: - output_types: conda packages: - - &cudf_unsuffixed cudf==25.8.*,>=0.0.0a0 + - &cudf_unsuffixed cudf==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -976,13 +976,13 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf-cu12==25.8.*,>=0.0.0a0 + - 
cudf-cu12==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_unsuffixed]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_unsuffixed cudf_kafka==25.8.*,>=0.0.0a0 + - &cudf_kafka_unsuffixed cudf_kafka==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -995,7 +995,7 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - cudf_kafka-cu12==25.8.*,>=0.0.0a0 + - cudf_kafka-cu12==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_unsuffixed]} depends_on_cupy: common: @@ -1013,7 +1013,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.8.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==25.10.*,>=0.0.0a0 - output_types: requirements packages: - --extra-index-url=https://pypi.nvidia.com @@ -1025,7 +1025,7 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.8.*,>=0.0.0a0 + - libkvikio-cu12==25.10.*,>=0.0.0a0 - matrix: packages: - *libkvikio_unsuffixed @@ -1033,7 +1033,7 @@ dependencies: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==25.8.*,>=0.0.0a0 + - &librmm_unsuffixed librmm==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1046,7 +1046,7 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==25.8.*,>=0.0.0a0 + - librmm-cu12==25.10.*,>=0.0.0a0 - matrix: packages: - *librmm_unsuffixed @@ -1054,7 +1054,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_unsuffixed rmm==25.8.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -1067,7 +1067,7 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - rmm-cu12==25.8.*,>=0.0.0a0 + - rmm-cu12==25.10.*,>=0.0.0a0 - matrix: packages: - *rmm_unsuffixed @@ -1108,17 +1108,17 @@ dependencies: common: - output_types: conda packages: - - dask-cudf==25.8.*,>=0.0.0a0 + - dask-cudf==25.10.*,>=0.0.0a0 depends_on_custreamz: common: - output_types: conda packages: - - custreamz==25.8.*,>=0.0.0a0 + - custreamz==25.10.*,>=0.0.0a0 depends_on_cudf_polars: common: - output_types: conda packages: - - cudf-polars==25.8.*,>=0.0.0a0 + - cudf-polars==25.10.*,>=0.0.0a0 depends_on_narwhals: common: - output_types: [conda, requirements, pyproject] diff --git a/java/ci/README.md b/java/ci/README.md index bb2ba24fc63..5597cb22109 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:12.9.1-devel-rocky8 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.08 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.10 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-25.08.0-SNAPSHOT-cuda12.jar. +You can find the cuDF jar in java/target/ like cudf-25.10.0-SNAPSHOT-cuda12.jar. 
diff --git a/java/pom.xml b/java/pom.xml index 720a176f24e..6db3ae68ece 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 25.08.0-SNAPSHOT + 25.10.0-SNAPSHOT cudfjni diff --git a/python/cudf/cudf/VERSION b/python/cudf/cudf/VERSION index 3af4bda0205..296e35288d1 100644 --- a/python/cudf/cudf/VERSION +++ b/python/cudf/cudf/VERSION @@ -1 +1 @@ -25.08.00 +25.10.00 diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index bad5d9af315..98b4a31a391 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -188,7 +188,7 @@ dependencies: common: - output_types: conda packages: - - cudf==25.8.*,>=0.0.0a0 + - cudf==25.10.*,>=0.0.0a0 - pandas - pytest - pytest-xdist @@ -264,13 +264,13 @@ dependencies: common: - output_types: conda packages: - - cuml==25.8.*,>=0.0.0a0 + - cuml==25.10.*,>=0.0.0a0 - scikit-learn test_cugraph: common: - output_types: conda packages: - - cugraph==25.8.*,>=0.0.0a0 + - cugraph==25.10.*,>=0.0.0a0 - networkx test_ibis: common: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index ec14f0e209d..55ecf7f46fb 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "cuda-python>=12.6.2,<13.0a0", "cupy-cuda12x>=12.0.0", "fsspec>=0.6.0", - "libcudf==25.8.*,>=0.0.0a0", + "libcudf==25.10.*,>=0.0.0a0", "numba-cuda[cu12]>=0.16.0,<0.17.0a0", "numba>=0.59.1,<0.62.0a0", "numpy>=1.23,<3.0a0", @@ -31,9 +31,9 @@ dependencies = [ "pandas>=2.0,<2.4.0dev0", "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", - "pylibcudf==25.8.*,>=0.0.0a0", + "pylibcudf==25.10.*,>=0.0.0a0", "rich", - "rmm==25.8.*,>=0.0.0a0", + "rmm==25.10.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -121,11 +121,11 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.30.4", "cython>=3.0.3", - "libcudf==25.8.*,>=0.0.0a0", - "librmm==25.8.*,>=0.0.0a0", + "libcudf==25.10.*,>=0.0.0a0", + "librmm==25.10.*,>=0.0.0a0", "ninja", - "pylibcudf==25.8.*,>=0.0.0a0", - "rmm==25.8.*,>=0.0.0a0", + "pylibcudf==25.10.*,>=0.0.0a0", + "rmm==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.scikit-build] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index a13ba960506..f4be4552feb 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==25.8.*,>=0.0.0a0", + "cudf==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index d3087d00f2d..df6af87bbf0 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -8,7 +8,7 @@ You will need: preferred configuration. 
Or else, use [rustup](https://www.rust-lang.org/tools/install) 2. A [cudf development - environment](https://github.com/rapidsai/cudf/blob/branch-25.08/CONTRIBUTING.md#setting-up-your-build-environment). + environment](https://github.com/rapidsai/cudf/blob/branch-25.10/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. :::{note} diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f29d6370a24..bf3968d22e3 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "polars>=1.28,<1.32", - "pylibcudf==25.8.*,>=0.0.0a0", + "pylibcudf==25.10.*,>=0.0.0a0", "typing-extensions; python_version < '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -37,7 +37,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "dask-cuda==25.8.*,>=0.0.0a0", + "dask-cuda==25.10.*,>=0.0.0a0", "numpy>=1.23,<3.0a0", "pytest", "pytest-cov", @@ -46,7 +46,7 @@ test = [ ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. experimental = [ "nvidia-ml-py", - "rapids-dask-dependency==25.8.*,>=0.0.0a0", + "rapids-dask-dependency==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 0bbcadcd8a5..0b961894d2d 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -20,8 +20,8 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "confluent-kafka>=2.8.0,<2.9.0a0", - "cudf==25.8.*,>=0.0.0a0", - "cudf_kafka==25.8.*,>=0.0.0a0", + "cudf==25.10.*,>=0.0.0a0", + "cudf_kafka==25.10.*,>=0.0.0a0", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 0939ba23308..f0d1d91fbfe 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -19,13 +19,13 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cudf==25.8.*,>=0.0.0a0", + "cudf==25.10.*,>=0.0.0a0", "cupy-cuda12x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", "pandas>=2.0,<2.4.0dev0", "pynvml>=12.0.0,<13.0.0a0", - "rapids-dask-dependency==25.8.*,>=0.0.0a0", + "rapids-dask-dependency==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", @@ -47,7 +47,7 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==25.8.*,>=0.0.0a0", + "dask-cuda==25.10.*,>=0.0.0a0", "pytest", "pytest-cov", "pytest-xdist", diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 0cdf864bf3a..7b69543d898 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -38,8 +38,8 @@ classifiers = [ "Environment :: GPU :: NVIDIA CUDA", ] dependencies = [ - "libkvikio==25.8.*,>=0.0.0a0", - "librmm==25.8.*,>=0.0.0a0", + "libkvikio==25.10.*,>=0.0.0a0", + "librmm==25.10.*,>=0.0.0a0", "rapids-logger==0.1.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -80,8 +80,8 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", - "libkvikio==25.8.*,>=0.0.0a0", - "librmm==25.8.*,>=0.0.0a0", + "libkvikio==25.10.*,>=0.0.0a0", + "librmm==25.10.*,>=0.0.0a0", "ninja", "rapids-logger==0.1.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index fea40e8821e..561f9eccd64 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -19,10 +19,10 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cuda-python>=12.6.2,<13.0a0", - "libcudf==25.8.*,>=0.0.0a0", + "libcudf==25.10.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", - "rmm==25.8.*,>=0.0.0a0", + "rmm==25.10.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -119,10 +119,10 @@ matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.30.4", "cython>=3.0.3", - "libcudf==25.8.*,>=0.0.0a0", - "librmm==25.8.*,>=0.0.0a0", + "libcudf==25.10.*,>=0.0.0a0", + "librmm==25.10.*,>=0.0.0a0", "ninja", - "rmm==25.8.*,>=0.0.0a0", + "rmm==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[tool.scikit-build] From 40f7ba59bc9cd3ef43085d5825e2d14afadae3d2 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 18 Jul 2025 09:03:29 -0400 Subject: [PATCH 002/366] update versions --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 8 ++++---- .github/workflows/test.yaml | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2875cea0f3c..d9bb501c968 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -82,7 +82,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" date: ${{ inputs.date }} node_type: "gpu-l4-latest-1" script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 908da80d690..f56b086ca8b 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -197,7 +197,7 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_java.sh" conda-notebook-tests: needs: [conda-python-build, changed-files] @@ -208,7 +208,7 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -218,7 +218,7 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks @@ -378,7 +378,7 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: ci/test_narwhals.sh spark-rapids-jni: needs: changed-files diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 52e09dd7d8b..b8af253bc56 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -49,7 +49,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_cpp_memcheck.sh" cpp-linters: secrets: inherit @@ -90,7 +90,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -102,7 +102,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit @@ -170,5 +170,5 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" - container_image: "rapidsai/ci-conda:25.08-latest" + container_image: "rapidsai/ci-conda:25.10-latest" script: ci/test_narwhals.sh From 3767b7dd8e49456351d4c0220e342d0b098fe4b8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 21 Jul 2025 07:47:44 -0400 Subject: [PATCH 003/366] fix merge conflict --- dependencies.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dependencies.yaml 
b/dependencies.yaml index 23b003a12a1..63fbbe44f4c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -730,12 +730,8 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: -<<<<<<< HEAD - rapids-dask-dependency==25.10.*,>=0.0.0a0 - nvidia-ml-py -======= - - rapids-dask-dependency==25.8.*,>=0.0.0a0 ->>>>>>> branch-25.08 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] From 3955e23fd1484fbf0c395c0651da6678a4bcaa2f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 23 Jul 2025 09:57:40 -0700 Subject: [PATCH 004/366] Use more pytest fixtures and avoid GPU parameterization in cuDF classic tests (#19419) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids `pytest.mark.parametrize` with GPU objects Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19419 --- python/cudf/cudf/tests/test_scan.py | 22 ++-- python/cudf/cudf/tests/test_serialize.py | 9 +- python/cudf/cudf/tests/test_series.py | 136 +++++++++++------------ python/cudf/cudf/tests/test_seriesmap.py | 8 +- python/cudf/cudf/tests/test_setitem.py | 25 +++-- 5 files changed, 93 insertions(+), 107 deletions(-) diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py index b76566b00e2..d4b21480070 100644 --- a/python/cudf/cudf/tests/test_scan.py +++ b/python/cudf/cudf/tests/test_scan.py @@ -1,6 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from itertools import product +# Copyright (c) 2021-2025, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -11,18 +9,17 @@ from cudf.testing import assert_eq from cudf.testing._utils import INTEGER_TYPES, NUMERIC_TYPES, gen_rand -params_sizes = [0, 1, 2, 5] + +@pytest.fixture(params=NUMERIC_TYPES) +def dtype(request): + return request.param -def _gen_params(): - for t, n in product(NUMERIC_TYPES, params_sizes): - if (t == np.int8 or t == np.int16) and n > 20: - # to keep data in range - continue - yield t, n +@pytest.fixture(params=[0, 1, 5]) +def nelem(request): + return request.param -@pytest.mark.parametrize("dtype,nelem", list(_gen_params())) def test_cumsum(dtype, nelem): if dtype == np.int8: # to keep data in range @@ -86,7 +83,6 @@ def test_cumsum_decimal(dtype): assert_eq(got, expected) -@pytest.mark.parametrize("dtype,nelem", list(_gen_params())) def test_cummin(dtype, nelem): if dtype == np.int8: # to keep data in range @@ -149,7 +145,6 @@ def test_cummin_decimal(dtype): assert_eq(got, expected) -@pytest.mark.parametrize("dtype,nelem", list(_gen_params())) def test_cummax(dtype, nelem): if dtype == np.int8: # to keep data in range @@ -212,7 +207,6 @@ def test_cummax_decimal(dtype): assert_eq(got, expected) -@pytest.mark.parametrize("dtype,nelem", list(_gen_params())) def test_cumprod(dtype, nelem): if dtype == np.int8: # to keep data in range diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index a4e8ce3015c..1c508307e32 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -428,14 +428,15 @@ def test_serialize_sliced_string(): @pytest.mark.parametrize( "columns", [ - cudf.RangeIndex(2), - cudf.Index([1, 2], dtype="int8"), - cudf.MultiIndex( + lambda: cudf.RangeIndex(2), + lambda: cudf.Index([1, 2], dtype="int8"), + lambda: cudf.MultiIndex( 
levels=[["a", "b"], [1, 2]], codes=[[0, 1], [0, 1]], names=["a", 0] ), ], + ids=["RangeIndex", "Index", "MultiIndex"], ) def test_serialize_column_types_preserved(columns): - expected = cudf.DataFrame([[10, 11]], columns=columns) + expected = cudf.DataFrame([[10, 11]], columns=columns()) result = cudf.DataFrame.deserialize(*expected.serialize()) assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 751c98cc845..b47addf4311 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -29,8 +29,8 @@ ) -def _series_na_data(): - return [ +@pytest.fixture( + params=[ pd.Series([0, 1, 2, np.nan, 4, None, 6]), pd.Series( [0, 1, 2, np.nan, 4, None, 6], @@ -48,6 +48,9 @@ def _series_na_data(): pd.Series([None]), pd.Series(["a", "b", "", "c", None, "e"]), ] +) +def ps(request): + return request.param @pytest.mark.parametrize( @@ -756,17 +759,18 @@ def test_series_round_half_up(): @pytest.mark.parametrize( - "series", + "series_data", [ - cudf.Series([1.0, None, np.nan, 4.0], nan_as_null=False), - cudf.Series([1.24430, None, np.nan, 4.423530], nan_as_null=False), - cudf.Series([1.24430, np.nan, 4.423530], nan_as_null=False), - cudf.Series([-1.24430, np.nan, -4.423530], nan_as_null=False), - cudf.Series(np.repeat(np.nan, 100)), + [1.0, None, np.nan, 4.0], + [1.24430, None, np.nan, 4.423530], + [1.24430, np.nan, 4.423530], + [-1.24430, np.nan, -4.423530], + np.repeat(np.nan, 100), ], ) @pytest.mark.parametrize("decimal", [0, 1, 2, 3]) -def test_round_nan_as_null_false(series, decimal): +def test_round_nan_as_null_false(series_data, decimal): + series = cudf.Series(series_data, nan_as_null=False) pser = series.to_pandas() result = series.round(decimal) expected = pser.round(decimal) @@ -836,7 +840,6 @@ def test_series_round_decimal( assert_eq(result_half_even, expected_ser_half_even) -@pytest.mark.parametrize("ps", _series_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_series_isnull_isna(ps, nan_as_null): nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x)) @@ -852,7 +855,6 @@ def test_series_isnull_isna(ps, nan_as_null): assert_eq(ps.isna(), gs.isna()) -@pytest.mark.parametrize("ps", _series_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) def test_series_notnull_notna(ps, nan_as_null): nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x)) @@ -911,43 +913,44 @@ def test_series_memory_usage(): @pytest.mark.parametrize( - "sr,expected_psr", + "sr_data,expected_psr", [ ( - cudf.Series([1, 2, None, 3], dtype="uint8"), + pa.array([1, 2, None, 3], type=pa.uint8()), pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()), ), ( - cudf.Series([23, None, None, 32], dtype="uint16"), + pa.array([23, None, None, 32], type=pa.uint16()), pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()), ), ( - cudf.Series([None, 123, None, 1], dtype="uint32"), + pa.array([None, 123, None, 1], type=pa.uint32()), pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()), ), ( - cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"), + pa.array([234, 2323, 23432, None, None, 224], type=pa.uint64()), pd.Series( [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype() ), ), ( - cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"), + pa.array([-10, 1, None, -1, None, 3], type=pa.int8()), pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()), ), ( - cudf.Series([111, None, 222, None, 13], dtype="int16"), + pa.array([111, 
None, 222, None, 13], type=pa.int16()), pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()), ), ( - cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"), + pa.array([11, None, 22, 33, None, 2, None, 3], type=pa.int32()), pd.Series( [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype() ), ), ( - cudf.Series( - [32431, None, None, 32322, 0, 10, -32324, None], dtype="int64" + pa.array( + [32431, None, None, 32322, 0, 10, -32324, None], + type=pa.int64(), ), pd.Series( [32431, None, None, 32322, 0, 10, -32324, None], @@ -955,9 +958,9 @@ def test_series_memory_usage(): ), ), ( - cudf.Series( + pa.array( [True, None, False, None, False, True, True, False], - dtype="bool_", + type=pa.bool_(), ), pd.Series( [True, None, False, None, False, True, True, False], @@ -965,7 +968,7 @@ def test_series_memory_usage(): ), ), ( - cudf.Series( + pa.array( [ "abc", "a", @@ -976,7 +979,7 @@ def test_series_memory_usage(): None, "rapids ai", ], - dtype="object", + type=pa.string(), ), pd.Series( [ @@ -993,9 +996,9 @@ def test_series_memory_usage(): ), ), ( - cudf.Series( + pa.array( [1, 2, None, 10.2, None], - dtype="float32", + type=pa.float32(), ), pd.Series( [1, 2, None, 10.2, None], @@ -1004,7 +1007,8 @@ def test_series_memory_usage(): ), ], ) -def test_series_to_pandas_nullable_dtypes(sr, expected_psr): +def test_series_to_pandas_nullable_dtypes(sr_data, expected_psr): + sr = cudf.Series(sr_data) actual_psr = sr.to_pandas(nullable=True) assert_eq(actual_psr, expected_psr) @@ -1066,22 +1070,24 @@ def custom_add_func(sr, val): @pytest.mark.parametrize( - "data", - [cudf.Series([1, 2, 3]), cudf.Series([10, 11, 12], index=[1, 2, 3])], + "pd_data", + [pd.Series([1, 2, 3]), pd.Series([10, 11, 12], index=[1, 2, 3])], ) @pytest.mark.parametrize( "other", [ - cudf.Series([4, 5, 6]), - cudf.Series([4, 5, 6, 7, 8]), - cudf.Series([4, np.nan, 6], nan_as_null=False), + pd.Series([4, 5, 6]), + pd.Series([4, 5, 6, 7, 8]), + pd.Series([4, np.nan, 6]), [4, np.nan, 6], {1: 9}, ], ) -def test_series_update(data, other): +def test_series_update(pd_data, other): + data = cudf.Series.from_pandas(pd_data) gs = data.copy(deep=True) - if isinstance(other, cudf.Series): + if isinstance(other, pd.Series): + other = cudf.Series.from_pandas(other, nan_as_null=False) g_other = other.copy(deep=True) p_other = g_other.to_pandas() else: @@ -1344,28 +1350,20 @@ def test_explode(data, ignore_index, p_index): @pytest.mark.parametrize( - "data, expected", + "data", [ - ( - [cudf.Series([1, 2, 3]), cudf.Series([10, 20])], - cudf.Series([[1, 2, 3], [10, 20]]), - ), - ( - [cudf.Series([1, 2, 3]), None, cudf.Series([10, 20, np.nan])], - cudf.Series([[1, 2, 3], None, [10, 20, np.nan]]), - ), - ( - [cp.array([5, 6]), cudf.NA, cp.array([1])], - cudf.Series([[5, 6], None, [1]]), - ), - ( - [None, None, None, None, None, cudf.Series([10, 20])], - cudf.Series([None, None, None, None, None, [10, 20]]), - ), + [[1, 2, 3], [10, 20]], + [[1.0, 2.0, 3.0], None, [10.0, 20.0, np.nan]], + [[5, 6], None, [1]], + [None, None, None, None, None, [10, 20]], ], ) -def test_nested_series_from_sequence_data(data, expected): - actual = cudf.Series(data) +@pytest.mark.parametrize("klass", [cudf.Series, list, cp.array]) +def test_nested_series_from_sequence_data(data, klass): + actual = cudf.Series( + [klass(val) if val is not None else val for val in data] + ) + expected = cudf.Series(data) assert_eq(actual, expected) @@ -1648,14 +1646,15 @@ def test_series_add_suffix(): @pytest.mark.parametrize( - "cudf_series", + "data", [ - 
cudf.Series([0.25, 0.5, 0.2, -0.05]), - cudf.Series([0, 1, 2, np.nan, 4, cudf.NA, 6]), + [0.25, 0.5, 0.2, -0.05], + [0, 1, 2, np.nan, 4, cudf.NA, 6], ], ) @pytest.mark.parametrize("lag", [1, 2, 3, 4]) -def test_autocorr(cudf_series, lag): +def test_autocorr(data, lag): + cudf_series = cudf.Series(data) psr = cudf_series.to_pandas() cudf_corr = cudf_series.autocorr(lag=lag) @@ -1822,10 +1821,10 @@ def test_isin_numeric(data, values): assert_eq(got, expected) -@pytest.mark.xfail(raises=TypeError) def test_fill_new_category(): gs = cudf.Series(pd.Categorical(["a", "b", "c"])) - gs[0:1] = "d" + with pytest.raises(TypeError): + gs[0:1] = "d" @pytest.mark.skipif( @@ -1892,9 +1891,9 @@ def test_isin_datetime(data, values): "data", [ [], - pd.Series(["this", "is", None, "a", "test"]), - pd.Series(["test", "this", "test", "is", None, "test", "a", "test"]), - pd.Series(["0", "12", "14"]), + ["this", "is", None, "a", "test"], + ["test", "this", "test", "is", None, "test", "a", "test"], + ["0", "12", "14"], ], ) @pytest.mark.parametrize( @@ -1971,14 +1970,8 @@ def test_diff(dtype, period, data_empty): assert_eq(diffed_outcome, expected_outcome) -@pytest.mark.parametrize( - "data", - [ - ["a", "b", "c", "d", "e"], - ], -) -def test_diff_unsupported_dtypes(data): - gs = cudf.Series(data) +def test_diff_unsupported_dtypes(): + gs = cudf.Series(["a", "b", "c", "d", "e"]) with pytest.raises( TypeError, match=r"unsupported operand type\(s\)", @@ -3007,7 +3000,6 @@ def test_null_like_to_nan_pandas_compat(): assert_eq(ser, pser) -@pytest.mark.parametrize("ps", _series_na_data()) def test_roundtrip_series_plc_column(ps): expect = cudf.Series(ps) actual = cudf.Series.from_pylibcudf(*expect.to_pylibcudf()) diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index db1de7d0cf4..b232275e8c9 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -1,6 +1,5 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
-from itertools import product
 from math import floor

 import numpy as np
@@ -44,11 +43,10 @@ def test_series_map_callable_numeric_basic():
     assert_eq(expected_function, actual_function)


-@pytest.mark.parametrize("nelem", list(product([2, 10, 100, 1000])))
-def test_series_map_callable_numeric_random(nelem):
+def test_series_map_callable_numeric_random():
     # Generate data
     rng = np.random.default_rng(seed=0)
-    data = rng.random(nelem) * 100
+    data = rng.random(50) * 100

     sr = Series(data)
     pdsr = pd.Series(data)

diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index 2ab9d41cd2c..ffbf21b5548 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -10,10 +10,10 @@
 from cudf.testing._utils import assert_exceptions_equal, expect_warning_if


-@pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})])
 @pytest.mark.parametrize("arg", [[True, False, True], [True, True, True]])
 @pytest.mark.parametrize("value", [0, -1])
-def test_dataframe_setitem_bool_mask_scaler(df, arg, value):
+def test_dataframe_setitem_bool_mask_scaler(arg, value):
+    df = pd.DataFrame({"a": [1, 2, 3]})
     gdf = cudf.from_pandas(df)

     df[arg] = value
@@ -50,8 +50,6 @@ def test_dataframe_setitem_columns(df, arg, value):
     assert_eq(df, gdf, check_dtype=False)


-@pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})])
-@pytest.mark.parametrize("arg", [["b", "c"]])
 @pytest.mark.parametrize(
     "value",
     [
@@ -66,7 +64,9 @@ def test_dataframe_setitem_columns(df, arg, value):
         np.timedelta64(34234324234324234, "ns"),
     ],
 )
-def test_dataframe_setitem_new_columns(df, arg, value):
+def test_dataframe_setitem_new_columns(value):
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    arg = ["b", "c"]
     gdf = cudf.from_pandas(df)

     cudf_replace_value = value
@@ -92,11 +92,11 @@ def test_series_setitem_index():
     assert_eq(df, gdf, check_dtype=False)


-@pytest.mark.parametrize("psr", [pd.Series([1, 2, 3], index=["a", "b", "c"])])
 @pytest.mark.parametrize(
     "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]]
 )
-def test_series_set_item(psr, arg):
+def test_series_set_item(arg):
+    psr = pd.Series([1, 2, 3], index=["a", "b", "c"])
     gsr = cudf.from_pandas(psr)

     psr[arg] = 11
@@ -135,14 +135,15 @@ def test_setitem_dataframe_series_inplace(index):


 @pytest.mark.parametrize(
-    "replace_data",
+    "klass",
     [
-        [100, 200, 300, 400, 500],
-        cudf.Series([100, 200, 300, 400, 500]),
-        cudf.Series([100, 200, 300, 400, 500], index=[2, 3, 4, 5, 6]),
+        list,
+        cudf.Series,
+        lambda x: cudf.Series(x, index=[2, 3, 4, 5, 6]),
     ],
 )
-def test_series_set_equal_length_object_by_mask(replace_data):
+def test_series_set_equal_length_object_by_mask(klass):
+    replace_data = klass([100, 200, 300, 400, 500])
     psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64")
     gsr = cudf.from_pandas(psr)


From 0c8fa1cce067d5667202391f301fc0f408bede2d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 23 Jul 2025 10:24:16 -0700
Subject: [PATCH 005/366] Update S3 bucket fixture creation in test_s3
 (#19424)

It appears these s3 tests borrowed from some older patterns in pandas
tests. This PR updates the S3 bucket fixture creation to how pandas now
does it. Namely, now each test will have a unique bucket name and will
consistently clear the contents after each test runs.
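
For reference, a condensed sketch of the bucket fixture this patch adds
(the boto3-backed `moto_s3_resource` fixture it depends on is also
defined in this diff):

    import uuid

    import pytest


    @pytest.fixture
    def s3_bucket_public(moto_s3_resource):
        # Unique per-test bucket name, so tests cannot see each other's data
        bucket = moto_s3_resource.Bucket(f"cudf-test-{uuid.uuid4()}")
        bucket.create(ACL="public-read-write")
        yield bucket
        # Teardown: remove any leftover objects, then the bucket itself
        bucket.objects.delete()
        bucket.delete()
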
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19424 --- python/cudf/cudf/tests/test_s3.py | 525 +++++++++++++----------------- 1 file changed, 223 insertions(+), 302 deletions(-) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index afb82f75bcf..e9c90c899da 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -1,109 +1,77 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. -import os -import socket -from contextlib import contextmanager +import subprocess +import sys +import uuid from io import BytesIO, StringIO -import numpy as np import pandas as pd import pytest from fsspec.core import get_fs_token_paths import cudf +from cudf.io.parquet import ParquetDatasetWriter from cudf.testing import assert_eq -moto = pytest.importorskip("moto", minversion="3.1.6") -boto3 = pytest.importorskip("boto3") -s3fs = pytest.importorskip("s3fs") -ThreadedMotoServer = pytest.importorskip("moto.server").ThreadedMotoServer +@pytest.fixture(scope="module") +def monkeymodule(): + with pytest.MonkeyPatch.context() as mp: + yield mp -@pytest.fixture(scope="session") -def endpoint_ip(): - return "127.0.0.1" +@pytest.fixture(scope="module") +def aws_credentials(monkeymodule): + """Mocked AWS Credentials for moto.""" + monkeymodule.setenv("AWS_ACCESS_KEY_ID", "foobar_key") + monkeymodule.setenv("AWS_SECRET_ACCESS_KEY", "foobar_secret") + monkeymodule.setenv("S3FS_LOGGING_LEVEL", "DEBUG") + monkeymodule.setenv("AWS_SECURITY_TOKEN", "testing") + monkeymodule.setenv("AWS_SESSION_TOKEN", "foobar_session_token") + monkeymodule.setenv("AWS_DEFAULT_REGION", "us-east-1") -@pytest.fixture(scope="session") -def endpoint_port(): - # Return a free port per worker session. - sock = socket.socket() - sock.bind(("127.0.0.1", 0)) - port = sock.getsockname()[1] - sock.close() - return port - - -@contextmanager -def ensure_safe_environment_variables(): +@pytest.fixture(scope="module") +def moto_server(aws_credentials, monkeymodule): """ - Get a context manager to safely set environment variables - All changes will be undone on close, hence environment variables set - within this contextmanager will neither persist nor change global state. 
+ Fixture to set up moto server in separate process """ - saved_environ = dict(os.environ) - try: - yield - finally: - os.environ.clear() - os.environ.update(saved_environ) + moto_server = pytest.importorskip("moto.server") + server = moto_server.ThreadedMotoServer(port=0) + server.start() + host, port = server.get_host_and_port() + url = f"http://{host}:{port}" + monkeymodule.setenv("AWS_ENDPOINT_URL", url) + yield url + server.stop() -@pytest.fixture(scope="session") -def s3_base(endpoint_ip, endpoint_port): - """ - Fixture to set up moto server in separate process - """ - with ensure_safe_environment_variables(): - # Fake aws credentials exported to prevent botocore looking for - # system aws credentials, https://github.com/spulec/moto/issues/1793 - os.environ["AWS_ACCESS_KEY_ID"] = "foobar_key" - os.environ["AWS_SECRET_ACCESS_KEY"] = "foobar_secret" - os.environ["S3FS_LOGGING_LEVEL"] = "DEBUG" - os.environ["AWS_SECURITY_TOKEN"] = "foobar_security_token" - os.environ["AWS_SESSION_TOKEN"] = "foobar_session_token" - os.environ["AWS_DEFAULT_REGION"] = "us-east-1" - - # Launching moto in server mode, i.e., as a separate process - # with an S3 endpoint on localhost - - endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" - os.environ["AWS_ENDPOINT_URL"] = endpoint_uri - - server = ThreadedMotoServer(ip_address=endpoint_ip, port=endpoint_port) - server.start() - yield endpoint_uri - server.stop() - - -@pytest.fixture() -def s3so(endpoint_ip, endpoint_port): +@pytest.fixture +def s3so(moto_server): """ Returns s3 storage options to pass to fsspec """ - endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" + return {"client_kwargs": {"endpoint_url": moto_server}} - return {"client_kwargs": {"endpoint_url": endpoint_uri}} +@pytest.fixture +def moto_s3_resource(moto_server): + boto3 = pytest.importorskip("boto3") + s3 = boto3.resource("s3", endpoint_url=moto_server) + return s3 -@contextmanager -def s3_context(s3_base, bucket, files=None): - if files is None: - files = {} - with ensure_safe_environment_variables(): - client = boto3.client("s3", endpoint_url=s3_base) - client.create_bucket(Bucket=bucket, ACL="public-read-write") - for f, data in files.items(): - client.put_object(Bucket=bucket, Key=f, Body=data) - - yield s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base}) - for f, data in files.items(): - try: - client.delete_object(Bucket=bucket, Key=f) - except Exception: - pass +@pytest.fixture +def s3_bucket_public(moto_s3_resource): + """ + Create a public S3 bucket using moto. 
+ """ + bucket_name = f"cudf-test-{uuid.uuid4()}" + bucket = moto_s3_resource.Bucket(bucket_name) + bucket.create(ACL="public-read-write") + yield bucket + bucket.objects.delete() + bucket.delete() @pytest.fixture( @@ -116,84 +84,83 @@ def kvikio_remote_io(request): @pytest.fixture -def pdf(scope="module"): - df = pd.DataFrame() - df["Integer"] = np.array([2345, 11987, 9027, 9027]) - df["Float"] = np.array([9.001, 8.343, 6, 2.781]) - df["Integer2"] = np.array([2345, 106, 2088, 789277]) - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df["Boolean"] = np.array([True, False, True, False]) - return df +def pdf(): + return pd.DataFrame( + { + "Integer": [2345, 11987, 9027, 9027], + "Float": [9.001, 8.343, 6, 2.781], + "Integer2": [2345, 106, 2088, 789277], + "String": ["Alpha", "Beta", "Gamma", "Delta"], + "Boolean": [True, False, True, False], + } + ) @pytest.fixture -def pdf_ext(scope="module"): - size = 100 - df = pd.DataFrame() - df["Integer"] = np.array([i for i in range(size)]) - df["List"] = [[i] for i in range(size)] - df["Struct"] = [{"a": i} for i in range(size)] - df["String"] = (["Alpha", "Beta", "Gamma", "Delta"] * (-(size // -4)))[ - :size - ] - return df +def pdf_ext(): + size = 10 + return pd.DataFrame( + { + "Integer": [i for i in range(size)], + "List": [[i] for i in range(size)], + "Struct": [{"a": i} for i in range(size)], + "String": (["Alpha", "Beta", "Gamma", "Delta"] * (-(size // -4)))[ + :size + ], + } + ) @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) -def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): +def test_read_csv(s3_bucket_public, s3so, pdf, bytes_per_thread): # Write to buffer fname = "test_csv_reader.csv" - bucket = "csv" buffer = pdf.to_csv(index=False) + s3_bucket_public.put_object(Key=fname, Body=buffer) # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - ) + got = cudf.read_csv( + f"s3://{s3_bucket_public.name}/{fname}", + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + ) assert_eq(pdf, got) @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) -def test_read_csv_byte_range(s3_base, s3so, pdf, bytes_per_thread): +def test_read_csv_byte_range(s3_bucket_public, s3so, pdf, bytes_per_thread): # Write to buffer fname = "test_csv_reader_byte_range.csv" - bucket = "csv" buffer = pdf.to_csv(index=False) + s3_bucket_public.put_object(Key=fname, Body=buffer) # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - byte_range=(74, 73), - bytes_per_thread=bytes_per_thread, - header=None, - names=["Integer", "Float", "Integer2", "String", "Boolean"], - ) + got = cudf.read_csv( + f"s3://{s3_bucket_public.name}/{fname}", + storage_options=s3so, + byte_range=(74, 73), + bytes_per_thread=bytes_per_thread, + header=None, + names=["Integer", "Float", "Integer2", "String", "Boolean"], + ) assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) @pytest.mark.parametrize("chunksize", [None, 3]) -def test_write_csv(s3_base, s3so, pdf, chunksize): +def test_write_csv(s3_bucket_public, s3so, pdf, chunksize): # Write to buffer fname = "test_csv_writer.csv" - bucket = "csv" gdf = cudf.from_pandas(pdf) - with s3_context(s3_base=s3_base, bucket=bucket) as s3fs: - gdf.to_csv( - f"s3://{bucket}/{fname}", - index=False, - chunksize=chunksize, - storage_options=s3so, - 
) - assert s3fs.exists(f"s3://{bucket}/{fname}") - - # TODO: Update to use `storage_options` from pandas v1.2.0 - got = pd.read_csv(s3fs.open(f"s3://{bucket}/{fname}")) + gdf.to_csv( + f"s3://{s3_bucket_public.name}/{fname}", + index=False, + chunksize=chunksize, + storage_options=s3so, + ) + got = pd.read_csv( + f"s3://{s3_bucket_public.name}/{fname}", storage_options=s3so + ) assert_eq(pdf, got) @@ -201,7 +168,7 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) @pytest.mark.parametrize("columns", [None, ["Float", "String"]]) def test_read_parquet( - s3_base, + s3_bucket_public, s3so, kvikio_remote_io, pdf, @@ -209,51 +176,49 @@ def test_read_parquet( columns, ): fname = "test_parquet_reader.parquet" - bucket = "parquet" buffer = BytesIO() pdf.to_parquet(path=buffer) # Check direct path handling buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got1 = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - columns=columns, - ) + s3_bucket_public.put_object(Key=fname, Body=buffer) + got1 = cudf.read_parquet( + f"s3://{s3_bucket_public.name}/{fname}", + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, + ) expect = pdf[columns] if columns else pdf assert_eq(expect, got1) # Check fsspec file-object handling buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = get_fs_token_paths( - f"s3://{bucket}/{fname}", storage_options=s3so - )[0] - with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: - got2 = cudf.read_parquet( - f, - bytes_per_thread=bytes_per_thread, - columns=columns, - ) + fs = get_fs_token_paths( + f"s3://{s3_bucket_public.name}/{fname}", storage_options=s3so + )[0] + with fs.open(f"s3://{s3_bucket_public.name}/{fname}", mode="rb") as f: + got2 = cudf.read_parquet( + f, + bytes_per_thread=bytes_per_thread, + columns=columns, + ) assert_eq(expect, got2) @pytest.mark.parametrize("method", ["all", "parquet"]) @pytest.mark.parametrize("blocksize", [1024 * 1024, 1024]) def test_read_parquet_prefetch_options( - s3_base, + s3_bucket_public, s3so, pdf, method, blocksize, ): - bucket = "parquet" fname_1 = "test_parquet_reader_prefetch_options_1.parquet" buffer_1 = BytesIO() pdf.to_parquet(path=buffer_1) buffer_1.seek(0) + s3_bucket_public.put_object(Key=fname_1, Body=buffer_1) fname_2 = "test_parquet_reader_prefetch_options_2.parquet" buffer_2 = BytesIO() @@ -261,27 +226,20 @@ def test_read_parquet_prefetch_options( pdf_2["Integer"] += 1 pdf_2.to_parquet(path=buffer_2) buffer_2.seek(0) + s3_bucket_public.put_object(Key=fname_2, Body=buffer_2) - with s3_context( - s3_base=s3_base, - bucket=bucket, - files={ - fname_1: buffer_1, - fname_2: buffer_2, + got = cudf.read_parquet( + [ + f"s3://{s3_bucket_public.name}/{fname_1}", + f"s3://{s3_bucket_public.name}/{fname_2}", + ], + storage_options=s3so, + prefetch_options={ + "method": method, + "blocksize": blocksize, }, - ): - got = cudf.read_parquet( - [ - f"s3://{bucket}/{fname_1}", - f"s3://{bucket}/{fname_2}", - ], - storage_options=s3so, - prefetch_options={ - "method": method, - "blocksize": blocksize, - }, - columns=["String", "Integer"], - ) + columns=["String", "Integer"], + ) expect = pd.concat([pdf, pdf_2], ignore_index=True)[["String", "Integer"]] assert_eq(expect, got) @@ -291,7 +249,7 @@ def test_read_parquet_prefetch_options( @pytest.mark.parametrize("columns", [None, ["List", "Struct"]]) 
@pytest.mark.parametrize("index", [None, "Integer"]) def test_read_parquet_ext( - s3_base, + s3_bucket_public, s3so, pdf_ext, bytes_per_thread, @@ -299,7 +257,6 @@ def test_read_parquet_ext( index, ): fname = "test_parquet_reader_ext.parquet" - bucket = "parquet" buffer = BytesIO() if index: @@ -309,13 +266,13 @@ def test_read_parquet_ext( # Check direct path handling buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got1 = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - columns=columns, - ) + s3_bucket_public.put_object(Key=fname, Body=buffer) + got1 = cudf.read_parquet( + f"s3://{s3_bucket_public.name}/{fname}", + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, + ) if index: expect = ( pdf_ext.set_index(index)[columns] @@ -327,112 +284,90 @@ def test_read_parquet_ext( assert_eq(expect, got1) -def test_read_parquet_filesystem(s3_base, s3so, pdf): +def test_read_parquet_filesystem(s3_bucket_public, s3so, pdf): fname = "data.0.parquet" - # NOTE: Need a unique bucket name when a glob pattern - # is used, otherwise fsspec seems to cache the bucket - # contents, and later tests using the same bucket name - # will fail. - bucket = "test_read_parquet_filesystem" buffer = BytesIO() pdf.to_parquet(path=buffer) buffer.seek(0) fs = get_fs_token_paths("s3://", mode="rb", storage_options=s3so)[0] - with s3_context( - s3_base=s3_base, - bucket=bucket, - files={fname: buffer}, - ): - # Check that a glob pattern works - path = f"s3://{bucket}/{'data.*.parquet'}" - got = cudf.read_parquet(path, filesystem=fs) + s3_bucket_public.put_object(Key=fname, Body=buffer) + # Check that a glob pattern works + path = f"s3://{s3_bucket_public.name}/{'data.*.parquet'}" + got = cudf.read_parquet(path, filesystem=fs) assert_eq(pdf, got) -def test_read_parquet_multi_file(s3_base, s3so, pdf): +def test_read_parquet_multi_file(s3_bucket_public, s3so, pdf): fname_1 = "test_parquet_reader_multi_file_1.parquet" buffer_1 = BytesIO() pdf.to_parquet(path=buffer_1) buffer_1.seek(0) + s3_bucket_public.put_object(Key=fname_1, Body=buffer_1) fname_2 = "test_parquet_reader_multi_file_2.parquet" buffer_2 = BytesIO() pdf.to_parquet(path=buffer_2) buffer_2.seek(0) + s3_bucket_public.put_object(Key=fname_2, Body=buffer_2) - bucket = "parquet" - with s3_context( - s3_base=s3_base, - bucket=bucket, - files={ - fname_1: buffer_1, - fname_2: buffer_2, - }, - ): - got = cudf.read_parquet( - [ - f"s3://{bucket}/{fname_1}", - f"s3://{bucket}/{fname_2}", - ], - storage_options=s3so, - ).reset_index(drop=True) + got = cudf.read_parquet( + [ + f"s3://{s3_bucket_public.name}/{fname_1}", + f"s3://{s3_bucket_public.name}/{fname_2}", + ], + storage_options=s3so, + ).reset_index(drop=True) expect = pd.concat([pdf, pdf], ignore_index=True) assert_eq(expect, got) -def test_read_parquet_filters(s3_base, s3so, pdf_ext): +def test_read_parquet_filters(s3_bucket_public, s3so, pdf_ext): fname = "test_parquet_reader_filters.parquet" - bucket = "parquet" buffer = BytesIO() pdf_ext.to_parquet(path=buffer) buffer.seek(0) + s3_bucket_public.put_object(Key=fname, Body=buffer) filters = [("String", "==", "Omega")] - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - filters=filters, - ) + got = cudf.read_parquet( + f"s3://{s3_bucket_public.name}/{fname}", + storage_options=s3so, + filters=filters, + ) # All row-groups should be 
filtered out assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @pytest.mark.parametrize("partition_cols", [None, ["String"]]) -def test_write_parquet(s3_base, s3so, pdf, partition_cols): +def test_write_parquet(s3_bucket_public, s3so, pdf, partition_cols): fname_cudf = "test_parquet_writer_cudf" fname_pandas = "test_parquet_writer_pandas" - bucket = "parquet" gdf = cudf.from_pandas(pdf) - with s3_context(s3_base=s3_base, bucket=bucket) as s3fs: - gdf.to_parquet( - f"s3://{bucket}/{fname_cudf}", - partition_cols=partition_cols, - storage_options=s3so, - ) - assert s3fs.exists(f"s3://{bucket}/{fname_cudf}") - pdf.to_parquet( - f"s3://{bucket}/{fname_pandas}", - partition_cols=partition_cols, - storage_options=s3so, - ) - assert s3fs.exists(f"s3://{bucket}/{fname_pandas}") + gdf.to_parquet( + f"s3://{s3_bucket_public.name}/{fname_cudf}", + partition_cols=partition_cols, + storage_options=s3so, + ) + pdf.to_parquet( + f"s3://{s3_bucket_public.name}/{fname_pandas}", + partition_cols=partition_cols, + storage_options=s3so, + ) - got = pd.read_parquet( - f"s3://{bucket}/{fname_pandas}", storage_options=s3so - ) - expect = cudf.read_parquet( - f"s3://{bucket}/{fname_cudf}", storage_options=s3so - ) + got = pd.read_parquet( + f"s3://{s3_bucket_public.name}/{fname_pandas}", storage_options=s3so + ) + expect = cudf.read_parquet( + f"s3://{s3_bucket_public.name}/{fname_cudf}", storage_options=s3so + ) assert_eq(expect, got) -def test_read_json(s3_base, s3so): +def test_read_json(s3_bucket_public, s3so): fname = "test_json_reader.json" - bucket = "json" buffer = ( '{"amount": 100, "name": "Alice"}\n' '{"amount": 200, "name": "Bob"}\n' @@ -440,103 +375,89 @@ def test_read_json(s3_base, s3so): '{"amount": 400, "name": "Dennis"}\n' ) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_json( - f"s3://{bucket}/{fname}", - engine="cudf", - orient="records", - lines=True, - storage_options=s3so, - ) + s3_bucket_public.put_object(Key=fname, Body=buffer) + got = cudf.read_json( + f"s3://{s3_bucket_public.name}/{fname}", + engine="cudf", + orient="records", + lines=True, + storage_options=s3so, + ) expect = pd.read_json(StringIO(buffer), lines=True) assert_eq(expect, got) @pytest.mark.parametrize("columns", [None, ["string1"]]) -def test_read_orc(s3_base, s3so, datadir, columns): +def test_read_orc(s3_bucket_public, s3so, datadir, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" - bucket = "orc" expect = pd.read_orc(source_file) with open(source_file, "rb") as f: buffer = f.read() - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_orc( - f"s3://{bucket}/{fname}", - columns=columns, - storage_options=s3so, - ) + s3_bucket_public.put_object(Key=fname, Body=buffer) + got = cudf.read_orc( + f"s3://{s3_bucket_public.name}/{fname}", + columns=columns, + storage_options=s3so, + ) if columns: expect = expect[columns] assert_eq(expect, got) -def test_write_orc(s3_base, s3so, pdf): +def test_write_orc(s3_bucket_public, s3so, pdf): fname = "test_orc_writer.orc" - bucket = "orc" gdf = cudf.from_pandas(pdf) - with s3_context(s3_base=s3_base, bucket=bucket) as s3fs: - gdf.to_orc(f"s3://{bucket}/{fname}", storage_options=s3so) - assert s3fs.exists(f"s3://{bucket}/{fname}") + gdf.to_orc(f"s3://{s3_bucket_public.name}/{fname}", storage_options=s3so) - with s3fs.open(f"s3://{bucket}/{fname}") as f: - got = pd.read_orc(f) + got = pd.read_orc(f"s3://{s3_bucket_public.name}/{fname}") 
assert_eq(pdf, got) -def test_write_chunked_parquet(s3_base, s3so): +def test_write_chunked_parquet(s3_bucket_public, s3so): df1 = cudf.DataFrame({"b": [10, 11, 12], "a": [1, 2, 3]}) df2 = cudf.DataFrame({"b": [20, 30, 50], "a": [3, 2, 1]}) dirname = "chunked_writer_directory" - bucket = "parquet" - from cudf.io.parquet import ParquetDatasetWriter - - with s3_context( - s3_base=s3_base, bucket=bucket, files={dirname: BytesIO()} - ) as s3fs: - with ParquetDatasetWriter( - f"s3://{bucket}/{dirname}", - partition_cols=["a"], - storage_options=s3so, - ) as cw: - cw.write_table(df1) - cw.write_table(df2) - - # TODO: Replace following workaround with: - # expect = cudf.read_parquet(f"s3://{bucket}/{dirname}/", - # storage_options=s3so) - # after the following bug is fixed: - # https://issues.apache.org/jira/browse/ARROW-16438 - - dfs = [] - for folder in {"a=1", "a=2", "a=3"}: - assert s3fs.exists(f"s3://{bucket}/{dirname}/{folder}") - for file in s3fs.ls(f"s3://{bucket}/{dirname}/{folder}"): - df = cudf.read_parquet("s3://" + file, storage_options=s3so) - dfs.append(df) - - actual = cudf.concat(dfs).astype("int64") - assert_eq( - actual.sort_values(["b"]).reset_index(drop=True), - cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True), + with ParquetDatasetWriter( + f"s3://{s3_bucket_public.name}/{dirname}", + partition_cols=["a"], + storage_options=s3so, + ) as cw: + cw.write_table(df1) + cw.write_table(df2) + + # TODO: Replace following workaround with: + # expect = cudf.read_parquet(f"s3://{bucket}/{dirname}/", + # storage_options=s3so) + # after the following bug is fixed: + # https://issues.apache.org/jira/browse/ARROW-16438 + + dfs = [ + cudf.read_parquet( + f"s3://{s3_bucket_public.name}/{file.key}", storage_options=s3so ) + for file in s3_bucket_public.objects.all() + ] + + actual = cudf.concat(dfs).astype("int64") + assert_eq( + actual.sort_values(["b"]).reset_index(drop=True), + cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True), + ) def test_no_s3fs_on_cudf_import(): - import subprocess - import sys - - output = subprocess.check_output( + output = subprocess.check_call( [ sys.executable, "-c", - "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)", + "import cudf, sys; assert 'pyarrow._s3fs' not in sys.modules", ], cwd="/", ) - assert output.strip() == b"False" + assert output == 0 From a2077705eefd8875a3b7c8f15e0792d57e3a9e70 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 23 Jul 2025 13:25:51 -0700 Subject: [PATCH 006/366] Use more pytest fixtures and avoid GPU parameterization in cuDF classic tests (#19450) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids pytest.mark.parametrize with GPU objects * Eliminate/reduce parameterizations of input size Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19450 --- python/cudf/cudf/tests/test_dataframe.py | 780 ++++++++++------------- python/cudf/cudf/tests/test_datetime.py | 688 +++++++++----------- python/cudf/cudf/tests/test_decimal.py | 267 ++++---- 3 files changed, 735 insertions(+), 1000 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index ac82f5c8ab2..2671c0bf0f3 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -139,8 +139,8 @@ def test_init_with_missing_columns(index): 
assert_eq(pdf, gdf) -def _dataframe_na_data(): - return [ +@pytest.fixture( + params=[ pd.DataFrame( { "a": [0, 1, 2, np.nan, 4, None, 6], @@ -160,6 +160,9 @@ def _dataframe_na_data(): pd.DataFrame({"a": ["a", "b", "c", None, "e"]}), pd.DataFrame({"a": ["a", "b", "c", "d", "e"]}), ] +) +def na_data(request): + return request.param @pytest.mark.parametrize( @@ -596,24 +599,6 @@ def test_dataframe_drop_index(pdf, index, inplace): assert_eq(expected, actual) -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}, - index=pd.MultiIndex( - levels=[ - ["lama", "cow", "falcon"], - ["speed", "weight", "length"], - ], - codes=[ - [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], - [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], - ], - ), - ) - ], -) @pytest.mark.parametrize( "index,level", [ @@ -636,7 +621,19 @@ def test_dataframe_drop_index(pdf, index, inplace): ) @pytest.mark.parametrize("inplace", [True, False]) def test_dataframe_drop_multiindex(pdf, index, level, inplace): - pdf = pdf.copy() + pdf = pd.DataFrame( + {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}, + index=pd.MultiIndex( + levels=[ + ["lama", "cow", "falcon"], + ["speed", "weight", "length"], + ], + codes=[ + [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], + [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], + ], + ), + ) gdf = cudf.from_pandas(pdf) expected = pdf.drop(index=index, inplace=inplace, level=level) @@ -650,21 +647,16 @@ def test_dataframe_drop_multiindex(pdf, index, level, inplace): @pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame({"a": range(10), "b": range(10, 20), "c": range(1, 11)}), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), - ], + "data", + [{"c": range(1, 11)}, {"d": ["a", "v"] * 5}], ) @pytest.mark.parametrize( "labels", [["a"], ["b"], "a", "b", ["a", "b"]], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_labels_axis_1(pdf, labels, inplace): - pdf = pdf.copy() +def test_dataframe_drop_labels_axis_1(data, labels, inplace): + pdf = pd.DataFrame({"a": range(10), "b": range(10, 20), **data}) gdf = cudf.from_pandas(pdf) expected = pdf.drop(labels=labels, axis=1, inplace=inplace) @@ -967,7 +959,7 @@ def test_dataframe_pop(): assert empty_pdf.empty and empty_gdf.empty -@pytest.mark.parametrize("nelem", [0, 3, 100, 1000]) +@pytest.mark.parametrize("nelem", [0, 10]) def test_dataframe_astype(nelem): df = cudf.DataFrame() data = np.asarray(range(nelem), dtype=np.int32) @@ -1100,8 +1092,7 @@ def test_dataframe_to_string_with_masked_data(): assert got == expect -def test_dataframe_to_string_wide(monkeypatch): - monkeypatch.setenv("COLUMNS", "79") +def test_dataframe_to_string_wide(): # Test basic df = cudf.DataFrame({f"a{i}": [0, 1, 2] for i in range(100)}) with pd.option_context("display.max_columns", 0): @@ -1188,7 +1179,7 @@ def test_dataframe_add_col_to_object_dataframe(): cols = ["a", "b", "c"] df = pd.DataFrame(columns=cols, dtype="str") - data = {k: v for (k, v) in zip(cols, [["a"] for _ in cols])} + data = {k: ["a"] for k in cols} gdf = cudf.DataFrame(data) gdf = gdf[:0] @@ -1449,7 +1440,6 @@ def test_assign_callable(mapping): assert_eq(expect, actual) -@pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) @pytest.mark.parametrize( "method", [ @@ -1465,7 +1455,8 @@ def test_assign_callable(mapping): ], ) @pytest.mark.parametrize("seed", [None, 42]) -def test_dataframe_hash_values(nrows, method, seed): +def test_dataframe_hash_values(method, seed): + nrows = 10 warning_expected = seed is not None and method not in { "murmur3", "xxhash32", 
@@ -1597,10 +1588,10 @@ def test_dataframe_hash_values_xxhash64(): assert_eq(out_df, expected_df) -@pytest.mark.parametrize("nrows", [3, 10, 100, 1000]) -@pytest.mark.parametrize("nparts", [1, 2, 8, 13]) -@pytest.mark.parametrize("nkeys", [1, 2]) -def test_dataframe_hash_partition(nrows, nparts, nkeys): +@pytest.mark.parametrize("nparts", [1, 2]) +def test_dataframe_hash_partition(nparts): + nrows = 10 + nkeys = 2 rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( {f"key{i}": rng.integers(0, 7 - i, nrows) for i in range(nkeys)} @@ -1627,8 +1618,8 @@ def test_dataframe_hash_partition(nrows, nparts, nkeys): assert len(part_unique_keys) -@pytest.mark.parametrize("nrows", [3, 10, 50]) -def test_dataframe_hash_partition_masked_value(nrows): +def test_dataframe_hash_partition_masked_value(): + nrows = 10 gdf = cudf.DataFrame() gdf["key"] = np.arange(nrows) gdf["val"] = np.arange(nrows) + 100 @@ -1648,8 +1639,8 @@ def test_dataframe_hash_partition_masked_value(nrows): ) -@pytest.mark.parametrize("nrows", [3, 10, 50]) -def test_dataframe_hash_partition_masked_keys(nrows): +def test_dataframe_hash_partition_masked_keys(): + nrows = 5 gdf = cudf.DataFrame() gdf["key"] = np.arange(nrows) gdf["val"] = np.arange(nrows) + 100 @@ -1724,13 +1715,11 @@ def test_dataframe_concat_different_column_types(): cudf.concat([df1, df2]) -@pytest.mark.parametrize( - "df_1", [cudf.DataFrame({"a": [1, 2], "b": [1, 3]}), cudf.DataFrame({})] -) -@pytest.mark.parametrize( - "df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})] -) -def test_concat_empty_dataframe(df_1, df_2): +@pytest.mark.parametrize("df_1_data", [{"a": [1, 2], "b": [1, 3]}, {}]) +@pytest.mark.parametrize("df_2_data", [{"a": [], "b": []}, {}]) +def test_concat_empty_dataframe(df_1_data, df_2_data): + df_1 = cudf.DataFrame(df_1_data) + df_2 = cudf.DataFrame(df_2_data) with _hide_concat_empty_dtype_warning(): got = cudf.concat([df_1, df_2]) expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) @@ -1787,8 +1776,8 @@ def test_concat_different_column_dataframe(df1_d, df2_d): @pytest.mark.parametrize( "ser_1", [pd.Series([1, 2, 3]), pd.Series([], dtype="float64")] ) -@pytest.mark.parametrize("ser_2", [pd.Series([], dtype="float64")]) -def test_concat_empty_series(ser_1, ser_2): +def test_concat_empty_series(ser_1): + ser_2 = pd.Series([], dtype="float64") with _hide_concat_empty_dtype_warning(): got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) expect = pd.concat([ser_1, ser_2]) @@ -1898,7 +1887,7 @@ def test_concat_with_axis(): ) -@pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000]) +@pytest.mark.parametrize("nrows", [0, 3]) def test_nonmatching_index_setitem(nrows): rng = np.random.default_rng(seed=0) @@ -2038,7 +2027,7 @@ def test_index_in_dataframe_constructor(): dtypes = NUMERIC_TYPES + DATETIME_TYPES + ["bool"] -@pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) +@pytest.mark.parametrize("nelem", [0, 2]) @pytest.mark.parametrize("data_type", dtypes) def test_from_arrow(nelem, data_type): rng = np.random.default_rng(seed=0) @@ -2077,7 +2066,7 @@ def test_from_arrow_chunked_categories(): assert sorted(final_dictionary) == sorted(dictionary.to_pylist()) -@pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) +@pytest.mark.parametrize("nelem", [0, 2]) @pytest.mark.parametrize("data_type", dtypes) def test_to_arrow(nelem, data_type): rng = np.random.default_rng(seed=0) @@ -2246,8 +2235,8 @@ def test_dataframe_shape_empty(): assert pdf.shape == gdf.shape -@pytest.mark.parametrize("num_cols", [1, 2, 10]) 
-@pytest.mark.parametrize("num_rows", [1, 2, 20]) +@pytest.mark.parametrize("num_cols", [1, 3]) +@pytest.mark.parametrize("num_rows", [1, 4]) @pytest.mark.parametrize("dtype", [*dtypes, "object"]) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): @@ -2320,8 +2309,8 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): assert_eq(expect, got_property.to_pandas(nullable=nullable)) -@pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 20]) +@pytest.mark.parametrize("num_cols", [1, 3]) +@pytest.mark.parametrize("num_rows", [1, 5]) def test_dataframe_transpose_category(num_cols, num_rows): pdf = pd.DataFrame() @@ -2908,8 +2897,8 @@ def test_cuda_array_interface(dtype): assert_eq(pd_data, gdf["test"]) -@pytest.mark.parametrize("nelem", [0, 2, 3, 100]) -@pytest.mark.parametrize("nchunks", [1, 2, 5, 10]) +@pytest.mark.parametrize("nelem", [0, 10]) +@pytest.mark.parametrize("nchunks", [1, 5]) @pytest.mark.parametrize("data_type", dtypes) def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): rng = np.random.default_rng(seed=0) @@ -3011,18 +3000,19 @@ def test_dataframe_boolmask(mask_shape): @pytest.mark.parametrize( - "mask", + "box", [ - [True, False, True], + list, pytest.param( - cudf.Series([True, False, True]), + cudf.Series, marks=pytest_xfail( reason="Pandas can't index a multiindex with a Series" ), ), ], ) -def test_dataframe_multiindex_boolmask(mask): +def test_dataframe_multiindex_boolmask(box): + mask = box([True, False, True]) gdf = cudf.DataFrame( {"w": [3, 2, 1], "x": [1, 2, 3], "y": [0, 1, 0], "z": [1, 1, 1]} ) @@ -3073,7 +3063,7 @@ def test_pandas_non_contiguious(): assert_eq(gdf.to_pandas(), df) -@pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) +@pytest.mark.parametrize("num_elements", [0, 10]) @pytest.mark.parametrize("null_type", [np.nan, None, "mixed"]) def test_series_all_null(num_elements, null_type): if null_type == "mixed": @@ -3093,7 +3083,7 @@ def test_series_all_null(num_elements, null_type): assert_eq(expect, got) -@pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) +@pytest.mark.parametrize("num_elements", [0, 10]) def test_series_all_valid_nan(num_elements): data = [np.nan] * num_elements sr = cudf.Series(data, nan_as_null=False) @@ -3301,16 +3291,6 @@ def test_reset_index_invalid_level(): pd.DataFrame([1]).reset_index(level=2) -@pytest.mark.parametrize( - "data", - [ - { - "a": [1, 2, 3, 4, 5], - "b": ["a", "b", "c", "d", "e"], - "c": [1.0, 2.0, 3.0, 4.0, 5.0], - } - ], -) @pytest.mark.parametrize( "index", [ @@ -3346,8 +3326,14 @@ def test_reset_index_invalid_level(): @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("append", [True, False]) @pytest.mark.parametrize("inplace", [True, False]) -def test_set_index(data, index, drop, append, inplace): - gdf = cudf.DataFrame(data) +def test_set_index(index, drop, append, inplace): + gdf = cudf.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": ["a", "b", "c", "d", "e"], + "c": [1.0, 2.0, 3.0, 4.0, 5.0], + } + ) pdf = gdf.to_pandas() expected = pdf.set_index(index, inplace=inplace, drop=drop, append=append) @@ -3359,27 +3345,22 @@ def test_set_index(data, index, drop, append, inplace): assert_eq(expected, actual) -@pytest.mark.parametrize( - "data", - [ +@pytest.mark.parametrize("index", ["a", pd.Index([1, 1, 2, 2, 3])]) +def test_set_index_verify_integrity(index): + gdf = cudf.DataFrame( { "a": [1, 1, 2, 2, 5], "b": ["a", "b", "c", "d", "e"], "c": [1.0, 
2.0, 3.0, 4.0, 5.0], } - ], -) -@pytest.mark.parametrize("index", ["a", pd.Index([1, 1, 2, 2, 3])]) -@pytest.mark.parametrize("verify_integrity", [True]) -@pytest_xfail -def test_set_index_verify_integrity(data, index, verify_integrity): - gdf = cudf.DataFrame(data) - gdf.set_index(index, verify_integrity=verify_integrity) + ) + with pytest.raises(ValueError): + gdf.set_index(index, verify_integrity=True) @pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("nelem", [10, 200, 1333]) -def test_set_index_multi(drop, nelem): +def test_set_index_multi(drop): + nelem = 10 rng = np.random.default_rng(seed=0) a = np.arange(nelem) rng.shuffle(a) @@ -3409,26 +3390,6 @@ def test_set_index_multi(drop, nelem): ) -@pytest.fixture() -def reindex_data(): - return cudf.datasets.randomdata( - nrows=6, - dtypes={ - "a": "category", - "c": float, - "d": str, - }, - ) - - -@pytest.fixture() -def reindex_data_numeric(): - return cudf.datasets.randomdata( - nrows=6, - dtypes={"a": float, "b": float, "c": float}, - ) - - @pytest_unmark_spilling @pytest.mark.parametrize("copy", [True, False]) @pytest.mark.parametrize( @@ -3454,7 +3415,15 @@ def reindex_data_numeric(): ), ], ) -def test_dataframe_reindex(copy, reindex_data, args, gd_kwargs): +def test_dataframe_reindex(copy, args, gd_kwargs): + reindex_data = cudf.datasets.randomdata( + nrows=6, + dtypes={ + "a": "category", + "c": float, + "d": str, + }, + ) pdf, gdf = reindex_data.to_pandas(), reindex_data gd_kwargs["copy"] = copy @@ -3487,9 +3456,11 @@ def test_dataframe_reindex(copy, reindex_data, args, gd_kwargs): ), ], ) -def test_dataframe_reindex_fill_value( - reindex_data_numeric, args, kwargs, fill_value -): +def test_dataframe_reindex_fill_value(args, kwargs, fill_value): + reindex_data_numeric = cudf.datasets.randomdata( + nrows=6, + dtypes={"a": float, "b": float, "c": float}, + ) pdf, gdf = reindex_data_numeric.to_pandas(), reindex_data_numeric kwargs["fill_value"] = fill_value assert_eq(pdf.reindex(*args, **kwargs), gdf.reindex(*args, **kwargs)) @@ -3967,7 +3938,7 @@ def test_select_dtype_datetime_with_frequency(): def test_dataframe_describe_exclude(): rng = np.random.default_rng(seed=12) - data_length = 10000 + data_length = 10 df = cudf.DataFrame() df["x"] = rng.normal(10, 1, data_length) @@ -3983,7 +3954,7 @@ def test_dataframe_describe_exclude(): def test_dataframe_describe_include(): rng = np.random.default_rng(seed=12) - data_length = 10000 + data_length = 10 df = cudf.DataFrame() df["x"] = rng.normal(10, 1, data_length) @@ -3998,7 +3969,7 @@ def test_dataframe_describe_include(): def test_dataframe_describe_default(): rng = np.random.default_rng(seed=12) - data_length = 10000 + data_length = 10 df = cudf.DataFrame() df["x"] = rng.normal(10, 1, data_length) @@ -4012,7 +3983,7 @@ def test_dataframe_describe_default(): def test_series_describe_include_all(): rng = np.random.default_rng(seed=12) - data_length = 10000 + data_length = 10 df = cudf.DataFrame() df["x"] = rng.normal(10, 1, data_length) @@ -4035,7 +4006,7 @@ def test_series_describe_include_all(): def test_dataframe_describe_percentiles(): rng = np.random.default_rng(seed=12) - data_length = 10000 + data_length = 100 sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99] df = cudf.DataFrame() @@ -4092,7 +4063,7 @@ def test_shift(dtype, period, data_empty): @pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) +@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15]) 
@pytest.mark.parametrize("data_empty", [False, True]) def test_diff(dtype, period, data_empty): if data_empty: @@ -4116,14 +4087,14 @@ def test_diff(dtype, period, data_empty): assert_eq(diffed_outcome, expected_outcome) -@pytest.mark.parametrize("df", _dataframe_na_data()) @pytest.mark.parametrize("nan_as_null", [True, False, None]) @pytest.mark.parametrize("api_call", ["isnull", "isna", "notna", "notnull"]) -def test_dataframe_isnull_isna_and_reverse(df, nan_as_null, api_call): +def test_dataframe_isnull_isna_and_reverse(na_data, nan_as_null, api_call): def detect_nan(x): # Check if the input is a float and if it is nan return x.apply(lambda v: isinstance(v, float) and np.isnan(v)) + df = na_data nan_contains = df.select_dtypes(object).apply(detect_nan) if nan_as_null is False and ( nan_contains.any().any() and not nan_contains.all().all() @@ -5282,18 +5253,18 @@ def test_df_constructor_dtype(dtype): @pytest.mark.parametrize( "data", [ - cudf.datasets.randomdata( + lambda: cudf.datasets.randomdata( nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": int} ), - cudf.datasets.randomdata( + lambda: cudf.datasets.randomdata( nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": str} ), - cudf.datasets.randomdata( + lambda: cudf.datasets.randomdata( nrows=10, dtypes={"a": bool, "b": int, "c": float, "d": str} ), - cudf.DataFrame(), - cudf.DataFrame({"a": [0, 1, 2], "b": [1, None, 3]}), - cudf.DataFrame( + lambda: cudf.DataFrame(), + lambda: cudf.DataFrame({"a": [0, 1, 2], "b": [1, None, 3]}), + lambda: cudf.DataFrame( { "a": [1, 2, 3, 4], "b": [7, np.nan, 9, 10], @@ -5305,7 +5276,7 @@ def test_df_constructor_dtype(dtype): "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False), } ), - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": [10, 11, 12, 13, 14, 15], "b": cudf.Series( @@ -5321,7 +5292,7 @@ def test_df_constructor_dtype(dtype): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_rowwise_ops(data, op, skipna, numeric_only): - gdf = data + gdf = data() pdf = gdf.to_pandas() kwargs = {"axis": 1, "skipna": skipna, "numeric_only": numeric_only} @@ -5419,49 +5390,49 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op): [ ( "max", - cudf.Series( + lambda: cudf.Series( [10, None, None, 2234, None, 453], dtype="int64", ), ), ( "min", - cudf.Series( + lambda: cudf.Series( [10, None, None, 13, None, 15], dtype="int64", ), ), ( "sum", - cudf.Series( + lambda: cudf.Series( [20, None, None, 2247, None, 468], dtype="int64", ), ), ( "product", - cudf.Series( + lambda: cudf.Series( [100, None, None, 29042, None, 6795], dtype="int64", ), ), ( "mean", - cudf.Series( + lambda: cudf.Series( [10.0, None, None, 1123.5, None, 234.0], dtype="float32", ), ), ( "var", - cudf.Series( + lambda: cudf.Series( [0.0, None, None, 1233210.25, None, 47961.0], dtype="float32", ), ), ( "std", - cudf.Series( + lambda: cudf.Series( [0.0, None, None, 1110.5, None, 219.0], dtype="float32", ), @@ -5484,6 +5455,7 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): else: got = getattr(gdf, op)(axis=1, skipna=False) + expected = expected() assert_eq(got.null_count, expected.null_count) assert_eq(got, expected) @@ -5492,62 +5464,62 @@ def test_rowwise_ops_nullable_int_dtypes(op, expected): "data", [ { - "t1": cudf.Series( + "t1": pd.Series( ["2020-08-01 09:00:00", "1920-05-01 10:30:00"], dtype=" 0 -@pytest.mark.parametrize("data", [data1(), data2()]) @pytest.mark.parametrize("dtype", NUMERIC_TYPES) def test_typecast_from_datetime(data, 
dtype): - pd_data = pd.Series(data.copy()) + pd_data = pd.Series(data) np_data = np.array(pd_data) gdf_data = Series(pd_data) @@ -295,13 +255,12 @@ def test_typecast_from_datetime(data, dtype): np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) -@pytest.mark.parametrize("data", [data1(), data2()]) @pytest.mark.parametrize( "dtype", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], ) def test_typecast_from_datetime_to_int64_to_datetime(data, dtype): - pd_data = pd.Series(data.copy()) + pd_data = pd.Series(data) np_data = np.array(pd_data) gdf_data = Series(pd_data) @@ -311,20 +270,43 @@ def test_typecast_from_datetime_to_int64_to_datetime(data, dtype): np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) -@pytest.mark.parametrize("data", [timeseries_us_data()]) @pytest.mark.parametrize( "dtype", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], ) def test_typecast_to_different_datetime_resolutions(data, dtype): - pd_data = pd.Series(data.copy()) + data = pd.date_range( + "2019-07-16 00:00:00", + "2019-07-16 00:00:01", + freq="5555us", + name="times", + ) + pd_data = pd.Series(data) np_data = np.array(pd_data).astype(dtype) gdf_series = Series(pd_data).astype(dtype) np.testing.assert_equal(np_data, gdf_series.to_numpy()) @pytest.mark.parametrize( - "data", [timestamp_ms_data(), timestamp_us_data(), timestamp_ns_data()] + "data", + [ + [ + "2019-07-16 00:00:00.333", + "2019-07-16 00:00:00.666", + "2019-07-16 00:00:00.888", + ], + [ + "2019-07-16 00:00:00.333333", + "2019-07-16 00:00:00.666666", + "2019-07-16 00:00:00.888888", + ], + [ + "2019-07-16 00:00:00.333333333", + "2019-07-16 00:00:00.666666666", + "2019-07-16 00:00:00.888888888", + ], + ], + ids=["ms_data", "us_data", "ns_data"], ) @pytest.mark.parametrize( "dtype", @@ -333,7 +315,7 @@ def test_typecast_to_different_datetime_resolutions(data, dtype): def test_string_timstamp_typecast_to_different_datetime_resolutions( data, dtype ): - pd_sr = data + pd_sr = pd.Series(data) gdf_sr = cudf.Series.from_pandas(pd_sr) expect = pd_sr.values.astype(dtype) @@ -342,13 +324,13 @@ def test_string_timstamp_typecast_to_different_datetime_resolutions( np.testing.assert_equal(expect, got) -@pytest.mark.parametrize("data", [numerical_data()]) @pytest.mark.parametrize("from_dtype", NUMERIC_TYPES) @pytest.mark.parametrize( "to_dtype", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], ) -def test_typecast_to_datetime(data, from_dtype, to_dtype): +def test_typecast_to_datetime(from_dtype, to_dtype): + data = np.arange(1, 10) np_data = data.astype(from_dtype) gdf_data = Series(np_data) @@ -358,13 +340,13 @@ def test_typecast_to_datetime(data, from_dtype, to_dtype): np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) -@pytest.mark.parametrize("data", [numerical_data()]) @pytest.mark.parametrize("from_dtype", NUMERIC_TYPES) @pytest.mark.parametrize( "to_dtype", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], ) -def test_typecast_to_from_datetime(data, from_dtype, to_dtype): +def test_typecast_to_from_datetime(from_dtype, to_dtype): + data = np.arange(1, 10) np_data = data.astype(from_dtype) gdf_data = Series(np_data) @@ -374,7 +356,6 @@ def test_typecast_to_from_datetime(data, from_dtype, to_dtype): np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) -@pytest.mark.parametrize("data", [numerical_data()]) @pytest.mark.parametrize( "from_dtype", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], @@ -383,7 +364,8 @@ def 
test_typecast_to_from_datetime(data, from_dtype, to_dtype): "to_dtype", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], ) -def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype): +def test_typecast_from_datetime_to_datetime(from_dtype, to_dtype): + data = np.arange(1, 10) np_data = data.astype(from_dtype) ser = Series(np_data) @@ -393,10 +375,10 @@ def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype): np.testing.assert_equal(np_casted, ser_casted.to_numpy()) -@pytest.mark.parametrize("data", [numerical_data()]) @pytest.mark.parametrize("nulls", ["some", "all"]) def test_to_from_pandas_nulls(data, nulls): - pd_data = pd.Series(data.copy().astype("datetime64[ns]")) + data = np.arange(1, 10) + pd_data = pd.Series(data.astype("datetime64[ns]")) if nulls == "some": # Fill half the values with NaT pd_data[list(range(0, len(pd_data), 2))] = np.datetime64("nat", "ns") @@ -480,16 +462,11 @@ def test_datetime_nunique(data, nulls): assert_eq(got, expected) -testdata = [ - ( - Series( - ["2018-01-01", None, "2019-01-31", None, "2018-01-01"], - dtype="datetime64[ms]", - ), - True, - ), - ( - Series( +@pytest.mark.parametrize( + "data, expected", + [ + [["2018-01-01", None, "2019-01-31", None, "2018-01-01"], True], + [ [ "2018-01-01", "2018-01-02", @@ -497,23 +474,18 @@ def test_datetime_nunique(data, nulls): "2018-03-01", "2018-01-01", ], - dtype="datetime64[ms]", - ), - False, - ), - ( - Series( + False, + ], + [ np.array( ["2018-01-01", None, "2019-12-30"], dtype="datetime64[ms]" - ) - ), - True, - ), -] - - -@pytest.mark.parametrize("data, expected", testdata) + ), + True, + ], + ], +) def test_datetime_has_null_test(data, expected): + data = Series(data, dtype="datetime64[ms]") pd_data = data.to_pandas() count = pd_data.notna().value_counts() expected_count = 0 @@ -1038,7 +1010,6 @@ def test_datetime_series_ops_with_scalars(data, other_scalars, dtype, op): @pytest.mark.parametrize("data", ["20110101", "20120101", "20130101"]) @pytest.mark.parametrize("other_scalars", ["20110101", "20120101", "20130101"]) -@pytest.mark.parametrize("op", _cmpops) @pytest.mark.parametrize( "dtype", ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"], @@ -1432,21 +1403,18 @@ def test_isocalendar_series(data): @pytest.mark.parametrize( "data", [ - pd.DatetimeIndex([], dtype="datetime64[ns]"), - pd.DatetimeIndex([None, None], dtype="datetime64[ns]"), - pd.DatetimeIndex( - [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - ], - dtype="datetime64[ns]", - ), - pd.DatetimeIndex(["2100-03-14 07:30:00"], dtype="datetime64[ns]"), + [], + [None, None], + [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + ], + ["2100-03-14 07:30:00"], ], ) def test_isocalendar_index(data): - ps = data.copy() + ps = pd.DatetimeIndex(data, dtype="datetime64[ns]") gs = cudf.from_pandas(ps) expect = ps.isocalendar() @@ -1474,27 +1442,20 @@ def test_days_in_months(dtype): assert_eq(ps.dt.days_in_month, gs.dt.days_in_month) -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-31", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-01-01", - "1969-12-11", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_month_start(data, dtype): - # Series - ps = pd.Series(data, dtype=dtype) +def test_is_month_start(): + data = [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + 
"1970-01-01", + "1969-12-11", + ] + ps = pd.Series(data, dtype="datetime64[ns]") gs = cudf.from_pandas(ps) expect = ps.dt.is_month_start @@ -1507,19 +1468,6 @@ def test_is_month_start(data, dtype): # Date Range Tests # ################################################################## -date_range_test_dates_start = [ - "2000-02-13 08:41:06", # leap year - "1996-11-21 04:05:30", # non leap year - "1970-01-01 00:00:00", # unix epoch time 0 - "1831-05-08 15:23:21", -] -date_range_test_dates_end = [ - "2000-02-13 08:41:06", # leap year - "1996-11-21 04:05:30", # non leap year - "1970-01-01 00:00:00", # unix epoch time 0 - "1831-05-08 15:23:21", -] -date_range_test_periods = [1, 10, 100] date_range_test_freq = [ {"months": 3, "years": 1}, {"hours": 10, "days": 57, "nanoseconds": 3}, @@ -1532,22 +1480,49 @@ def test_is_month_start(data, dtype): ] -@pytest.fixture(params=date_range_test_dates_start[:]) +@pytest.fixture( + params=[ + "2000-02-13 08:41:06", + "1996-11-21 04:05:30", + "1970-01-01 00:00:00", + "1831-05-08 15:23:21", + ], + ids=["leap_year", "non_leap_year", "unix_epoch_time_0", "random_date"], +) def start(request): return request.param -@pytest.fixture(params=date_range_test_dates_end[:]) +@pytest.fixture( + params=[ + "2000-02-13 08:41:06", + "1996-11-21 04:05:30", + "1970-01-01 00:00:00", + "1831-05-08 15:23:21", + ], + ids=["leap_year", "non_leap_year", "unix_epoch_time_0", "random_date"], +) def end(request): return request.param -@pytest.fixture(params=date_range_test_periods[:]) +@pytest.fixture(params=[1, 10]) def periods(request): return request.param -@pytest.fixture(params=date_range_test_freq[:]) +@pytest.fixture( + params=[ + {"months": 3, "years": 1}, + {"hours": 10, "days": 57, "nanoseconds": 3}, + "83D", + "17h", + "-680min", + "110546s", + "110546789ms", + "110546789248us", + ] +) def freq(request): return request.param @@ -1635,18 +1610,17 @@ def test_date_range_freq_does_not_divide_range(): ) -def test_date_range_raise_overflow(): - # Fixed offset - start = np.datetime64(np.iinfo("int64").max, "ns") - periods = 2 - freq = cudf.DateOffset(nanoseconds=1) - with pytest.raises(pd.errors.OutOfBoundsDatetime): - cudf.date_range(start=start, periods=periods, freq=freq) - - # Non-fixed offset +@pytest.mark.parametrize( + "kwargs", + [ + {"nanoseconds": 1}, + {"months": 1}, + ], +) +def test_date_range_raise_overflow(kwargs): start = np.datetime64(np.iinfo("int64").max, "ns") periods = 2 - freq = cudf.DateOffset(months=1) + freq = cudf.DateOffset(**kwargs) with pytest.raises(pd.errors.OutOfBoundsDatetime): cudf.date_range(start=start, periods=periods, freq=freq) @@ -1699,28 +1673,21 @@ def test_date_range_raise_unsupported(freqstr_unsupported): ################################################################## -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-31", - "2020-02-29", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-01-01", - "1969-12-11", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_month_end(data, dtype): - # Series - ps = pd.Series(data, dtype=dtype) +def test_is_month_end(): + data = [ + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + ] + ps = pd.Series(data, dtype="datetime64[ns]") gs = cudf.from_pandas(ps) expect = ps.dt.is_month_end @@ -1729,29 +1696,23 @@ def test_is_month_end(data, dtype): assert_eq(expect, got) -@pytest.mark.parametrize( - "data", 
- [ - [ - "2020-05-31", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-01-01", - "1800-03-14", - "2100-03-10", - "1970-01-01", - "1969-12-11", - "2017-12-30", - "2017-12-31", - "2018-01-01", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_year_start(data, dtype): - ps = pd.Series(data, dtype=dtype) +def test_is_year_start(): + data = [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-01-01", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + "2017-12-30", + "2017-12-31", + "2018-01-01", + ] + ps = pd.Series(data, dtype="datetime64[ns]") gs = cudf.from_pandas(ps) expect = ps.dt.is_year_start @@ -1760,33 +1721,27 @@ def test_is_year_start(data, dtype): assert_eq(expect, got) -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-31", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-12-31", - "1800-03-14", - "2017-12-30", - "2017-12-31", - "2020-12-31 08:00:00", - None, - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - None, - "1800-12-14 07:30:00", - "2100-12-14 07:30:00", - "2020-05-31", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_year_end(data, dtype): - ps = pd.Series(data, dtype=dtype) +def test_is_year_end(): + data = [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-12-31", + "1800-03-14", + "2017-12-30", + "2017-12-31", + "2020-12-31 08:00:00", + None, + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + None, + "1800-12-14 07:30:00", + "2100-12-14 07:30:00", + "2020-05-31", + ] + ps = pd.Series(data, dtype="datetime64[ns]") gs = cudf.from_pandas(ps) expect = ps.dt.is_year_end @@ -1795,31 +1750,24 @@ def test_is_year_end(data, dtype): assert_eq(expect, got) -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-01", - "2020-05-31", - "2020-02-29", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-04-1", - "1970-01-01", - "1969-12-11", - "2020-12-31", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_quarter_start(data, dtype): - # Series - ps = pd.Series(data, dtype=dtype) +def test_is_quarter_start(): + data = [ + "2020-05-01", + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-04-1", + "1970-01-01", + "1969-12-11", + "2020-12-31", + ] + ps = pd.Series(data, dtype="datetime64[ns]") gs = cudf.from_pandas(ps) expect = ps.dt.is_quarter_start @@ -1828,31 +1776,24 @@ def test_is_quarter_start(data, dtype): assert_eq(expect, got) -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-01", - "2020-05-31", - "2020-02-29", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-04-1", - "1970-01-01", - "1969-12-11", - "2020-12-31", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_quarter_end(data, dtype): - # Series - ps = pd.Series(data, dtype=dtype) +def test_is_quarter_end(): + data = [ + "2020-05-01", + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-04-1", + "1970-01-01", + "1969-12-11", + "2020-12-31", + ] + ps = pd.Series(data, dtype="datetime64[ns]") gs = cudf.from_pandas(ps) expect = ps.dt.is_quarter_end @@ -1871,28 +1812,21 @@ def test_error_values(): PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="https://github.com/pandas-dev/pandas/issues/52761", ) -@pytest.mark.parametrize( - "data", - [ - ( - 
[
-                "2020-05-31 08:00:00",
-                "1999-12-31 18:40:10",
-                "2000-12-31 04:00:05",
-                "1900-02-28 07:00:06",
-                "1800-03-14 07:30:20",
-                "2100-03-14 07:30:20",
-                "1970-01-01 00:00:09",
-                "1969-12-31 12:59:10",
-            ]
-        )
-    ],
-)
 @pytest.mark.parametrize("time_type", DATETIME_TYPES)
 @pytest.mark.parametrize(
     "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"]
 )
-def test_ceil(data, time_type, resolution):
+def test_ceil(time_type, resolution):
+    data = [
+        "2020-05-31 08:00:00",
+        "1999-12-31 18:40:10",
+        "2000-12-31 04:00:05",
+        "1900-02-28 07:00:06",
+        "1800-03-14 07:30:20",
+        "2100-03-14 07:30:20",
+        "1970-01-01 00:00:09",
+        "1969-12-31 12:59:10",
+    ]
     gs = cudf.Series(data, dtype=time_type)
     ps = gs.to_pandas()
 
@@ -1905,28 +1839,21 @@ def test_ceil(data, time_type, resolution):
     PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
     reason="https://github.com/pandas-dev/pandas/issues/52761",
 )
-@pytest.mark.parametrize(
-    "data",
-    [
-        (
-            [
-                "2020-05-31 08:00:00",
-                "1999-12-31 18:40:10",
-                "2000-12-31 04:00:05",
-                "1900-02-28 07:00:06",
-                "1800-03-14 07:30:20",
-                "2100-03-14 07:30:20",
-                "1970-01-01 00:00:09",
-                "1969-12-31 12:59:10",
-            ]
-        )
-    ],
-)
 @pytest.mark.parametrize("time_type", DATETIME_TYPES)
 @pytest.mark.parametrize(
     "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"]
 )
-def test_floor(data, time_type, resolution):
+def test_floor(time_type, resolution):
+    data = [
+        "2020-05-31 08:00:00",
+        "1999-12-31 18:40:10",
+        "2000-12-31 04:00:05",
+        "1900-02-28 07:00:06",
+        "1800-03-14 07:30:20",
+        "2100-03-14 07:30:20",
+        "1970-01-01 00:00:09",
+        "1969-12-31 12:59:10",
+    ]
     gs = cudf.Series(data, dtype=time_type)
     ps = gs.to_pandas()
 
@@ -1935,28 +1862,21 @@ def test_floor(data, time_type, resolution):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize(
-    "data",
-    [
-        (
-            [
-                "2020-05-31 08:00:00",
-                "1999-12-31 18:40:10",
-                "2000-12-31 04:00:05",
-                "1900-02-28 07:00:06",
-                "1800-03-14 07:30:20",
-                "2100-03-14 07:30:20",
-                "1970-01-01 00:00:09",
-                "1969-12-31 12:59:10",
-            ]
-        )
-    ],
-)
 @pytest.mark.parametrize("time_type", DATETIME_TYPES)
 @pytest.mark.parametrize(
     "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"]
 )
-def test_round(data, time_type, resolution):
+def test_round(time_type, resolution):
+    data = [
+        "2020-05-31 08:00:00",
+        "1999-12-31 18:40:10",
+        "2000-12-31 04:00:05",
+        "1900-02-28 07:00:06",
+        "1800-03-14 07:30:20",
+        "2100-03-14 07:30:20",
+        "1970-01-01 00:00:09",
+        "1969-12-31 12:59:10",
+    ]
     gs = cudf.Series(data, dtype=time_type)
     ps = gs.to_pandas()
 
@@ -2008,31 +1928,23 @@ def test_first(idx, offset):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize(
-    # This test case tests correctness when start is end of month
-    "idx, offset",
-    [
-        (
-            pd.DatetimeIndex(
-                [
-                    "2020-01-31",
-                    "2020-02-15",
-                    "2020-02-29",
-                    "2020-03-15",
-                    "2020-03-31",
-                    "2020-04-15",
-                    "2020-04-30",
-                ]
-            ),
-            "3M",
-        )
-    ],
-)
 @pytest.mark.skipif(
     PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
     reason="warning not present in older pandas versions",
 )
-def test_first_start_at_end_of_month(idx, offset):
+def test_first_start_at_end_of_month():
+    idx = pd.DatetimeIndex(
+        [
+            "2020-01-31",
+            "2020-02-15",
+            "2020-02-29",
+            "2020-03-15",
+            "2020-03-31",
+            "2020-04-15",
+            "2020-04-30",
+        ]
+    )
+    offset = "3M"
     p = pd.Series(range(len(idx)), index=idx)
     g = cudf.from_pandas(p)
 
@@ -2115,7 +2027,6 @@ def test_datetime_constructor(data, dtype):
     assert_eq(expected, actual)
 
 
-@pytest.mark.parametrize("op", _cmpops)
 def test_datetime_binop_tz_timestamp(op):
     s = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
     pd_tz_timestamp = 
pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") @@ -2127,14 +2038,9 @@ def test_datetime_binop_tz_timestamp(op): op(s, date_scalar) -@pytest.mark.parametrize( - "data1", [["20110101", "20120101", None, "20140101", None]] -) -@pytest.mark.parametrize( - "data2", [["20110101", "20120101", "20130101", None, None]] -) -@pytest.mark.parametrize("op", _cmpops) -def test_datetime_series_cmpops_pandas_compatibility(data1, data2, op): +def test_datetime_series_cmpops_pandas_compatibility(op): + data1 = ["20110101", "20120101", None, "20140101", None] + data2 = ["20110101", "20120101", "20130101", None, None] gsr1 = cudf.Series(data=data1, dtype="datetime64[ns]") psr1 = gsr1.to_pandas() diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index 048b3a656e3..2cb16f71011 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. import decimal from decimal import Decimal @@ -6,7 +6,6 @@ import numpy as np import pyarrow as pa import pytest -from packaging import version import cudf from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn @@ -16,41 +15,36 @@ FLOAT_TYPES, INTEGER_TYPES, SIGNED_TYPES, - _decimal_series, expect_warning_if, ) -data_ = [ - [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [1], - [-1], - [1, 2, 3, 4], - [42, 17, 41], - [1, 2, None, 4], - [None, None, None], - [], -] -typ_ = [ - pa.decimal128(precision=4, scale=2), - pa.decimal128(precision=5, scale=3), - pa.decimal128(precision=6, scale=4), -] - - -@pytest.mark.parametrize("data_", data_) -@pytest.mark.parametrize("typ_", typ_) -def test_round_trip_decimal64_column(data_, typ_): - pa_arr = pa.array(data_, type=typ_) - col_64 = Decimal64Column.from_arrow(pa_arr) - assert pa_arr.equals(col_64.to_arrow()) - -@pytest.mark.parametrize("data_", data_) -@pytest.mark.parametrize("typ_", typ_) -def test_round_trip_decimal32_column(data_, typ_): +@pytest.mark.parametrize( + "data_", + [ + [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [1], + [-1], + [1, 2, 3, 4], + [42, 17, 41], + [1, 2, None, 4], + [None, None, None], + [], + ], +) +@pytest.mark.parametrize( + "typ_", + [ + pa.decimal128(precision=4, scale=2), + pa.decimal128(precision=5, scale=3), + pa.decimal128(precision=6, scale=4), + ], +) +@pytest.mark.parametrize("col", [Decimal32Column, Decimal64Column]) +def test_round_trip_decimal_column(data_, typ_, col): pa_arr = pa.array(data_, type=typ_) - col_32 = Decimal32Column.from_arrow(pa_arr) + col_32 = col.from_arrow(pa_arr) assert pa_arr.equals(col_32.to_arrow()) @@ -68,36 +62,29 @@ def test_from_arrow_max_precision_decimal32(): ) -@pytest.mark.parametrize( - "data", - [ - cudf.Series( - [ - 14.12302, - 97938.2, - np.nan, - 0.0, - -8.302014, - np.nan, - 94.31304, - -112.2314, - 0.3333333, - np.nan, - ] - ), - ], -) @pytest.mark.parametrize("from_dtype", FLOAT_TYPES) @pytest.mark.parametrize( "to_dtype", [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)], ) -def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): +def test_typecast_from_float_to_decimal(request, from_dtype, to_dtype): + data = cudf.Series( + [ + 14.12302, + 97938.2, + np.nan, + 0.0, + -8.302014, + np.nan, + 94.31304, + -112.2314, + 
0.3333333, + np.nan, + ] + ) request.applymarker( pytest.mark.xfail( - condition=version.parse(pa.__version__) >= version.parse("13.0.0") - and from_dtype == np.dtype("float32") - and to_dtype.precision > 12, + from_dtype == np.dtype("float32") and to_dtype.precision > 12, reason="https://github.com/rapidsai/cudf/issues/14169", ) ) @@ -113,32 +100,27 @@ def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): assert_eq(got, expected) -@pytest.mark.parametrize( - "data", - [ - cudf.Series( - [ - 14.12302, - 38.2, - np.nan, - 0.0, - -8.302014, - np.nan, - 94.31304, - np.nan, - -112.2314, - 0.3333333, - np.nan, - ] - ), - ], -) @pytest.mark.parametrize("from_dtype", INTEGER_TYPES) @pytest.mark.parametrize( "to_dtype", [Decimal64Dtype(9, 3), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)], ) -def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): +def test_typecast_from_int_to_decimal(from_dtype, to_dtype): + data = cudf.Series( + [ + 14.12302, + 38.2, + np.nan, + 0.0, + -8.302014, + np.nan, + 94.31304, + np.nan, + -112.2314, + 0.3333333, + np.nan, + ] + ) got = data.astype(from_dtype) pa_arr = ( @@ -153,25 +135,6 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): assert_eq(got, expected) -@pytest.mark.parametrize( - "data", - [ - cudf.Series( - [ - 14.12309, - 2.343942, - np.nan, - 0.0, - -8.302082, - np.nan, - 94.31308, - -112.2364, - -8.029972, - np.nan, - ] - ), - ], -) @pytest.mark.parametrize( "from_dtype", [ @@ -194,7 +157,21 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): Decimal32Dtype(5, 3), ], ) -def test_typecast_to_from_decimal(data, from_dtype, to_dtype): +def test_typecast_to_from_decimal(from_dtype, to_dtype): + data = cudf.Series( + [ + 14.12309, + 2.343942, + np.nan, + 0.0, + -8.302082, + np.nan, + 94.31308, + -112.2364, + -8.029972, + np.nan, + ] + ) if from_dtype.scale > to_dtype.MAX_PRECISION: pytest.skip( "This is supposed to overflow because the representation value in " @@ -216,31 +193,26 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype): assert_eq(got, expected) -@pytest.mark.parametrize( - "data", - [ - cudf.Series( - [ - 14.12309, - 2.343942, - np.nan, - 0.0, - -8.302082, - np.nan, - 94.31308, - -112.2364, - -8.029972, - np.nan, - ] - ), - ], -) @pytest.mark.parametrize( "from_dtype", [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(17, 10)], ) @pytest.mark.parametrize("to_dtype", SIGNED_TYPES) -def test_typecast_from_decimal(data, from_dtype, to_dtype): +def test_typecast_from_decimal(from_dtype, to_dtype): + data = cudf.Series( + [ + 14.12309, + 2.343942, + np.nan, + 0.0, + -8.302082, + np.nan, + 94.31308, + -112.2364, + -8.029972, + np.nan, + ] + ) got = data.astype(from_dtype) pa_arr = got.to_arrow().cast(to_dtype, safe=False) @@ -252,7 +224,7 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype): @pytest.mark.parametrize( - "args", + "data, dtype, item, to, expect", [ # scatter to a single index ( @@ -281,21 +253,21 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype): ["1", "2", "3"], Decimal64Dtype(1, 0), Decimal(5), - cudf.Series([True, False, True]), + [True, False, True], ["5", "2", "5"], ), ( ["1.5", "2.5", "3.5"], Decimal64Dtype(2, 1), Decimal("5.5"), - cudf.Series([True, True, True]), + [True, True, True], ["5.5", "5.5", "5.5"], ), ( ["1.0042", "2.0042", "3.0042"], Decimal64Dtype(5, 4), Decimal("5.0042"), - cudf.Series([False, False, True]), + [False, False, True], ["1.0042", "2.0042", "5.0042"], ), # We will allow assigning a decimal with 
less precision @@ -320,16 +292,15 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype): (["1", "2", "3"], Decimal64Dtype(1, 0), 50, 1, pa.lib.ArrowInvalid), ], ) -def test_series_setitem_decimal(args): - data, dtype, item, to, expect = args - data = _decimal_series(data, dtype) +def test_series_setitem_decimal(data, dtype, item, to, expect): + data = cudf.Series([Decimal(x) for x in data], dtype=dtype) if expect is pa.lib.ArrowInvalid: with pytest.raises(expect): data[to] = item return else: - expect = _decimal_series(expect, dtype) + expect = cudf.Series([Decimal(x) for x in expect], dtype=dtype) data[to] = item assert_eq(data, expect) @@ -347,37 +318,29 @@ def test_series_construction_with_nulls(input_obj): @pytest.mark.parametrize( "data", [ - { - "a": _decimal_series( - ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0) - ) - }, - { - "a": _decimal_series( - ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0) - ), - "b": _decimal_series( - ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) - ), - "c": _decimal_series( - ["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1) - ), - }, - { - "a": _decimal_series( - ["1", None, "3"], dtype=cudf.Decimal64Dtype(1, 0) - ), - "b": _decimal_series( - ["1.0", "2.0", None], dtype=cudf.Decimal64Dtype(2, 1) - ), - "c": _decimal_series( - [None, "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1) - ), - }, + [(["1", "2", "3"], cudf.Decimal64Dtype(1, 0))], + [ + (["1", "2", "3"], cudf.Decimal64Dtype(1, 0)), + (["1.0", "2.0", "3.0"], cudf.Decimal64Dtype(2, 1)), + (["10.1", "20.2", "30.3"], cudf.Decimal64Dtype(3, 1)), + ], + [ + (["1", None, "3"], cudf.Decimal64Dtype(1, 0)), + (["1.0", "2.0", None], cudf.Decimal64Dtype(2, 1)), + ([None, "20.2", "30.3"], cudf.Decimal64Dtype(3, 1)), + ], ], ) def test_serialize_decimal_columns(data): - df = cudf.DataFrame(data) + df = cudf.DataFrame( + { + str(i): cudf.Series( + [Decimal(x) if x is not None else x for x in values], + dtype=dtype, + ) + for i, (values, dtype) in enumerate(data) + } + ) recreated = df.__class__.deserialize(*df.serialize()) assert_eq(recreated, df) From 1e195f35a1f84b98a0b6e041b70a5fac4b553b8d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 23 Jul 2025 13:27:11 -0700 Subject: [PATCH 007/366] Use more pytest fixtures and avoid GPU parameterization in cuDF classic tests (#19436) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids pytest.mark.parametrize with GPU objects Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19436 --- python/cudf/cudf/tests/test_no_cuinit.py | 82 +++++++------------- python/cudf/cudf/tests/test_no_device.py | 23 +++--- python/cudf/cudf/tests/test_numpy_interop.py | 17 ++-- python/cudf/cudf/tests/test_pickling.py | 35 +++------ python/cudf/cudf/tests/test_quantiles.py | 13 +--- 5 files changed, 62 insertions(+), 108 deletions(-) diff --git a/python/cudf/cudf/tests/test_no_cuinit.py b/python/cudf/cudf/tests/test_no_cuinit.py index 593d280f960..ed9d6a2a901 100644 --- a/python/cudf/cudf/tests/test_no_cuinit.py +++ b/python/cudf/cudf/tests/test_no_cuinit.py @@ -1,6 +1,5 @@ # Copyright (c) 2023-2025, NVIDIA CORPORATION. 
-import os import subprocess import sys from shutil import which @@ -40,7 +39,16 @@ def cuda_gdb(request): return gdb -def test_cudf_import_no_cuinit(cuda_gdb): +@pytest.mark.parametrize( + "cudf_call, should_be_initialized", + [ + ("import cudf", False), + ("import cudf; cudf.Series([1])", True), + ], +) +def test_rapids_no_initialize_cuinit( + cuda_gdb, monkeypatch, cudf_call, should_be_initialized +): # When RAPIDS_NO_INITIALIZE is set, importing cudf should _not_ # create a CUDA context (i.e. cuInit should not be called). # Intercepting the call to cuInit programmatically is tricky since @@ -50,62 +58,28 @@ def test_cudf_import_no_cuinit(cuda_gdb): # needs provide hooks that override dlsym, cuGetProcAddress, and # cuInit. # Instead, we just run under GDB and see if we hit a breakpoint - env = os.environ.copy() - env["RAPIDS_NO_INITIALIZE"] = "1" - output = subprocess.run( - [ - cuda_gdb, - "-x", - "-", - "--args", - sys.executable, - "-c", - "import cudf", - ], - input=GDB_COMMANDS, - env=env, - capture_output=True, - text=True, - cwd="/", - ) - - cuInit_called = output.stdout.find("in cuInit ()") - print("Command output:\n") # noqa: T201 - print("*** STDOUT ***") # noqa: T201 - print(output.stdout) # noqa: T201 - print("*** STDERR ***") # noqa: T201 - print(output.stderr) # noqa: T201 - assert output.returncode == 0 - assert cuInit_called < 0 - - -def test_cudf_create_series_cuinit(cuda_gdb): - # This tests that our gdb scripting correctly identifies cuInit - # when it definitely should have been called. - env = os.environ.copy() - env["RAPIDS_NO_INITIALIZE"] = "1" - output = subprocess.run( - [ - cuda_gdb, - "-x", - "-", - "--args", - sys.executable, - "-c", - "import cudf; cudf.Series([1])", - ], - input=GDB_COMMANDS, - env=env, - capture_output=True, - text=True, - cwd="/", - ) + with monkeypatch.context() as m: + m.setenv("RAPIDS_NO_INITIALIZE", "1") + output = subprocess.run( + [ + cuda_gdb, + "-x", + "-", + "--args", + sys.executable, + "-c", + cudf_call, + ], + input=GDB_COMMANDS, + capture_output=True, + text=True, + cwd="/", + ) - cuInit_called = output.stdout.find("in cuInit ()") print("Command output:\n") # noqa: T201 print("*** STDOUT ***") # noqa: T201 print(output.stdout) # noqa: T201 print("*** STDERR ***") # noqa: T201 print(output.stderr) # noqa: T201 assert output.returncode == 0 - assert cuInit_called >= 0 + assert ("in cuInit ()" in output.stdout) == should_be_initialized diff --git a/python/cudf/cudf/tests/test_no_device.py b/python/cudf/cudf/tests/test_no_device.py index 722762b2d0c..f663297eeb6 100644 --- a/python/cudf/cudf/tests/test_no_device.py +++ b/python/cudf/cudf/tests/test_no_device.py @@ -1,16 +1,13 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -import os +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
import subprocess +import sys -def test_cudf_import_no_device(): - env = os.environ.copy() - env["CUDA_VISIBLE_DEVICES"] = "-1" - output = subprocess.run( - ["python", "-c", "import cudf"], - env=env, - capture_output=True, - text=True, - cwd="/", - ) - assert output.returncode == 0 +def test_cudf_import_no_device(monkeypatch): + with monkeypatch.context() as m: + m.setenv("CUDA_VISIBLE_DEVICES", "-1") + output = subprocess.check_call( + [sys.executable, "-c", "import cudf"], + cwd="/", + ) + assert output == 0 diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py index fa664d52ecf..0bdb806732b 100644 --- a/python/cudf/cudf/tests/test_numpy_interop.py +++ b/python/cudf/cudf/tests/test_numpy_interop.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import numpy as np import pytest @@ -78,18 +78,19 @@ def test_numpy_non_contiguious(): @pytest.mark.parametrize( "data", [ - Series([1, 2, 3, -12, 12, 44]), - Series([1, 2, 3, -12, 12, 44], dtype="str"), - Series([1, 2, 3, -12, 12, 44]).index, - DataFrame({"a": [1, 2, 3, -1234], "b": [0.1, 0.2222, 0.4, -3.14]}), - DataFrame( + lambda: Series([1, 2, 3, -12, 12, 44]), + lambda: Series([1, 2, 3, -12, 12, 44], dtype="str"), + lambda: DataFrame( {"a": [1, 2, 3, -1234], "b": [0.1, 0.2222, 0.4, -3.14]} - ).index, + ), ], ) @pytest.mark.parametrize("dtype", [None, "float", "int", "str"]) def test_series_dataframe__array__(data, dtype): - gs = data + gs = data() with pytest.raises(TypeError): gs.__array__(dtype=dtype) + + with pytest.raises(TypeError): + gs.index.__array__(dtype=dtype) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 2f10a5dfd74..ac13056fa7c 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import pickle @@ -13,7 +13,16 @@ pytestmark = pytest.mark.spilling -def check_serialization(df): +@pytest.mark.parametrize( + "keys", + [ + np.arange(5, dtype=np.float64), + pd.Categorical(["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]), + ], +) +def test_pickle_dataframe(keys): + rng = np.random.default_rng(seed=0) + df = DataFrame({"keys": keys, "vals": rng.random(len(keys))}) # basic assert_frame_picklable(df) # sliced @@ -39,28 +48,6 @@ def assert_frame_picklable(df): assert_eq(loaded, df) -def test_pickle_dataframe_numeric(): - rng = np.random.default_rng(seed=0) - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = rng.random(nelem) - - check_serialization(df) - - -def test_pickle_dataframe_categorical(): - rng = np.random.default_rng(seed=0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = rng.random(len(df)) - - check_serialization(df) - - def test_memory_usage_dataframe(): rng = np.random.default_rng(seed=0) df = DataFrame() diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index 84de2ac38e7..8b2f5acb3e1 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
import re @@ -93,14 +93,9 @@ def test_quantile_type_int_float(interpolation): assert type(expected) is type(actual) -@pytest.mark.parametrize( - "data", - [ - [float("nan"), float("nan"), 0.9], - [float("nan"), float("nan"), float("nan")], - ], -) -def test_ignore_nans(data): +@pytest.mark.parametrize("val", [0.9, float("nan")]) +def test_ignore_nans(val): + data = [float("nan"), float("nan"), val] psr = pd.Series(data) gsr = cudf.Series(data, nan_as_null=False) From 1ba88355499e6c811180503ba1df74d9ffab9d8b Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 23 Jul 2025 18:17:43 -0400 Subject: [PATCH 008/366] Fix missing return in StringFunction.Strptime strict=True path (#19464) When `strict=True` and all strings are valid timestamps, we incorrectly hit the fall through case (ie. Strptime function is not implemented). Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19464 --- .../cudf_polars/dsl/expressions/string.py | 24 +++++++------ .../tests/expressions/test_stringfunction.py | 34 +++++++++---------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 0b65b5fbd8a..1e70e855cd5 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -692,13 +692,14 @@ def do_evaluate( ) elif self.name is StringFunction.Name.Strptime: # TODO: ignores ambiguous - format, strict, exact, cache = self.options + format, strict, _, _ = self.options col = self.children[0].evaluate(df, context=context) is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( col.obj, format ) + plc_col = col.obj if strict: if not plc.reduce.reduce( is_timestamps, @@ -710,16 +711,17 @@ def do_evaluate( not_timestamps = plc.unary.unary_operation( is_timestamps, plc.unary.UnaryOperator.NOT ) - null = plc.Scalar.from_py(None, col.obj.type()) - res = plc.copying.boolean_mask_scatter( - [null], plc.Table([col.obj]), not_timestamps - ) - return Column( - plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype.plc, format - ), - dtype=self.dtype, - ) + null = plc.Scalar.from_py(None, plc_col.type()) + plc_col = plc.copying.boolean_mask_scatter( + [null], plc.Table([plc_col]), not_timestamps + ).columns()[0] + + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + plc_col, self.dtype.plc, format + ), + dtype=self.dtype, + ) elif self.name is StringFunction.Name.Replace: column, target, repl = columns n, _ = self.options diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index e9b96b2df62..6515e487e7e 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -256,32 +256,32 @@ def test_split_exact_inclusive_unsupported(ldf_split): assert_ir_translation_raises(q, NotImplementedError) -@pytest.fixture -def to_datetime_data(): - return pl.LazyFrame( - { - "a": [ - "2021-01-01", - "2021-01-02", - "abcd", - ] - } - ) - - @pytest.mark.parametrize("cache", [True, False], ids=lambda cache: f"{cache=}") @pytest.mark.parametrize("strict", [True, False], ids=lambda strict: f"{strict=}") @pytest.mark.parametrize("exact", [True, False], ids=lambda exact: f"{exact=}") 
@pytest.mark.parametrize("format", ["%Y-%m-%d", None], ids=lambda format: f"{format=}") -def test_to_datetime(to_datetime_data, cache, strict, format, exact): - q = to_datetime_data.select( +@pytest.mark.parametrize( + "values, has_invalid_row", + [ + (["2024-01-01", "2023-12-31", None], False), + (["2024-01-01", "foo", None], True), + ], + ids=["valid", "invalid"], +) +def test_to_datetime(values, has_invalid_row, cache, strict, format, exact): + df = pl.DataFrame({"a": values}) + q = df.lazy().select( pl.col("a").str.strptime( - pl.Datetime("ns"), format=format, cache=cache, strict=strict, exact=exact + pl.Datetime("ns"), + format=format, + cache=cache, + strict=strict, + exact=exact, ) ) if cache or format is None or not exact: assert_ir_translation_raises(q, NotImplementedError) - elif strict: + elif strict and has_invalid_row: assert_collect_raises( q, polars_except=pl.exceptions.InvalidOperationError, From 652edce46f8c516df59922002d6f1c7d7dd69f83 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 24 Jul 2025 10:17:32 -0500 Subject: [PATCH 009/366] Allow latest OS in devcontainers (#19480) This PR removes the OS suffix from devcontainers, allowing the upstream devcontainer images to determine the OS version. Contributes to https://github.com/rapidsai/build-planning/issues/200. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/cudf/pull/19480 --- .devcontainer/cuda12.9-conda/devcontainer.json | 2 +- .devcontainer/cuda12.9-pip/devcontainer.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 5c010923260..1ed542f11f3 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" } }, "runArgs": [ diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 666e6b872f6..3b35d4398c5 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9" } }, "runArgs": [ From a9acbd46b22e36ae28dd6cd86b8ffb7847301aeb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 24 Jul 2025 10:05:20 -0700 Subject: [PATCH 010/366] Use more pytest fixtures and avoid GPU parameterization in test_indexing/joining/monotonic/multiindex.py (#19437) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids pytest.mark.parametrize with GPU objects * Adds some `copy()`s in order not to mutate pytest fixtures Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19437 --- python/cudf/cudf/tests/test_indexing.py | 93 +++---- python/cudf/cudf/tests/test_joining.py | 111 ++++---- python/cudf/cudf/tests/test_monotonic.py | 33 ++- python/cudf/cudf/tests/test_multiindex.py | 302 +++++++++++----------- 4 files changed, 255 insertions(+), 284 deletions(-) diff --git 
a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 281f1d014a1..6d373f56b14 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -819,19 +819,19 @@ def test_empty_boolean_mask(dtype): ], ) @pytest.mark.parametrize( - "mask", + "mask_vals", [ [True, True, True, True], [False, False, False, False], [True, False, True, False], [True, False, False, True], - np.array([True, False, True, False]), - pd.Series([True, False, True, False]), - cudf.Series([True, False, True, False]), ], ) +@pytest.mark.parametrize( + "mask_class", [list, np.array, pd.Series, cudf.Series] +) @pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) -def test_series_apply_boolean_mask(data, mask, nulls): +def test_series_apply_boolean_mask(data, mask_vals, mask_class, nulls): rng = np.random.default_rng(seed=0) psr = pd.Series(data) @@ -853,6 +853,7 @@ def test_series_apply_boolean_mask(data, mask, nulls): if psr.dtype == "object" and nulls == "all": gsr = cudf.Series([None, None, None, None], dtype="object") + mask = mask_class(mask_vals) if isinstance(mask, cudf.Series): expect = psr[mask.to_pandas()] else: @@ -874,11 +875,6 @@ def test_dataframe_apply_boolean_mask(): assert_eq(pdf[[True, False, True, False]], gdf[[True, False, True, False]]) -""" -This test compares cudf and Pandas DataFrame boolean indexing. -""" - - @pytest.mark.parametrize( "mask_fn", [lambda x: x, lambda x: np.array(x), lambda x: pd.Series(x)] ) @@ -1146,6 +1142,8 @@ def test_series_setitem_loc_numeric_index(key, value): ) def test_dataframe_setitem_iloc(key, value, pdf_gdf): pdf, gdf = pdf_gdf + pdf = pdf.copy() + gdf = gdf.copy() pdf.iloc[key] = value gdf.iloc[key] = value assert_eq(pdf, gdf) @@ -1172,6 +1170,8 @@ def test_dataframe_setitem_iloc(key, value, pdf_gdf): ) def test_dataframe_setitem_loc(key, value, pdf_gdf): pdf, gdf = pdf_gdf + pdf = pdf.copy() + gdf = gdf.copy() pdf.loc[key] = value gdf.loc[key] = value assert_eq(pdf, gdf) @@ -1202,6 +1202,8 @@ def test_dataframe_setitem_loc_empty_df(key, value): ) def test_dataframe_setitem_iloc_multiindex(key, value, pdf_gdf_multi): pdf, gdf = pdf_gdf_multi + pdf = pdf.copy() + gdf = gdf.copy() pdf.iloc[key] = value gdf.iloc[key] = value @@ -1386,8 +1388,8 @@ def test_dataframe_sliced(gdf_kwargs, slice): @pytest.mark.parametrize( "gdf", [ - cudf.DataFrame({"a": range(10000)}), - cudf.DataFrame( + lambda: cudf.DataFrame({"a": range(10000)}), + lambda: cudf.DataFrame( { "a": range(10000), "b": range(10000), @@ -1397,21 +1399,23 @@ def test_dataframe_sliced(gdf_kwargs, slice): "f": range(10000), } ), - cudf.DataFrame({"a": range(20), "b": range(20)}), - cudf.DataFrame( + lambda: cudf.DataFrame({"a": range(20), "b": range(20)}), + lambda: cudf.DataFrame( { "a": range(20), "b": range(20), "c": ["abc", "def", "xyz", "def", "pqr"] * 4, } ), - cudf.DataFrame(index=[1, 2, 3]), - cudf.DataFrame(index=range(10000)), - cudf.DataFrame(columns=["a", "b", "c", "d"]), - cudf.DataFrame(columns=["a"], index=range(10000)), - cudf.DataFrame(columns=["a", "col2", "...col n"], index=range(10000)), - cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), - cudf.DataFrame( + lambda: cudf.DataFrame(index=[1, 2, 3]), + lambda: cudf.DataFrame(index=range(10000)), + lambda: cudf.DataFrame(columns=["a", "b", "c", "d"]), + lambda: cudf.DataFrame(columns=["a"], index=range(10000)), + lambda: cudf.DataFrame( + columns=["a", "col2", "...col n"], index=range(10000) + ), + lambda: 
cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), + lambda: cudf.DataFrame( columns=["a", "b", "c", "d"], index=cudf.Series(range(10000)).astype("str"), ), @@ -1422,6 +1426,7 @@ def test_dataframe_sliced(gdf_kwargs, slice): [slice(6), slice(1), slice(7), slice(1, 3)], ) def test_dataframe_iloc_index(gdf, slice): + gdf = gdf() pdf = gdf.to_pandas() actual = gdf.iloc[:, slice] @@ -1595,26 +1600,11 @@ def test_dataframe_iloc_inplace_update(key, value): assert_eq(expected, actual) -@pytest.mark.parametrize( - "loc_key", - [([0, 2], ["x", "y"])], -) -@pytest.mark.parametrize( - "iloc_key", - [[0, 2]], -) -@pytest.mark.parametrize( - ("data, index"), - [ - ( - {"x": [10, 20], "y": [30, 40]}, - [0, 2], - ) - ], -) -def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe( - loc_key, iloc_key, data, index -): +def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe(): + loc_key = ([0, 2], ["x", "y"]) + iloc_key = [0, 2] + data = {"x": [10, 20], "y": [30, 40]} + index = [0, 2] gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) pdf = gdf.to_pandas() @@ -1691,16 +1681,18 @@ def test_dataframe_iloc_inplace_update_shape_mismatch_RHS_df(): @pytest.mark.parametrize( - "array,is_error", + "end, second_dim, is_error", [ - (cupy.arange(20, 40).reshape(-1, 2), False), - (cupy.arange(20, 50).reshape(-1, 3), True), - (np.arange(20, 40).reshape(-1, 2), False), - (np.arange(20, 30).reshape(-1, 1), False), - (cupy.arange(20, 30).reshape(-1, 1), False), + (40, 2, False), + (50, 3, True), + (30, 1, False), ], ) -def test_dataframe_indexing_setitem_np_cp_array(array, is_error): +@pytest.mark.parametrize("mod", [cupy, np]) +def test_dataframe_indexing_setitem_np_cp_array( + end, second_dim, is_error, mod +): + array = mod.arange(20, end).reshape(-1, second_dim) gdf = cudf.DataFrame({"a": range(10), "b": range(10)}) pdf = gdf.to_pandas() if not is_error: @@ -2311,12 +2303,11 @@ def test_loc_datetime_monotonic_with_ts(data, scalar): assert_eq(actual, expected) -@pytest.mark.parametrize("data", [[15, 14, 3, 10, 1]]) @pytest.mark.parametrize("scalar", [1, 10, 15, 14, 0, 2]) -def test_loc_datetime_random_with_ts(data, scalar): +def test_loc_datetime_random_with_ts(scalar): gdf = cudf.DataFrame( {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5]}, - index=cudf.Index(data, dtype="datetime64[ns]"), + index=cudf.Index([15, 14, 3, 10, 1], dtype="datetime64[ns]"), ) pdf = gdf.to_pandas() diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index d893974e610..bb24111cfc3 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -19,45 +19,35 @@ ) from cudf.utils.dtypes import find_common_type -_JOIN_TYPES = ( - "left", - "inner", - "outer", - "right", - "leftanti", - "leftsemi", - "cross", -) - -def make_params(): - rng = np.random.default_rng(seed=0) - - hows = _JOIN_TYPES +@pytest.fixture( + params=( + "left", + "inner", + "outer", + "right", + "leftanti", + "leftsemi", + "cross", + ) +) +def how(request): + return request.param - # Test specific cases (1) - aa = [0, 0, 4, 5, 5] - bb = [0, 0, 2, 3, 5] - for how in hows: - yield (aa, bb, how) - # Test specific cases (2) - aa = [0, 0, 1, 2, 3] - bb = [0, 1, 2, 2, 3] - for how in hows: - yield (aa, bb, how) +rng = np.random.default_rng(seed=0) - # Test large random integer inputs - aa = rng.integers(0, 50, 100) - bb = rng.integers(0, 50, 100) - for how in hows: - yield (aa, bb, how) - # Test floating point inputs - aa = rng.random(50) - bb = rng.random(50) - for how in hows: - 
yield (aa, bb, how) +@pytest.fixture( + params=[ + [[0, 0, 4, 5, 5], [0, 0, 2, 3, 5]], + [[0, 0, 1, 2, 3], [0, 1, 2, 2, 3]], + [rng.integers(0, 50, 100), rng.integers(0, 50, 100)], + [rng.random(50), rng.random(50)], + ] +) +def aa_bb(request): + return request.param def pd_odd_joins(left, right, join_type): @@ -68,8 +58,6 @@ def pd_odd_joins(left, right, join_type): def assert_join_results_equal(expect, got, how, **kwargs): - if how not in _JOIN_TYPES: - raise ValueError(f"Unrecognized join type {how}") if how == "right": got = got[expect.columns] @@ -98,11 +86,14 @@ def assert_join_results_equal(expect, got, how, **kwargs): raise ValueError(f"Not a join result: {type(expect).__name__}") -@pytest.mark.parametrize("aa,bb,how", make_params()) -def test_dataframe_join_how(aa, bb, how): - df = cudf.DataFrame() - df["a"] = aa - df["b"] = bb +def test_dataframe_join_how(aa_bb, how): + aa, bb = aa_bb + df = cudf.DataFrame( + { + "a": aa, + "b": bb, + } + ) def work_pandas(df, how): df1 = df.set_index("a") @@ -149,21 +140,19 @@ def work_gdf(df): # _sorted_check_series(expect['b'], expect['a'], got['b'], # got['a']) else: + magic = 0xDEADBEAF for c in expecto.columns: - _check_series(expecto[c].fillna(-1), goto[c].fillna(-1)) + expect = expecto[c].fillna(-1) + got = goto[c].fillna(-1) - -def _check_series(expect, got): - magic = 0xDEADBEAF - - direct_equal = np.all(expect.values == got.to_numpy()) - nanfilled_equal = np.all( - expect.fillna(magic).values == got.fillna(magic).to_numpy() - ) - msg = "direct_equal={}, nanfilled_equal={}".format( - direct_equal, nanfilled_equal - ) - assert direct_equal or nanfilled_equal, msg + direct_equal = np.all(expect.values == got.to_numpy()) + nanfilled_equal = np.all( + expect.fillna(magic).values == got.fillna(magic).to_numpy() + ) + msg = "direct_equal={}, nanfilled_equal={}".format( + direct_equal, nanfilled_equal + ) + assert direct_equal or nanfilled_equal, msg @pytest.mark.skipif( @@ -610,11 +599,9 @@ def test_indicator(): gdf = cudf.DataFrame({"x": [1, 2, 1]}) gdf.merge(gdf, indicator=False) - with pytest.raises(NotImplementedError) as info: + with pytest.raises(NotImplementedError, match=".*indicator=False.*"): gdf.merge(gdf, indicator=True) - assert "indicator=False" in str(info.value) - def test_merge_suffixes(): pdf = cudf.DataFrame({"x": [1, 2, 1]}) @@ -1787,15 +1774,15 @@ def test_typecast_on_join_indexes_matching_categorical(): @pytest.mark.parametrize( "lhs", [ - cudf.Series([1, 2, 3], name="a"), - cudf.DataFrame({"a": [2, 3, 4], "c": [4, 5, 6]}), + lambda: cudf.Series([1, 2, 3], name="a"), + lambda: cudf.DataFrame({"a": [2, 3, 4], "c": [4, 5, 6]}), ], ) @pytest.mark.parametrize( "rhs", [ - cudf.Series([1, 2, 3], name="b"), - cudf.DataFrame({"b": [2, 3, 4], "c": [4, 5, 6]}), + lambda: cudf.Series([1, 2, 3], name="b"), + lambda: cudf.DataFrame({"b": [2, 3, 4], "c": [4, 5, 6]}), ], ) @pytest.mark.parametrize( @@ -1816,6 +1803,8 @@ def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): ): pytest.skip("Index joins not compatible with leftsemi and leftanti") + lhs = lhs() + rhs = rhs() check_lhs = lhs.copy() check_rhs = rhs.copy() if isinstance(lhs, cudf.Series): diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index a34c89f55d3..ae5f1e1c90c 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
""" Tests related to is_unique, is_monotonic_increasing & @@ -12,7 +12,6 @@ import cudf from cudf import Index, MultiIndex, Series from cudf.core.index import CategoricalIndex, DatetimeIndex, RangeIndex -from cudf.testing import assert_eq @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) @@ -299,22 +298,20 @@ def test_get_slice_bound_missing_str(label, side): assert got == expect -testdata = [ - ( - Series(["2018-01-01", "2019-01-31", None], dtype="datetime64[ms]"), - False, - ), - (Series([1, 2, 3, None]), False), - (Series([None, 1, 2, 3]), False), - (Series(["a", "b", "c", None]), False), - (Series([None, "a", "b", "c"]), False), -] - - -@pytest.mark.parametrize("data, expected", testdata) -def test_is_monotonic_always_falls_for_null(data, expected): - assert_eq(expected, data.is_monotonic_increasing) - assert_eq(expected, data.is_monotonic_decreasing) +@pytest.mark.parametrize( + "data", + [ + [pd.Timestamp("2018-01-01"), pd.Timestamp("2019-01-31"), None], + [1, 2, 3, None], + [None, 1, 2, 3], + ["a", "b", "c", None], + [None, "a", "b", "c"], + ], +) +def test_is_monotonic_always_falls_for_null(data): + ser = Series(data) + assert ser.is_monotonic_increasing is False + assert ser.is_monotonic_decreasing is False @pytest.mark.parametrize("box", [Series, Index]) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 39315b6198e..4c1f5259c61 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -10,6 +10,7 @@ import pickle import re from contextlib import contextmanager +from functools import reduce from io import BytesIO import cupy as cp @@ -249,12 +250,14 @@ def pdfIndexNulls(): def test_from_pandas(pdf, pdfIndex): + pdf = pdf.copy(deep=False) pdf.index = pdfIndex gdf = cudf.from_pandas(pdf) assert_eq(pdf, gdf) def test_multiindex_transpose(pdf, pdfIndex): + pdf = pdf.copy(deep=False) pdf.index = pdfIndex gdf = cudf.from_pandas(pdf) assert_eq(pdf.transpose(), gdf.transpose()) @@ -284,6 +287,8 @@ def test_series_multiindex(pdfIndex): def test_multiindex_take(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex assert_eq(pdf.index.take([0]), gdf.index.take([0])) @@ -302,6 +307,8 @@ def test_multiindex_take(pdf, gdf, pdfIndex): def test_multiindex_getitem(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex assert_eq(pdf.index[0], gdf.index[0]) @@ -336,6 +343,8 @@ def test_multiindex_getitem(pdf, gdf, pdfIndex): def test_multiindex_loc(pdf, gdf, pdfIndex, key_tuple): gdfIndex = cudf.from_pandas(pdfIndex) assert_eq(pdfIndex, gdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex # The index is unsorted, which makes things slow but is fine for testing. 
@@ -349,14 +358,9 @@ def test_multiindex_loc(pdf, gdf, pdfIndex, key_tuple): assert_eq(expected, got) -@pytest.mark.parametrize( - "indexer", - [ - (([1, 1], [0, 1]), slice(None)), - (([1, 1], [1, 0]), slice(None)), - ], -) -def test_multiindex_compatible_ordering(indexer): +@pytest.mark.parametrize("second_val", [[0, 1], [1, 0]]) +def test_multiindex_compatible_ordering(second_val): + indexer = (([1, 1], second_val), slice(None)) df = pd.DataFrame( {"a": [1, 1, 2, 3], "b": [1, 0, 1, 1], "c": [1, 2, 3, 4]} ).set_index(["a", "b"]) @@ -376,15 +380,16 @@ def test_multiindex_compatible_ordering(indexer): slice(None), ], ) -def test_multiindex_loc_slice(pdf, gdf, pdfIndex, arg): +def test_multiindex_loc_slice(pdf, pdfIndex, arg): gdf = cudf.from_pandas(pdf) gdfIndex = cudf.from_pandas(pdfIndex) + pdf = pdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex assert_eq(pdf.loc[arg], gdf.loc[arg]) -def test_multiindex_loc_errors(pdf, gdf, pdfIndex): +def test_multiindex_loc_errors(pdf, pdfIndex): gdf = cudf.from_pandas(pdf) gdfIndex = cudf.from_pandas(pdfIndex) gdf.index = gdfIndex @@ -402,6 +407,8 @@ def test_multiindex_loc_errors(pdf, gdf, pdfIndex): def test_multiindex_loc_then_column(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) assert_eq(pdfIndex, gdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex # The index is unsorted, which makes things slow but is fine for testing. @@ -413,6 +420,8 @@ def test_multiindex_loc_then_column(pdf, gdf, pdfIndex): def test_multiindex_loc_rows_0(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex @@ -426,6 +435,8 @@ def test_multiindex_loc_rows_0(pdf, gdf, pdfIndex): def test_multiindex_loc_rows_1_2_key(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex assert_eq(pdf.loc[("c", "forest"), :], gdf.loc[("c", "forest"), :]) @@ -433,6 +444,8 @@ def test_multiindex_loc_rows_1_2_key(pdf, gdf, pdfIndex): def test_multiindex_loc_rows_1_1_key(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex assert_eq(pdf.loc[("c",), :], gdf.loc[("c",), :]) @@ -467,7 +480,8 @@ def test_multiindex_column_shape(): ("c", "forest", "clear"), ], ) -def test_multiindex_columns(pdf, gdf, pdfIndex, query): +def test_multiindex_columns(pdf, pdfIndex, query): + pdf = pdf.copy(deep=False) pdf = pdf.T gdf = cudf.from_pandas(pdf) gdfIndex = cudf.from_pandas(pdfIndex) @@ -490,8 +504,6 @@ def test_multiindex_from_tuples(): def test_multiindex_from_dataframe(): - if not hasattr(pd.MultiIndex([[]], [[]]), "codes"): - pytest.skip() pdf = pd.DataFrame( [["a", "house"], ["a", "store"], ["b", "house"], ["b", "store"]] ) @@ -690,49 +702,44 @@ def test_multiindex_equals(): assert_eq(mi1.equals(mi2), False) -@pytest.mark.parametrize( - "data", - [ - { - "Date": [ - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - ], - "Close": [ - 3400.00, - 3401.80, - 3450.96, - 226.58, - 228.91, - 225.53, - 505.13, - 525.91, - 534.98, - ], - "Symbol": [ - "AMZN", - "AMZN", - "AMZN", - "MSFT", - "MSFT", - "MSFT", - "NVDA", - "NVDA", - "NVDA", - ], - } - ], -) -@pytest.mark.parametrize("names", [["X", "Y"]]) -def 
test_multiindex_copy_sem(data, names): +def test_multiindex_copy_sem(): """Test semantic equality for MultiIndex.copy""" + names = ["X", "Y"] + data = { + "Date": [ + "2020-08-27", + "2020-08-28", + "2020-08-31", + "2020-08-27", + "2020-08-28", + "2020-08-31", + "2020-08-27", + "2020-08-28", + "2020-08-31", + ], + "Close": [ + 3400.00, + 3401.80, + 3450.96, + 226.58, + 228.91, + 225.53, + 505.13, + 525.91, + 534.98, + ], + "Symbol": [ + "AMZN", + "AMZN", + "AMZN", + "MSFT", + "MSFT", + "MSFT", + "NVDA", + "NVDA", + "NVDA", + ], + } gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -795,7 +802,7 @@ def test_multiindex_copy_sem(data, names): "NVDA", ], }, - cudf.MultiIndex( + pd.MultiIndex( levels=[[1001, 1002], [2001, 2002]], codes=[[1, 1, 0, 0], [0, 1, 0, 1]], names=["col1", "col2"], @@ -809,58 +816,57 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): Case1: Constructed from GroupBy, StringColumns Case2: Constructed from MultiIndex, NumericColumns """ - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", copy_on_write) - - if isinstance(data, dict): - import operator - from functools import reduce - - gdf = cudf.DataFrame(data) - mi1 = gdf.groupby(["Date", "Symbol"]).mean().index - mi2 = mi1.copy(deep=deep) - - lchildren = [col.children for col in mi1._columns] - rchildren = [col.children for col in mi2._columns] - - # Flatten - lchildren = reduce(operator.add, lchildren) - rchildren = reduce(operator.add, rchildren) - - lptrs = [child.base_data.get_ptr(mode="read") for child in lchildren] - rptrs = [child.base_data.get_ptr(mode="read") for child in rchildren] - - assert all((x == y) for x, y in zip(lptrs, rptrs)) - - elif isinstance(data, cudf.MultiIndex): - same_ref = (not deep) or ( - cudf.get_option("copy_on_write") and not deep - ) - mi1 = data - mi2 = mi1.copy(deep=deep) + with cudf.option_context("copy_on_write", copy_on_write): + if isinstance(data, dict): + gdf = cudf.DataFrame(data) + mi1 = gdf.groupby(["Date", "Symbol"]).mean().index + mi2 = mi1.copy(deep=deep) + + lchildren = [col.children for col in mi1._columns] + rchildren = [col.children for col in mi2._columns] + + # Flatten + lchildren = reduce(operator.add, lchildren) + rchildren = reduce(operator.add, rchildren) + + lptrs = [ + child.base_data.get_ptr(mode="read") for child in lchildren + ] + rptrs = [ + child.base_data.get_ptr(mode="read") for child in rchildren + ] + + assert all((x == y) for x, y in zip(lptrs, rptrs)) + + elif isinstance(data, pd.MultiIndex): + data = cudf.MultiIndex.from_pandas(data) + same_ref = (not deep) or ( + cudf.get_option("copy_on_write") and not deep + ) + mi1 = data + mi2 = mi1.copy(deep=deep) - # Assert ._levels identity - lptrs = [ - lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels - ] - rptrs = [ - lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels - ] + # Assert ._levels identity + lptrs = [ + lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels + ] + rptrs = [ + lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels + ] - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - # Assert ._codes identity - lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] - rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] + # Assert ._codes identity + lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] + rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] - assert all((x == y) == 
same_ref for x, y in zip(lptrs, rptrs)) + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - # Assert ._data identity - lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] - rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] + # Assert ._data identity + lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] + rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - cudf.set_option("copy_on_write", original_cow_setting) + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) @pytest.mark.parametrize( @@ -894,6 +900,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): def test_multiindex_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): gdfIndex = cudf.from_pandas(pdfIndex) assert_eq(pdfIndex, gdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex presult = pdf.iloc[iloc_rows, iloc_columns] @@ -950,6 +958,8 @@ def test_multiindex_iloc_scalar(): def test_multicolumn_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): gdfIndex = cudf.from_pandas(pdfIndex) assert_eq(pdfIndex, gdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex pdf = pdf.T @@ -1001,6 +1011,8 @@ def test_multiindex_groupby_to_frame(): def test_multiindex_reset_index(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex assert_eq(pdf.reset_index(), gdf.reset_index()) @@ -1045,6 +1057,8 @@ def test_multiindex_multicolumn_reset_index(): def test_groupby_multiindex_columns_from_pandas(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex assert_eq(gdf, pdf) @@ -1053,6 +1067,8 @@ def test_groupby_multiindex_columns_from_pandas(pdf, gdf, pdfIndex): def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) + pdf = pdf.copy(deep=False) + gdf = gdf.copy(deep=False) pdf.index = pdfIndex gdf.index = gdfIndex # The index is unsorted, which makes things slow but is fine for testing. 
@@ -1101,6 +1117,7 @@ def test_multiindex_multicolumn_zero_row_slice(): def test_multicolumn_loc(pdf, pdfIndex): + pdf = pdf.copy(deep=False) pdf = pdf.T pdf.columns = pdfIndex gdf = cudf.from_pandas(pdf) @@ -1114,6 +1131,7 @@ def test_multicolumn_loc(pdf, pdfIndex): reason="https://github.com/pandas-dev/pandas/issues/43351", ) def test_multicolumn_set_item(pdf, pdfIndex): + pdf = pdf.copy(deep=False) pdf = pdf.T pdf.columns = pdfIndex gdf = cudf.from_pandas(pdf) @@ -1181,39 +1199,39 @@ def test_multiindex_to_numpy(): "gdi, fill_value, expected", [ ( - cudf.MultiIndex( + lambda: cudf.MultiIndex( levels=[[1, 3, 4, None], [1, 2, 5]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], names=["x", "y"], ), 5, - cudf.MultiIndex( + lambda: cudf.MultiIndex( levels=[[1, 3, 4, 5], [1, 2, 5]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], names=["x", "y"], ), ), ( - cudf.MultiIndex( + lambda: cudf.MultiIndex( levels=[[1, 3, 4, None], [1, None, 5]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], names=["x", "y"], ), 100, - cudf.MultiIndex( + lambda: cudf.MultiIndex( levels=[[1, 3, 4, 100], [1, 100, 5]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], names=["x", "y"], ), ), ( - cudf.MultiIndex( + lambda: cudf.MultiIndex( levels=[["a", "b", "c", None], ["1", None, "5"]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], names=["x", "y"], ), "100", - cudf.MultiIndex( + lambda: cudf.MultiIndex( levels=[["a", "b", "c", "100"], ["1", "100", "5"]], codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], names=["x", "y"], @@ -1222,7 +1240,7 @@ def test_multiindex_to_numpy(): ], ) def test_multiindex_fillna(gdi, fill_value, expected): - assert_eq(expected, gdi.fillna(fill_value)) + assert_eq(expected(), gdi().fillna(fill_value)) @pytest.mark.parametrize( @@ -1478,16 +1496,13 @@ def test_multiindex_argsort(pdi, ascending): assert_eq(expected, actual) -@pytest.mark.parametrize( - "idx", [pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]])] -) @pytest.mark.parametrize( "names", [[None, None], ["a", None], ["new name", "another name"]] ) @pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_set_names(idx, names, inplace): - pi = idx.copy() - gi = cudf.from_pandas(idx) +def test_multiindex_set_names(names, inplace): + pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]) + gi = cudf.from_pandas(pi) expected = pi.set_names(names=names, inplace=inplace) actual = gi.set_names(names=names, inplace=inplace) @@ -1498,18 +1513,7 @@ def test_multiindex_set_names(idx, names, inplace): assert_eq(expected, actual) -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]] - ), - pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], - names=[1, 0, 2], - ), - ], -) +@pytest.mark.parametrize("idx_names", [[None, None, None], [1, 0, 2]]) @pytest.mark.parametrize( "level, names", [ @@ -1523,10 +1527,12 @@ def test_multiindex_set_names(idx, names, inplace): ) @pytest.mark.parametrize("inplace", [True, False]) def test_multiindex_set_names_default_and_int_names( - idx, level, names, inplace + idx_names, level, names, inplace ): - pi = idx.copy() - gi = cudf.from_pandas(idx) + pi = pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], names=idx_names + ) + gi = cudf.from_pandas(pi) expected = pi.set_names(names=names, level=level, inplace=inplace) actual = gi.set_names(names=names, level=level, inplace=inplace) @@ -1537,15 +1543,6 @@ def test_multiindex_set_names_default_and_int_names( assert_eq(expected, 
actual) -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], - names=["one", None, "three"], - ), - ], -) @pytest.mark.parametrize( "level, names", [ @@ -1560,9 +1557,12 @@ def test_multiindex_set_names_default_and_int_names( ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_set_names_string_names(idx, level, names, inplace): - pi = idx.copy() - gi = cudf.from_pandas(idx) +def test_multiindex_set_names_string_names(level, names, inplace): + pi = pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], + names=["one", None, "three"], + ) + gi = cudf.from_pandas(pi) expected = pi.set_names(names=names, level=level, inplace=inplace) actual = gi.set_names(names=names, level=level, inplace=inplace) @@ -1590,15 +1590,7 @@ def test_multiindex_set_names_error(level, names): ) -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]), - pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019]], names=["old name", None] - ), - ], -) +@pytest.mark.parametrize("name", [None, "old name"]) @pytest.mark.parametrize( "names", [ @@ -1611,9 +1603,11 @@ def test_multiindex_set_names_error(level, names): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_rename(idx, names, inplace): - pi = idx.copy() - gi = cudf.from_pandas(idx) +def test_multiindex_rename(name, names, inplace): + pi = pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019]], names=[name, None] + ) + gi = cudf.from_pandas(pi) expected = pi.rename(names=names, inplace=inplace) actual = gi.rename(names=names, inplace=inplace) From d171390f5ec640b57404097e2b8d078da12089c6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 24 Jul 2025 13:00:28 -0700 Subject: [PATCH 011/366] Move test_testing.py to new cudf classic test directory structure (#19481) Towards https://github.com/rapidsai/cudf/issues/9999 * Adding more shared fixtures in conftest.py where applicable * Further simplify tests Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19481 --- python/cudf/cudf/testing/_utils.py | 44 ++ python/cudf/cudf/tests/conftest.py | 74 +++ .../tests/general_utilities/test_testing.py | 1 - python/cudf/cudf/tests/test_testing.py | 438 ------------------ .../__init__.py | 0 python/cudf/cudf/tests/testing/conftest.py | 27 ++ .../tests/testing/test_assert_column_equal.py | 95 ++++ .../cudf/cudf/tests/testing/test_assert_eq.py | 45 ++ .../tests/testing/test_assert_frame_equal.py | 61 +++ .../tests/testing/test_assert_index_equal.py | 85 ++++ .../tests/testing/test_assert_series_equal.py | 81 ++++ 11 files changed, 512 insertions(+), 439 deletions(-) delete mode 100644 python/cudf/cudf/tests/general_utilities/test_testing.py delete mode 100644 python/cudf/cudf/tests/test_testing.py rename python/cudf/cudf/tests/{general_utilities => testing}/__init__.py (100%) create mode 100644 python/cudf/cudf/tests/testing/conftest.py create mode 100644 python/cudf/cudf/tests/testing/test_assert_column_equal.py create mode 100644 python/cudf/cudf/tests/testing/test_assert_eq.py create mode 100644 python/cudf/cudf/tests/testing/test_assert_frame_equal.py create mode 100644 python/cudf/cudf/tests/testing/test_assert_index_equal.py create mode 100644 python/cudf/cudf/tests/testing/test_assert_series_equal.py 
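The centerpiece of this patch is the `assert_asserters_equal` helper added to
`_utils.py` below: it runs the pandas asserter first, then requires the cudf
asserter to reproduce whatever outcome pandas produced (success, or a raised
AssertionError/TypeError). A minimal, runnable sketch of the pattern — the
series pair here is a stand-in; any equivalent pandas/cudf pair works:

    import pandas as pd

    import cudf
    from cudf.testing import assert_series_equal
    from cudf.testing._utils import assert_asserters_equal

    # This pair compares unequal, so the pandas asserter raises and the
    # helper then requires the cudf asserter to raise as well.
    psr1 = pd.Series([1, 2, 3], name="a")
    psr2 = pd.Series([1, 2, 4], name="b")
    sr1 = cudf.from_pandas(psr1)
    sr2 = cudf.from_pandas(psr2)

    # One call replaces the hand-rolled try/except/pytest.raises block
    # that each of the deleted tests duplicated.
    assert_asserters_equal(
        pd.testing.assert_series_equal,
        assert_series_equal,
        psr1,
        psr2,
        sr1,
        sr2,
        check_names=True,
    )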
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index 43a3ae44f06..fa0bf52279e 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -330,6 +330,50 @@ def assert_column_memory_ne(lhs: ColumnBase, rhs: ColumnBase):
     raise AssertionError("lhs and rhs holds the same memory.")
 
 
+def assert_asserters_equal(
+    pandas_asserter,
+    cudf_asserter,
+    pandas_left,
+    pandas_right,
+    cudf_left,
+    cudf_right,
+    *args,
+    **kwargs,
+):
+    """
+    Assert that a pandas and cudf asserter have equivalent behavior.
+
+    Parameters
+    ----------
+    pandas_asserter : callable
+        A pandas asserter function.
+    cudf_asserter : callable
+        A cudf asserter function.
+    pandas_left : object
+        A pandas object as the left argument to the pandas asserter.
+    pandas_right : object
+        A pandas object as the right argument to the pandas asserter.
+    cudf_left : object
+        A cudf object as the left argument to the cudf asserter.
+    cudf_right : object
+        A cudf object as the right argument to the cudf asserter.
+    *args : tuple
+        Additional arguments to pass to both asserters.
+    **kwargs : dict
+        Additional keyword arguments to pass to both asserters.
+    """
+    # TypeError is raised (erroneously from pandas) when comparing
+    # categorical indices with different categories.
+    exceptions = (AssertionError, TypeError)
+    try:
+        pandas_asserter(pandas_left, pandas_right, *args, **kwargs)
+    except exceptions:
+        with pytest.raises(exceptions):
+            cudf_asserter(cudf_left, cudf_right, *args, **kwargs)
+    else:
+        cudf_asserter(cudf_left, cudf_right, *args, **kwargs)
+
+
 parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
     "left_dtype,right_dtype",
     list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py
index 4b8bd9c4a1c..1d27963a903 100644
--- a/python/cudf/cudf/tests/conftest.py
+++ b/python/cudf/cudf/tests/conftest.py
@@ -194,3 +194,77 @@ def set_decomp_env_vars(monkeypatch, request):
         for key, value in env_vars.items():
             m.setenv(key, value)
     yield
+
+
+signed_integer_types = ["int8", "int16", "int32", "int64"]
+unsigned_integer_types = ["uint8", "uint16", "uint32", "uint64"]
+float_types = ["float32", "float64"]
+datetime_types = [
+    "datetime64[ns]",
+    "datetime64[us]",
+    "datetime64[ms]",
+    "datetime64[s]",
+]
+timedelta_types = [
+    "timedelta64[ns]",
+    "timedelta64[us]",
+    "timedelta64[ms]",
+    "timedelta64[s]",
+]
+string_types = ["str"]
+bool_types = ["bool"]
+category_types = ["category"]
+
+
+@pytest.fixture(params=signed_integer_types)
+def signed_integer_types_as_str(request):
+    """
+    - "int8", "int16", "int32", "int64"
+    """
+    return request.param
+
+
+@pytest.fixture(params=signed_integer_types + unsigned_integer_types)
+def integer_types_as_str(request):
+    """
+    - "int8", "int16", "int32", "int64"
+    - "uint8", "uint16", "uint32", "uint64"
+    """
+    return request.param
+
+
+@pytest.fixture(
+    params=signed_integer_types + unsigned_integer_types + float_types
+)
+def numeric_types_as_str(request):
+    """
+    - "int8", "int16", "int32", "int64"
+    - "uint8", "uint16", "uint32", "uint64"
+    - "float32", "float64"
+    """
+    return request.param
+
+
+@pytest.fixture(
+    params=signed_integer_types
+    + unsigned_integer_types
+    + float_types
+    + datetime_types
+    + timedelta_types
+    + string_types
+    + bool_types
+    + category_types
+)
+def all_supported_types_as_str(request):
+    """
+    - "int8", "int16", "int32", "int64"
+    - "uint8", "uint16", "uint32",
"uint64" + - "float32", "float64" + - "datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]" + - "timedelta64[ns]", "timedelta64[us]", "timedelta64[ms]", "timedelta64[s]" + - "str" + - "category" + - "bool" + """ + return request.param diff --git a/python/cudf/cudf/tests/general_utilities/test_testing.py b/python/cudf/cudf/tests/general_utilities/test_testing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/general_utilities/test_testing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py deleted file mode 100644 index dc14051fe0f..00000000000 --- a/python/cudf/cudf/tests/test_testing.py +++ /dev/null @@ -1,438 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.core.column.column import as_column -from cudf.testing import ( - assert_frame_equal, - assert_index_equal, - assert_series_equal, -) -from cudf.testing._utils import ( - NUMERIC_TYPES, - OTHER_TYPES, - assert_column_memory_eq, - assert_column_memory_ne, -) -from cudf.testing.testing import assert_column_equal, assert_eq - - -@pytest.fixture( - params=[ - pa.array([*range(10)]), - pa.array(["hello", "world", "rapids", "AI"]), - pa.array([[1, 2, 3], [4, 5], [6], [], [7]]), - pa.array([{"f0": "hello", "f1": 42}, {"f0": "world", "f1": 3}]), - ] -) -def arrow_arrays(request): - return request.param - - -@pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]]) -@pytest.mark.parametrize("exact", ["equiv", True, False]) -@pytest.mark.parametrize("check_names", [True, False]) -@pytest.mark.parametrize("rname", ["a", "b"]) -@pytest.mark.parametrize("check_categorical", [True, False]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] -) -def test_basic_assert_index_equal( - rdata, - exact, - check_names, - rname, - check_categorical, - dtype, -): - p_left = pd.Index([1, 2, 3], name="a", dtype=dtype) - p_right = pd.Index(rdata, name=rname, dtype=dtype) - - left = cudf.from_pandas(p_left) - right = cudf.from_pandas(p_right) - - kind = None - try: - pd.testing.assert_index_equal( - p_left, - p_right, - exact=exact, - check_names=check_names, - check_categorical=check_categorical, - ) - except BaseException as e: - kind = type(e) - msg = str(e) - - if kind is not None: - if (kind is TypeError) and ( - msg - == ( - "Categoricals can only be compared " - "if 'categories' are the same." 
- ) - ): - kind = AssertionError - with pytest.raises(kind): - assert_index_equal( - left, - right, - exact=exact, - check_names=check_names, - check_categorical=check_categorical, - ) - else: - assert_index_equal( - left, - right, - exact=exact, - check_names=check_names, - check_categorical=check_categorical, - ) - - -@pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]]) -@pytest.mark.parametrize("check_names", [True, False]) -@pytest.mark.parametrize("rname", ["a", "b"]) -@pytest.mark.parametrize("check_category_order", [True, False]) -@pytest.mark.parametrize("check_categorical", [True, False]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] -) -def test_basic_assert_series_equal( - rdata, - rname, - check_names, - check_category_order, - check_categorical, - dtype, -): - p_left = pd.Series([1, 2, 3], name="a", dtype=dtype) - p_right = pd.Series(rdata, name=rname, dtype=dtype) - - left = cudf.from_pandas(p_left) - right = cudf.from_pandas(p_right) - - kind = None - try: - pd.testing.assert_series_equal( - p_left, - p_right, - check_names=check_names, - check_categorical=check_categorical, - check_category_order=check_category_order, - ) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_series_equal( - left, - right, - check_names=check_names, - check_categorical=check_categorical, - check_category_order=check_category_order, - ) - else: - assert_series_equal( - left, - right, - check_names=check_names, - check_categorical=check_categorical, - check_category_order=check_category_order, - ) - - -@pytest.mark.parametrize( - "other_data", - [ - ["1", "2", "3"], - [[1], [2], [3]], - [{"a": 1}, {"a": 2}, {"a": 3}], - ], -) -def test_assert_column_equal_dtype_edge_cases(other_data): - # string series should be 100% different - # even when the elements are the same - base = as_column([1, 2, 3]) - other = as_column(other_data) - - # for these dtypes, the diff should always be 100% regardless of the values - with pytest.raises( - AssertionError, match=r".*values are different \(100.0 %\).*" - ): - assert_column_equal(base, other, check_dtype=False) - - # the exceptions are the empty and all null cases - assert_column_equal(base.slice(0, 0), other.slice(0, 0), check_dtype=False) - assert_column_equal(other.slice(0, 0), base.slice(0, 0), check_dtype=False) - - base = as_column(cudf.NA, length=len(base), dtype=base.dtype) - other = as_column(cudf.NA, length=len(other), dtype=other.dtype) - - assert_column_equal(base, other, check_dtype=False) - assert_column_equal(other, base, check_dtype=False) - - -@pytest.mark.parametrize( - "rdtype", [["int8", "int16", "int64"], ["int64", "int16", "int8"]] -) -@pytest.mark.parametrize("rname", [["a", "b", "c"], ["b", "c", "a"]]) -@pytest.mark.parametrize("index", [[1, 2, 3], [3, 2, 1]]) -@pytest.mark.parametrize("check_exact", [True, False]) -@pytest.mark.parametrize("check_dtype", [True, False]) -@pytest.mark.parametrize("check_names", [True, False]) -@pytest.mark.parametrize("check_like", [True, False]) -@pytest.mark.parametrize("mismatch", [True, False]) -def test_basic_assert_frame_equal( - rdtype, - rname, - index, - check_exact, - check_dtype, - check_names, - check_like, - mismatch, -): - data = [1, 2, 1] - p_left = pd.DataFrame(index=[1, 2, 3]) - p_left["a"] = np.array(data, dtype="int8") - p_left["b"] = np.array(data, dtype="int16") - if mismatch: - p_left["c"] = np.array([1, 2, 3], dtype="int64") - else: - p_left["c"] = np.array(data, 
dtype="int64") - - p_right = pd.DataFrame(index=index) - for dtype, name in zip(rdtype, rname): - p_right[name] = np.array(data, dtype=dtype) - - left = cudf.from_pandas(p_left) - right = cudf.from_pandas(p_right) - - kind = None - try: - pd.testing.assert_frame_equal( - p_left, - p_right, - check_exact=check_exact, - check_dtype=check_dtype, - check_names=check_names, - check_like=check_like, - ) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_frame_equal( - left, - right, - check_exact=check_exact, - check_dtype=check_dtype, - check_names=check_names, - check_like=check_like, - ) - else: - assert_frame_equal( - left, - right, - check_exact=check_exact, - check_dtype=check_dtype, - check_names=check_names, - check_like=check_like, - ) - - -@pytest.mark.parametrize("rdata", [[0, 1, 2, 3], [0, 1, 2, 4]]) -@pytest.mark.parametrize("check_datetimelike_compat", [True, False]) -def test_datetime_like_compaibility(rdata, check_datetimelike_compat): - psr1 = pd.Series([0, 1, 2, 3], dtype="datetime64[ns]") - psr2 = pd.Series(rdata, dtype="datetime64[ns]").astype("str") - - sr1 = cudf.from_pandas(psr1) - sr2 = cudf.from_pandas(psr2) - - kind = None - try: - pd.testing.assert_series_equal( - psr1, psr2, check_datetimelike_compat=check_datetimelike_compat - ) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_series_equal( - sr1, sr2, check_datetimelike_compat=check_datetimelike_compat - ) - else: - assert_series_equal( - sr1, sr2, check_datetimelike_compat=check_datetimelike_compat - ) - - -@pytest.mark.parametrize( - "rdata", - [ - [[0, 1, 2, 3], ["G", "O", "N", "E"]], - [[0, 1, 2, 4], ["G", "O", "N", "E"]], - ], -) -def test_multiindex_equal(rdata): - pidx1 = pd.MultiIndex.from_arrays( - [[0, 1, 2, 3], ["G", "O", "N", "E"]], names=("n", "id") - ) - pidx2 = pd.MultiIndex.from_arrays(rdata, names=("n", "id")) - - idx1 = cudf.from_pandas(pidx1) - idx2 = cudf.from_pandas(pidx2) - - kind = None - try: - pd.testing.assert_index_equal(pidx1, pidx2) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_index_equal(idx1, idx2) - else: - assert_index_equal(idx1, idx2) - - -@pytest.mark.parametrize("dtype", ["int8", "uint8", "float32"]) -@pytest.mark.parametrize("check_exact", [True, False]) -@pytest.mark.parametrize("check_dtype", [True, False]) -def test_series_different_type_cases(dtype, check_exact, check_dtype): - data = [0, 1, 2, 3] - - psr1 = pd.Series(data, dtype="uint8") - psr2 = pd.Series(data, dtype=dtype) - - sr1 = cudf.from_pandas(psr1) - sr2 = cudf.from_pandas(psr2) - - kind = None - try: - pd.testing.assert_series_equal( - psr1, psr2, check_exact=check_exact, check_dtype=check_dtype - ) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_series_equal( - sr1, sr2, check_exact=check_exact, check_dtype=check_dtype - ) - else: - assert_series_equal( - sr1, sr2, check_exact=check_exact, check_dtype=check_dtype - ) - - -@pytest.mark.parametrize( - "dtype", - ["int8", "int16", "int32", "int64"], -) -@pytest.mark.parametrize("exact", ["equiv", True, False]) -def test_range_index_and_int_index_eqaulity(dtype, exact): - pidx1 = pd.RangeIndex(0, stop=5, step=1) - pidx2 = pd.Index([0, 1, 2, 3, 4]) - idx1 = cudf.from_pandas(pidx1) - idx2 = cudf.Index([0, 1, 2, 3, 4], dtype=dtype) - - kind = None - try: - pd.testing.assert_index_equal(pidx1, pidx2, exact=exact) - except BaseException 
as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_index_equal(idx1, idx2, exact=exact) - else: - assert_index_equal(idx1, idx2, exact=exact) - - -@pytest.mark.parametrize( - "left, right", - [ - (1493282, 1493282), - (1493282.0, 1493282.0 + 1e-8), - ("abc", "abc"), - (0, np.array(0)), - ( - np.datetime64(123456, "ns"), - pd.Timestamp(np.datetime64(123456, "ns")), - ), - ("int64", np.dtype("int64")), - (np.nan, np.nan), - ], -) -def test_basic_scalar_equality(left, right): - assert_eq(left, right) - - -@pytest.mark.parametrize( - "left, right", - [ - (1493282, 1493274), - (1493282.0, 1493282.0 + 1e-6), - ("abc", "abd"), - (0, np.array(1)), - ( - np.datetime64(123456, "ns"), - pd.Timestamp(np.datetime64(123457, "ns")), - ), - ("int64", np.dtype("int32")), - ], -) -def test_basic_scalar_inequality(left, right): - with pytest.raises(AssertionError, match=r".*not (almost )?equal.*"): - assert_eq(left, right) - - -def test_assert_column_memory_basic(arrow_arrays): - left = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - right = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - - with pytest.raises(AssertionError): - assert_column_memory_eq(left, right) - assert_column_memory_ne(left, right) - - -def test_assert_column_memory_slice(arrow_arrays): - col = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - left = col.slice(0, 1) - right = col.slice(1, 2) - - with pytest.raises(AssertionError): - assert_column_memory_eq(left, right) - assert_column_memory_ne(left, right) - - with pytest.raises(AssertionError): - assert_column_memory_eq(left, col) - assert_column_memory_ne(left, col) - - with pytest.raises(AssertionError): - assert_column_memory_eq(right, col) - assert_column_memory_ne(right, col) - - -def test_assert_column_memory_basic_same(arrow_arrays): - data = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - buf = cudf.core.buffer.as_buffer(data.base_data) - - left = cudf.core.column.build_column(buf, dtype=np.dtype(np.int8)) - right = cudf.core.column.build_column(buf, dtype=np.dtype(np.int8)) - - assert_column_memory_eq(left, right) - with pytest.raises(AssertionError): - assert_column_memory_ne(left, right) diff --git a/python/cudf/cudf/tests/general_utilities/__init__.py b/python/cudf/cudf/tests/testing/__init__.py similarity index 100% rename from python/cudf/cudf/tests/general_utilities/__init__.py rename to python/cudf/cudf/tests/testing/__init__.py diff --git a/python/cudf/cudf/tests/testing/conftest.py b/python/cudf/cudf/tests/testing/conftest.py new file mode 100644 index 00000000000..5798871dabc --- /dev/null +++ b/python/cudf/cudf/tests/testing/conftest.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pytest + + +@pytest.fixture(params=[True, False]) +def check_dtype(request): + """Argument for assert_series_equal, assert_frame_equal""" + return request.param + + +@pytest.fixture(params=[True, False]) +def check_exact(request): + """Argument for assert_series_equal, assert_frame_equal""" + return request.param + + +@pytest.fixture(params=[True, False]) +def check_datetimelike_compat(request): + """Argument for assert_series_equal, assert_frame_equal""" + return request.param + + +@pytest.fixture(params=[True, False]) +def check_names(request): + """Argument for assert_series_equal, assert_frame_equal, assert_index_equal""" + return request.param diff --git a/python/cudf/cudf/tests/testing/test_assert_column_equal.py b/python/cudf/cudf/tests/testing/test_assert_column_equal.py new file mode 100644 index 00000000000..7116c8d187e --- /dev/null +++ b/python/cudf/cudf/tests/testing/test_assert_column_equal.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pyarrow as pa +import pytest + +import cudf +from cudf.core.column.column import as_column +from cudf.testing._utils import ( + assert_column_memory_eq, + assert_column_memory_ne, +) +from cudf.testing.testing import assert_column_equal + + +@pytest.fixture( + params=[ + range(10), + ["hello", "world", "rapids", "AI"], + [[1, 2, 3], [4, 5], [6], [], [7]], + [{"f0": "hello", "f1": 42}, {"f0": "world", "f1": 3}], + ] +) +def arrow_arrays(request): + return pa.array(request.param) + + +def test_assert_column_memory_basic(arrow_arrays): + left = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) + right = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) + + with pytest.raises(AssertionError): + assert_column_memory_eq(left, right) + assert_column_memory_ne(left, right) + + +def test_assert_column_memory_slice(arrow_arrays): + col = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) + left = col.slice(0, 1) + right = col.slice(1, 2) + + with pytest.raises(AssertionError): + assert_column_memory_eq(left, right) + assert_column_memory_ne(left, right) + + with pytest.raises(AssertionError): + assert_column_memory_eq(left, col) + assert_column_memory_ne(left, col) + + with pytest.raises(AssertionError): + assert_column_memory_eq(right, col) + assert_column_memory_ne(right, col) + + +def test_assert_column_memory_basic_same(arrow_arrays): + data = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) + buf = cudf.core.buffer.as_buffer(data.base_data) + + left = cudf.core.column.build_column(buf, dtype=np.dtype(np.int8)) + right = cudf.core.column.build_column(buf, dtype=np.dtype(np.int8)) + + assert_column_memory_eq(left, right) + with pytest.raises(AssertionError): + assert_column_memory_ne(left, right) + + +@pytest.mark.parametrize( + "other_data", + [ + ["1", "2", "3"], + [[1], [2], [3]], + [{"a": 1}, {"a": 2}, {"a": 3}], + ], +) +def test_assert_column_equal_dtype_edge_cases(other_data): + # string series should be 100% different + # even when the elements are the same + base = as_column([1, 2, 3]) + other = as_column(other_data) + + # for these dtypes, the diff should always be 100% regardless of the values + with pytest.raises( + AssertionError, match=r".*values are different \(100.0 %\).*" + ): + assert_column_equal(base, other, check_dtype=False) + + # the exceptions are the empty and all null cases + assert_column_equal(base.slice(0, 0), other.slice(0, 0), check_dtype=False) + assert_column_equal(other.slice(0, 0), base.slice(0, 0), check_dtype=False) + + base = as_column(cudf.NA, 
length=len(base), dtype=base.dtype) + other = as_column(cudf.NA, length=len(other), dtype=other.dtype) + + assert_column_equal(base, other, check_dtype=False) + assert_column_equal(other, base, check_dtype=False) diff --git a/python/cudf/cudf/tests/testing/test_assert_eq.py b/python/cudf/cudf/tests/testing/test_assert_eq.py new file mode 100644 index 00000000000..c9c8a44796e --- /dev/null +++ b/python/cudf/cudf/tests/testing/test_assert_eq.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +from cudf.testing.testing import assert_eq + + +@pytest.mark.parametrize( + "left, right", + [ + (1493282, 1493282), + (1493282.0, 1493282.0 + 1e-8), + ("abc", "abc"), + (0, np.array(0)), + ( + np.datetime64(123456, "ns"), + pd.Timestamp(np.datetime64(123456, "ns")), + ), + ("int64", np.dtype("int64")), + (np.nan, np.nan), + ], +) +def test_basic_scalar_equality(left, right): + assert_eq(left, right) + + +@pytest.mark.parametrize( + "left, right", + [ + (1493282, 1493274), + (1493282.0, 1493282.0 + 1e-6), + ("abc", "abd"), + (0, np.array(1)), + ( + np.datetime64(123456, "ns"), + pd.Timestamp(np.datetime64(123457, "ns")), + ), + ("int64", np.dtype("int32")), + ], +) +def test_basic_scalar_inequality(left, right): + with pytest.raises(AssertionError, match=r".*not (almost )?equal.*"): + assert_eq(left, right) diff --git a/python/cudf/cudf/tests/testing/test_assert_frame_equal.py b/python/cudf/cudf/tests/testing/test_assert_frame_equal.py new file mode 100644 index 00000000000..719dc0b1e60 --- /dev/null +++ b/python/cudf/cudf/tests/testing/test_assert_frame_equal.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_frame_equal +from cudf.testing._utils import assert_asserters_equal + + +@pytest.fixture(params=[True, False]) +def check_like(request): + """Argument for assert_frame_equal""" + return request.param + + +@pytest.mark.parametrize( + "rdtype", [["int8", "int16", "int64"], ["int64", "int16", "int8"]] +) +@pytest.mark.parametrize("rname", [["a", "b", "c"], ["b", "c", "a"]]) +@pytest.mark.parametrize("index", [[1, 2, 3], [3, 2, 1]]) +@pytest.mark.parametrize("mismatch", [True, False]) +def test_basic_assert_frame_equal( + rdtype, + rname, + index, + check_exact, + check_dtype, + check_names, + check_like, + mismatch, +): + data = [1, 2, 1] + p_left = pd.DataFrame(index=[1, 2, 3]) + p_left["a"] = np.array(data, dtype="int8") + p_left["b"] = np.array(data, dtype="int16") + if mismatch: + p_left["c"] = np.array([1, 2, 3], dtype="int64") + else: + p_left["c"] = np.array(data, dtype="int64") + + p_right = pd.DataFrame(index=index) + for dtype, name in zip(rdtype, rname): + p_right[name] = np.array(data, dtype=dtype) + + left = cudf.from_pandas(p_left) + right = cudf.from_pandas(p_right) + + assert_asserters_equal( + pd.testing.assert_frame_equal, + assert_frame_equal, + p_left, + p_right, + left, + right, + check_exact=check_exact, + check_dtype=check_dtype, + check_names=check_names, + check_like=check_like, + ) diff --git a/python/cudf/cudf/tests/testing/test_assert_index_equal.py b/python/cudf/cudf/tests/testing/test_assert_index_equal.py new file mode 100644 index 00000000000..1b238b5df83 --- /dev/null +++ b/python/cudf/cudf/tests/testing/test_assert_index_equal.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_index_equal +from cudf.testing._utils import assert_asserters_equal + + +@pytest.fixture(params=["equiv", True, False]) +def exact(request): + """Argument for assert_index_equal""" + return request.param + + +def test_range_index_and_int_index_equality( + signed_integer_types_as_str, exact +): + pidx1 = pd.RangeIndex(0, stop=5, step=1) + pidx2 = pd.Index([0, 1, 2, 3, 4]) + idx1 = cudf.from_pandas(pidx1) + idx2 = cudf.Index([0, 1, 2, 3, 4], dtype=signed_integer_types_as_str) + + assert_asserters_equal( + pd.testing.assert_index_equal, + assert_index_equal, + pidx1, + pidx2, + idx1, + idx2, + exact=exact, + ) + + +@pytest.mark.parametrize("rdata", [3, 4], ids=["same", "different"]) +def test_multiindex_equal(rdata): + pidx1 = pd.MultiIndex.from_arrays( + [[0, 1, 2, 3], ["G", "O", "N", "E"]], names=("n", "id") + ) + pidx2 = pd.MultiIndex.from_arrays( + [[0, 1, 2, rdata], ["G", "O", "N", "E"]], names=("n", "id") + ) + + idx1 = cudf.from_pandas(pidx1) + idx2 = cudf.from_pandas(pidx2) + + assert_asserters_equal( + pd.testing.assert_index_equal, + assert_index_equal, + pidx1, + pidx2, + idx1, + idx2, + ) + + +@pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]]) +@pytest.mark.parametrize("rname", ["a", "b"]) +@pytest.mark.parametrize("check_categorical", [True, False]) +def test_basic_assert_index_equal( + rdata, + exact, + check_names, + rname, + check_categorical, + all_supported_types_as_str, +): + p_left = pd.Index([1, 2, 3], name="a", dtype=all_supported_types_as_str) + p_right = pd.Index(rdata, name=rname, dtype=all_supported_types_as_str) + + left = cudf.from_pandas(p_left) + right = cudf.from_pandas(p_right) + + assert_asserters_equal( + pd.testing.assert_index_equal, + assert_index_equal, + p_left, + p_right, + left, + right, + exact=exact, + check_names=check_names, + check_categorical=check_categorical, + ) diff --git a/python/cudf/cudf/tests/testing/test_assert_series_equal.py b/python/cudf/cudf/tests/testing/test_assert_series_equal.py new file mode 100644 index 00000000000..9d6f517c88b --- /dev/null +++ b/python/cudf/cudf/tests/testing/test_assert_series_equal.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_series_equal
+from cudf.testing._utils import assert_asserters_equal
+
+
+def test_series_different_type_cases(
+    numeric_types_as_str, check_exact, check_dtype
+):
+    data = [0, 1, 2, 3]
+
+    psr1 = pd.Series(data, dtype="uint8")
+    psr2 = pd.Series(data, dtype=numeric_types_as_str)
+
+    sr1 = cudf.from_pandas(psr1)
+    sr2 = cudf.from_pandas(psr2)
+
+    assert_asserters_equal(
+        pd.testing.assert_series_equal,
+        assert_series_equal,
+        psr1,
+        psr2,
+        sr1,
+        sr2,
+        check_exact=check_exact,
+        check_dtype=check_dtype,
+    )
+
+
+@pytest.mark.parametrize("rdata", [3, 4], ids=["same", "different"])
+def test_datetime_like_compatibility(rdata, check_datetimelike_compat):
+    psr1 = pd.Series([0, 1, 2, 3], dtype="datetime64[ns]")
+    psr2 = pd.Series([0, 1, 2, rdata], dtype="datetime64[ns]").astype("str")
+
+    sr1 = cudf.from_pandas(psr1)
+    sr2 = cudf.from_pandas(psr2)
+
+    assert_asserters_equal(
+        pd.testing.assert_series_equal,
+        assert_series_equal,
+        psr1,
+        psr2,
+        sr1,
+        sr2,
+        check_datetimelike_compat=check_datetimelike_compat,
+    )
+
+
+@pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]])
+@pytest.mark.parametrize("rname", ["a", "b"])
+@pytest.mark.parametrize("check_category_order", [True, False])
+@pytest.mark.parametrize("check_categorical", [True, False])
+def test_basic_assert_series_equal(
+    rdata,
+    rname,
+    check_names,
+    check_category_order,
+    check_categorical,
+    all_supported_types_as_str,
+):
+    p_left = pd.Series([1, 2, 3], name="a", dtype=all_supported_types_as_str)
+    p_right = pd.Series(rdata, name=rname, dtype=all_supported_types_as_str)
+
+    left = cudf.from_pandas(p_left)
+    right = cudf.from_pandas(p_right)
+
+    assert_asserters_equal(
+        pd.testing.assert_series_equal,
+        assert_series_equal,
+        p_left,
+        p_right,
+        left,
+        right,
+        check_names=check_names,
+        check_categorical=check_categorical,
+        check_category_order=check_category_order,
+    )

From 4cc4fd59e34b8560ceb21bf2e411a95f39e1d0b9 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 25 Jul 2025 08:34:37 -0700
Subject: [PATCH 012/366] Use GCC 14 in conda builds. (#19192)

conda-forge is migrating to gcc 14, so this PR updates cudf to stay aligned.
See https://github.com/rapidsai/build-planning/issues/188 Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/cudf/pull/19192 --- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/cudf/conda_build_config.yaml | 4 +- .../cudf_kafka/conda_build_config.yaml | 4 +- conda/recipes/libcudf/conda_build_config.yaml | 4 +- .../recipes/pylibcudf/conda_build_config.yaml | 4 +- cpp/CMakeLists.txt | 5 ++ cpp/examples/basic/CMakeLists.txt | 5 ++ cpp/examples/billion_rows/CMakeLists.txt | 5 ++ cpp/examples/interop/CMakeLists.txt | 5 ++ cpp/examples/nested_types/CMakeLists.txt | 5 ++ cpp/examples/parquet_io/CMakeLists.txt | 5 ++ cpp/examples/string_transforms/CMakeLists.txt | 5 ++ cpp/examples/strings/CMakeLists.txt | 5 ++ cpp/libcudf_kafka/CMakeLists.txt | 5 ++ cpp/src/io/orc/writer_impl.cu | 56 ++++++++++++++++--- cpp/src/io/parquet/writer_impl.cu | 36 ++++++++++-- cpp/src/io/utilities/output_builder.cuh | 7 +++ cpp/tests/lists/extract_tests.cpp | 18 +++++- cpp/tests/partitioning/partition_test.cpp | 28 ++++++---- .../transform/integration/assert_unary.h | 34 +++++++---- dependencies.yaml | 4 +- python/cudf/CMakeLists.txt | 5 ++ python/pylibcudf/CMakeLists.txt | 5 ++ 24 files changed, 213 insertions(+), 45 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 8be5f80c743..c536a7d662c 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -33,7 +33,7 @@ dependencies: - fastavro>=0.22.9 - flatbuffers==24.3.25 - fsspec>=0.6.0 -- gcc_linux-aarch64=13.* +- gcc_linux-aarch64=14.* - hypothesis>=6.131.7 - identify>=2.5.20 - include-what-you-use==0.24.0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 0e6f4994f83..bf9e857f6bc 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -33,7 +33,7 @@ dependencies: - fastavro>=0.22.9 - flatbuffers==24.3.25 - fsspec>=0.6.0 -- gcc_linux-64=13.* +- gcc_linux-64=14.* - hypothesis>=6.131.7 - identify>=2.5.20 - include-what-you-use==0.24.0 diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index c35ea54a784..5c68a5fefcb 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 c_stdlib: - sysroot diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index c35ea54a784..5c68a5fefcb 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 c_stdlib: - sysroot diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 5fc7c9eae1b..7ca20165585 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/pylibcudf/conda_build_config.yaml 
b/conda/recipes/pylibcudf/conda_build_config.yaml index c35ea54a784..5c68a5fefcb 100644 --- a/conda/recipes/pylibcudf/conda_build_config.yaml +++ b/conda/recipes/pylibcudf/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 c_stdlib: - sysroot diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1a158ef4e79..cb277bf22cd 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -133,6 +133,11 @@ set(CUDF_CUDA_FLAGS "") set(CUDF_CXX_DEFINITIONS "") set(CUDF_CUDA_DEFINITIONS "") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Set logging level set(LIBCUDF_LOGGING_LEVEL "INFO" diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 3ca878f1497..447c8709297 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -18,6 +18,11 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Configure your project here add_executable(basic_example src/process_csv.cpp) target_link_libraries(basic_example PRIVATE cudf::cudf) diff --git a/cpp/examples/billion_rows/CMakeLists.txt b/cpp/examples/billion_rows/CMakeLists.txt index c0de82ac85a..962ff79c537 100644 --- a/cpp/examples/billion_rows/CMakeLists.txt +++ b/cpp/examples/billion_rows/CMakeLists.txt @@ -18,6 +18,11 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) add_library(groupby_results OBJECT groupby_results.cpp) diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index 1ea9779d4cc..f974735b979 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -17,6 +17,11 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # The Arrow CMake is currently broken if the build type is not set set(CMAKE_BUILD_TYPE Release) # No need to install Arrow libs when only the final example executable is shipped. 
diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt index b5d71d3262d..e91a85d4ee0 100644 --- a/cpp/examples/nested_types/CMakeLists.txt +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -18,6 +18,11 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Configure your project here add_executable(deduplication deduplication.cpp) target_link_libraries(deduplication PRIVATE cudf::cudf) diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index 7bcd22445dd..4f520b2bdad 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -18,6 +18,11 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) target_compile_features(parquet_io_utils PRIVATE cxx_std_20) target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) diff --git a/cpp/examples/string_transforms/CMakeLists.txt b/cpp/examples/string_transforms/CMakeLists.txt index c1f3aff2e5c..90830eb2820 100644 --- a/cpp/examples/string_transforms/CMakeLists.txt +++ b/cpp/examples/string_transforms/CMakeLists.txt @@ -18,6 +18,11 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) add_executable(compute_checksum_jit compute_checksum_jit.cpp) diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 7d55d60c062..4e890118dcb 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -18,6 +18,11 @@ include(../fetch_dependencies.cmake) include(rapids-cmake) rapids_cmake_build_type("Release") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. 
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) add_executable(libcudf_apis libcudf_apis.cpp) diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 2a62e79f484..06e03bb84d4 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -29,6 +29,11 @@ project( # Set a default build type if none was specified rapids_cmake_build_type(Release) +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # ################################################################################################## # * conda environment ----------------------------------------------------------------------------- rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 5a8744f8afd..612f0b69737 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -910,7 +910,14 @@ encoded_data encode_columns(orc_table_view const& orc_table, std::vector indices; for (auto const& stripe : segmentation.stripes) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend() - 1; ++rg_idx_it) { +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif auto const& chunk = chunks[col_idx][*rg_idx_it]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif indices.push_back(chunk.start_row); indices.push_back(chunk.start_row + chunk.num_rows); } @@ -959,8 +966,15 @@ encoded_data encode_columns(orc_table_view const& orc_table, if (strm_id >= 0) { size_t stripe_size = 0; std::for_each(stripe.cbegin(), stripe.cend(), [&](auto rg_idx) { +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif auto const& ck = chunks[col_idx][rg_idx]; - auto& strm = col_streams[rg_idx]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif + auto& strm = col_streams[rg_idx]; if ((strm_type == CI_DICTIONARY) || (strm_type == CI_DATA2 && ck.encoding_kind == DICTIONARY_V2)) { @@ -995,8 +1009,15 @@ encoded_data encode_columns(orc_table_view const& orc_table, // Set offsets for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { auto const rg_idx = *rg_idx_it; - auto const& ck = chunks[col_idx][rg_idx]; - auto& strm = col_streams[rg_idx]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif + auto const& ck = chunks[col_idx][rg_idx]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif + auto& strm = col_streams[rg_idx]; if (strm_id < 0 or (strm_type == CI_DATA && streams[strm_id].length == 0 && (ck.type_kind == DOUBLE || ck.type_kind == FLOAT))) { @@ -1495,7 +1516,14 @@ void write_index_stream(int32_t stripe_id, if (stream.ids[type] > 0) { record.pos = 0; if (compression != compression_type::NONE) { - auto const& ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif + auto const& ss = 
strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif record.blk_pos = ss.first_block; record.comp_pos = 0; record.comp_size = ss.stream_size; @@ -1522,10 +1550,17 @@ void write_index_stream(int32_t stripe_id, auto kind = TypeKind::STRUCT; // TBD: Not sure we need an empty index stream for column 0 if (stream_id != 0) { +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif auto const& strm = enc_streams[column_id][0]; - present = find_record(strm, CI_PRESENT); - data = find_record(strm, CI_DATA); - data2 = find_record(strm, CI_DATA2); +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif + present = find_record(strm, CI_PRESENT); + data = find_record(strm, CI_DATA); + data2 = find_record(strm, CI_DATA2); // Change string dictionary to int from index point of view kind = columns[column_id].orc_kind(); @@ -1551,7 +1586,14 @@ void write_index_stream(int32_t stripe_id, : (&rg_stats[column_id * segmentation.num_rowgroups() + rowgroup])); if (stream_id != 0) { +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif const auto& strm = enc_streams[column_id][rowgroup]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif scan_record(strm, CI_PRESENT, present); scan_record(strm, CI_DATA, data); scan_record(strm, CI_DATA2, data2); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 20236c63344..41fdbe7e6fc 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2160,7 +2160,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto& row_group = agg_meta->file(p).row_groups[global_r]; for (auto i = 0; i < num_columns; i++) { - auto const& ck = chunks[r][i]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif + auto const& ck = chunks[r][i]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif auto const dev_bfr = ck.is_compressed ? ck.compressed_bfr : ck.uncompressed_bfr; auto& column_chunk_meta = row_group.columns[i].meta_data; @@ -2203,7 +2210,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto& row_group = agg_meta->file(p).row_groups[global_r]; for (auto i = 0; i < num_columns; i++) { - auto const& ck = chunks[r][i]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif + auto const& ck = chunks[r][i]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif auto const& col = col_desc[ck.col_desc_id]; auto& column_chunk_meta = row_group.columns[i].meta_data; @@ -2446,7 +2460,14 @@ void writer::impl::write_parquet_data_to_sink( auto& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - auto const& ck = chunks[r][i]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif + auto const& ck = chunks[r][i]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif auto const dev_bfr = ck.is_compressed ? 
ck.compressed_bfr : ck.uncompressed_bfr; // Skip the range [0, ck.ck_stat_size) since it has already been copied to host @@ -2493,7 +2514,14 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - EncColumnChunk const& ck = chunks[r][i]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif + EncColumnChunk const& ck = chunks[r][i]; +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh index 3455ba8436a..d8ce108f09d 100644 --- a/cpp/src/io/utilities/output_builder.cuh +++ b/cpp/src/io/utilities/output_builder.cuh @@ -294,8 +294,15 @@ class output_builder { */ [[nodiscard]] T back_element(rmm::cuda_stream_view stream) const { +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdangling-reference" +#endif auto const& last_nonempty_chunk = _chunks.size() > 1 and _chunks.back().is_empty() ? _chunks.rbegin()[1] : _chunks.back(); +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif return last_nonempty_chunk.back_element(stream); } diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp index 2c24f695c29..d727e2ee459 100644 --- a/cpp/tests/lists/extract_tests.cpp +++ b/cpp/tests/lists/extract_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,19 @@ * limitations under the License. */ +// The usage of the lists_column_wrapper `LCW{LCW...` syntax in this file +// causes gcc14 to throw a maybe-uninitialized warning in the copy constructor +// of column_view_base. The same usage in every other test file causes no +// issues, so it seems highly likely to be an incorrect diagnostic. +// Unfortunately, because the warning is in an included file, neither inserting +// ignore pragmas around just the includes nor just around the calling code +// below that uses that syntax (of which there is a decent amount) seems to be +// sufficient to make the compiler happy, so for now the easiest option is to +// ignore the warning for the entire file. +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif #include #include #include @@ -423,3 +436,6 @@ TEST_F(ListsExtractColumnIndicesTest, ExtractStrings) } CUDF_TEST_PROGRAM_MAIN() +#if defined(__GNUC__) && (__GNUC__ >= 14) +#pragma GCC diagnostic pop +#endif diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index 8ea224eb9fc..da54f2233a0 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,6 +25,11 @@ #include #include +#include +#include + +#include + template class PartitionTest : public cudf::test::BaseFixture { using value_type = cudf::test::GetType; @@ -100,15 +105,18 @@ void expect_equal_partitions(cudf::table_view expected, // Split the partitions, sort each partition, then compare for equality auto actual_split = cudf::split(actual, split_points); auto expected_split = cudf::split(expected, split_points); - std::equal(expected_split.begin(), - expected_split.end(), - actual_split.begin(), - [](cudf::table_view expected, cudf::table_view actual) { - auto sorted_expected = cudf::sort(expected); - auto sorted_actual = cudf::sort(actual); - CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_expected, *sorted_actual); - return true; - }); + + auto begin = + thrust::make_zip_iterator(cuda::std::make_tuple(expected_split.begin(), actual_split.begin())); + auto end = + thrust::make_zip_iterator(cuda::std::make_tuple(expected_split.end(), actual_split.end())); + + std::for_each(begin, end, [](auto const& zipped) { + auto [expected, actual] = zipped; + auto sorted_expected = cudf::sort(expected); + auto sorted_actual = cudf::sort(actual); + CUDF_TEST_EXPECT_TABLES_EQUAL(*sorted_expected, *sorted_actual); + }); } void run_partition_test(cudf::table_view table_to_partition, diff --git a/cpp/tests/transform/integration/assert_unary.h b/cpp/tests/transform/integration/assert_unary.h index 98dc5d1a240..8aa23a03055 100644 --- a/cpp/tests/transform/integration/assert_unary.h +++ b/cpp/tests/transform/integration/assert_unary.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,11 @@ #include #include +#include +#include + +#include + namespace transformation { template void ASSERT_UNARY(cudf::column_view const& out, cudf::column_view const& in, TypeOpe&& ope) @@ -31,21 +36,28 @@ void ASSERT_UNARY(cudf::column_view const& out, cudf::column_view const& in, Typ ASSERT_TRUE(out_data.size() == in_data.size()); - auto data_comparator = [ope](TypeIn const& in, TypeOut const& out) { - EXPECT_EQ(out, static_cast(ope(in))); - return true; - }; - std::equal(in_data.begin(), in_data.end(), out_data.begin(), data_comparator); + auto begin = thrust::make_zip_iterator(cuda::std::make_tuple(in_data.begin(), out_data.begin())); + auto end = thrust::make_zip_iterator(cuda::std::make_tuple(in_data.end(), out_data.end())); + + std::for_each(begin, end, [ope](auto const& zipped) { + auto [in_val, out_val] = zipped; + EXPECT_EQ(out_val, static_cast(ope(in_val))); + }); auto in_valid = in_h.second; auto out_valid = out_h.second; ASSERT_TRUE(out_valid.size() == in_valid.size()); - auto valid_comparator = [](bool const& in, bool const& out) { - EXPECT_EQ(out, in); - return true; - }; - std::equal(in_valid.begin(), in_valid.end(), out_valid.begin(), valid_comparator); + + auto valid_begin = + thrust::make_zip_iterator(cuda::std::make_tuple(in_valid.begin(), out_valid.begin())); + auto valid_end = + thrust::make_zip_iterator(cuda::std::make_tuple(in_valid.end(), out_valid.end())); + + std::for_each(valid_begin, valid_end, [](auto const& zipped) { + auto [in_flag, out_flag] = zipped; + EXPECT_EQ(out_flag, in_flag); + }); } } // namespace transformation diff --git a/dependencies.yaml b/dependencies.yaml index 8246be146f3..dfcfb40d7a9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -434,13 +434,13 @@ dependencies: arch: x86_64 cuda: "12.*" packages: - - gcc_linux-64=13.* + - gcc_linux-64=14.* - sysroot_linux-64==2.28 - matrix: arch: aarch64 cuda: "12.*" packages: - - gcc_linux-aarch64=13.* + - gcc_linux-aarch64=14.* - sysroot_linux-aarch64==2.28 - output_types: conda matrices: diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index b3f3f0bf56c..530e50d2075 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -18,6 +18,11 @@ include(../../cmake/rapids_config.cmake) include(rapids-cuda) rapids_cuda_init_architectures(cudf-python) +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + project( cudf-python VERSION "${RAPIDS_VERSION}" diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt index ac0bbb82df5..aad128cb695 100644 --- a/python/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/CMakeLists.txt @@ -18,6 +18,11 @@ include(../../cmake/rapids_config.cmake) include(rapids-cuda) rapids_cuda_init_architectures(pylibcudf) +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. 
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF)
+
 project(
   pylibcudf
   VERSION "${RAPIDS_VERSION}"

From 84149b11ac65516a6fd9cf05b2592fc95ad441f8 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Fri, 25 Jul 2025 13:58:10 -0400
Subject: [PATCH 013/366] Update build infra to support new branching strategy
 (#19445)

rapids_config now reads the contents of `RAPIDS_BRANCH` to determine which
rapids-cmake branch to check out.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/19445
---
 RAPIDS_BRANCH             |  1 +
 cmake/RAPIDS.cmake        |  6 +++---
 cmake/rapids_config.cmake | 10 ++++++++++
 3 files changed, 14 insertions(+), 3 deletions(-)
 create mode 100644 RAPIDS_BRANCH

diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH
new file mode 100644
index 00000000000..9b1c52d9415
--- /dev/null
+++ b/RAPIDS_BRANCH
@@ -0,0 +1 @@
+branch-25.10
diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake
index d112951d3c1..40de7cefcd2 100644
--- a/cmake/RAPIDS.cmake
+++ b/cmake/RAPIDS.cmake
@@ -18,9 +18,9 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR)
 
 # Allow users to control which version is used
-if(NOT rapids-cmake-version OR NOT rapids-cmake-version MATCHES [[^([0-9][0-9])\.([0-9][0-9])$]])
+if(NOT rapids-cmake-branch OR NOT rapids-cmake-version)
   message(
-    FATAL_ERROR "The CMake variable rapids-cmake-version must be defined in the format MAJOR.MINOR."
+    FATAL_ERROR "The CMake variable `rapids-cmake-branch` or `rapids-cmake-version` must be defined"
   )
 endif()
@@ -33,7 +33,7 @@ endif()
 # Allow users to control which branch is fetched
 if(NOT rapids-cmake-branch)
   # Define a default branch if the user doesn't set one
-  set(rapids-cmake-branch "branch-${rapids-cmake-version}")
+  set(rapids-cmake-branch "release/${rapids-cmake-version}")
 endif()
 
 # Allow users to control the exact URL passed to FetchContent
diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake
index abe468dce80..b706c926e7a 100644
--- a/cmake/rapids_config.cmake
+++ b/cmake/rapids_config.cmake
@@ -26,5 +26,15 @@ else()
   )
 endif()
 
+# Use STRINGS to trim whitespace/newlines
+file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH" _rapids_branch)
+if(NOT _rapids_branch)
+  message(
+    FATAL_ERROR
+      "Could not determine branch name to use for checking out rapids-cmake. The file \"${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH\" is missing."
+  )
+endif()
+
 set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}")
+set(rapids-cmake-branch "${_rapids_branch}")
 include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake")

From fb80addb3a84d6a811d3169855852062e3e8a326 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Mon, 28 Jul 2025 09:53:06 -0400
Subject: [PATCH 014/366] Implement top k expression in cudf-polars using
 `cudf::top_k` (#19431)

Now that we have #19303, we can implement top k directly as a unary
expression instead of translating the polars IR into a `Sort` + `Slice`.
This also has the benefit of getting a free performance improvement once
`cudf::top_k` uses CUB's implementation.
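Concretely, the expression now bottoms out in a single pylibcudf call, where
the translator previously rewrote the node into a full `Sort` wrapped in a
`Slice`. A minimal sketch against the new binding (building the input column
via `plc.interop.from_arrow` is just one convenient option, not part of this
patch):

    import pyarrow as pa

    import pylibcudf as plc

    # Build a device column from host data.
    col = plc.interop.from_arrow(pa.array([5, 1, 9, 3, 7]))

    # Three largest values. Order.ASCENDING selects the three smallest
    # instead, which the expression uses when reverse=True.
    top3 = plc.sorting.top_k(col, 3, plc.types.Order.DESCENDING)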
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19431 --- .../cudf_polars/dsl/expressions/unary.py | 19 ++++- .../cudf_polars/cudf_polars/dsl/translate.py | 11 --- .../pylibcudf/pylibcudf/libcudf/sorting.pxd | 65 ++++++++++------- python/pylibcudf/pylibcudf/sorting.pxd | 6 +- python/pylibcudf/pylibcudf/sorting.pyi | 8 +++ python/pylibcudf/pylibcudf/sorting.pyx | 69 ++++++++++++++++++- 6 files changed, 140 insertions(+), 38 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index f55e0b9aeed..9147e4a6dbe 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar, cast import pylibcudf as plc @@ -111,6 +111,7 @@ class UnaryFunction(Expr): "unique", "value_counts", "null_count", + "top_k", } ) _supported_cum_aggs = frozenset( @@ -140,6 +141,7 @@ def __init__( "cum_sum", "drop_nulls", "unique", + "top_k", ) if self.name not in UnaryFunction._supported_fns: @@ -339,6 +341,21 @@ def do_evaluate( ), dtype=self.dtype, ) + elif self.name == "top_k": + (column, k) = ( + child.evaluate(df, context=context) for child in self.children + ) + (reverse,) = self.options + return Column( + plc.sorting.top_k( + column.obj, + cast(Literal, self.children[1]).value, + plc.types.Order.ASCENDING + if reverse + else plc.types.Order.DESCENDING, + ), + dtype=self.dtype, + ) elif self.name in self._OP_MAPPING: column = self.children[0].evaluate(df, context=context) if column.obj.type().id() != self.dtype.id(): diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index ab29752b8b9..50e8dd8690f 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -675,17 +675,6 @@ def _( ) elif name == "pow": return expr.BinOp(dtype, plc.binaryop.BinaryOperator.POW, *children) - elif name in "top_k": - (col, k) = children - assert isinstance(k, expr.Literal) - (descending,) = options - return expr.Slice( - dtype, - 0, - k.value, - expr.Sort(dtype, (False, True, not descending), col), - ) - return expr.UnaryFunction(dtype, name, options, *children) raise NotImplementedError( f"No handler for Expr function node with {name=}" diff --git a/python/pylibcudf/pylibcudf/libcudf/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd index 342545a0eec..a5fc13dc90e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/sorting.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd @@ -1,5 +1,5 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -cimport pylibcudf.libcudf.types as libcudf_types +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
+ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -9,73 +9,92 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport ( + order, + null_order, + null_policy, + null_order, + size_type +) cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: cdef unique_ptr[column] sorted_order( table_view source_table, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence + vector[order] column_order, + vector[null_order] null_precedence ) except +libcudf_exception_handler cdef unique_ptr[column] stable_sorted_order( table_view source_table, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence + vector[order] column_order, + vector[null_order] null_precedence ) except +libcudf_exception_handler cdef unique_ptr[column] rank( column_view input_view, rank_method method, - libcudf_types.order column_order, - libcudf_types.null_policy null_handling, - libcudf_types.null_order null_precedence, + order column_order, + null_policy null_handling, + null_order null_precedence, bool percentage) except +libcudf_exception_handler cdef bool is_sorted( const table_view& table, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence + vector[order] column_order, + vector[null_order] null_precedence ) except +libcudf_exception_handler cdef unique_ptr[table] segmented_sort_by_key( const table_view& values, const table_view& keys, const column_view& segment_offsets, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence + vector[order] column_order, + vector[null_order] null_precedence ) except +libcudf_exception_handler cdef unique_ptr[table] stable_segmented_sort_by_key( const table_view& values, const table_view& keys, const column_view& segment_offsets, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence + vector[order] column_order, + vector[null_order] null_precedence ) except +libcudf_exception_handler cdef unique_ptr[table] sort_by_key( const table_view& values, const table_view& keys, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence + vector[order] column_order, + vector[null_order] null_precedence ) except +libcudf_exception_handler cdef unique_ptr[table] stable_sort_by_key( const table_view& values, const table_view& keys, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence + vector[order] column_order, + vector[null_order] null_precedence ) except +libcudf_exception_handler cdef unique_ptr[table] sort( table_view source_table, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence + vector[order] column_order, + vector[null_order] null_precedence ) except +libcudf_exception_handler cdef unique_ptr[table] stable_sort( table_view source_table, - vector[libcudf_types.order] column_order, - vector[libcudf_types.null_order] null_precedence + vector[order] column_order, + vector[null_order] null_precedence + ) except +libcudf_exception_handler + + cdef unique_ptr[column] top_k( + const column_view& col, + size_type k, + order sort_order, + ) except +libcudf_exception_handler + + cdef unique_ptr[column] top_k_order( + const 
column_view& col, + size_type k, + order sort_order, ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/sorting.pxd b/python/pylibcudf/pylibcudf/sorting.pxd index 8127ab21ad1..91f8354f965 100644 --- a/python/pylibcudf/pylibcudf/sorting.pxd +++ b/python/pylibcudf/pylibcudf/sorting.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool from pylibcudf.libcudf.aggregation cimport rank_method @@ -60,3 +60,7 @@ cpdef Table stable_sort_by_key( cpdef Table sort(Table source_table, list column_order, list null_precedence) cpdef Table stable_sort(Table source_table, list column_order, list null_precedence) + +cpdef Column top_k(Column col, size_type k, order sort_order = *) + +cpdef Column top_k_order(Column col, size_type k, order sort_order = *) diff --git a/python/pylibcudf/pylibcudf/sorting.pyi b/python/pylibcudf/pylibcudf/sorting.pyi index 5255d869a4d..07ad962d0ce 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyi +++ b/python/pylibcudf/pylibcudf/sorting.pyi @@ -62,3 +62,11 @@ def stable_sort( column_order: list[Order], null_precedence: list[NullOrder], ) -> Table: ... +def top_k( + col: Column, + k: int, + sort_order: Order = Order.DESCENDING, +) -> Column: ... +def top_k_order( + col: Column, k: int, sort_order: Order = Order.DESCENDING +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index fb29ef8c571..31efc018d6d 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -7,7 +7,7 @@ from pylibcudf.libcudf cimport sorting as cpp_sorting from pylibcudf.libcudf.aggregation cimport rank_method from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.types cimport null_order, null_policy, order +from pylibcudf.libcudf.types cimport null_order, null_policy, order, size_type from .column cimport Column from .table cimport Table @@ -393,3 +393,68 @@ cpdef Table stable_sort(Table source_table, list column_order, list null_precede c_null_precedence, ) return Table.from_libcudf(move(c_result)) + + +cpdef Column top_k(Column col, size_type k, order sort_order = order.DESCENDING): + """ + Computes the top-k values of a column. + + For details, see :cpp:func:`top_k`. + + Parameters + ---------- + col : Column + The input column. + k : int + The number of top values to retrieve. + sort_order : Order, default DESCENDING + The desired order of the top values. If ASCENDING, the smallest `k` values + are returned. If DESCENDING, the largest `k` values are returned. + + Returns + ------- + Column + A column of the top ``k`` elements from the input. + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_sorting.top_k( + col.view(), + k, + sort_order, + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column top_k_order(Column col, size_type k, order sort_order = order.DESCENDING): + """ + Computes the indices of the top-k values of a column. + + This returns the row indices of the top-k elements. + + For details, see :cpp:func:`top_k_order`. + + Parameters + ---------- + col : Column + The input column. + k : int + The number of top values to retrieve. + sort_order : Order, default DESCENDING + The desired order of the top values. 
If ASCENDING, the indices of the smallest
+        `k` values are returned. If DESCENDING, the indices of the largest `k` values
+        are returned.
+
+    Returns
+    -------
+    Column
+        A column of the indices of the top ``k`` elements.
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_sorting.top_k_order(
+            col.view(),
+            k,
+            sort_order,
+        )
+    return Column.from_libcudf(move(c_result))

From 62ab698383b83b4fe56a27063d2bdd43b8447d63 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 28 Jul 2025 10:27:30 -0700
Subject: [PATCH 015/366] Don't run serial cudf_pandas tests when testing
 multiple pandas versions (#19507)

`test_rmm_option_on_import`, which was marked as `serial`
(https://github.com/rapidsai/cudf/pull/19345), still appears to fail flakily,
e.g. https://github.com/rapidsai/cudf/actions/runs/16531500545/job/46759205034?pr=19506.
It appears that when testing `cudf_pandas` tests with different pandas
versions, we also need to add `-m "not serial"`.

Additionally, this changes `--numprocesses=1` to `--numprocesses=0` so we
don't incur the overhead of running these tests in a separate, single process.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/19507
---
 ci/cudf_pandas_scripts/run_tests.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index e953f7be090..439645a0add 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -116,7 +116,7 @@ python -m pytest -p cudf.pandas \
 # More details: https://github.com/rapidsai/cudf/pull/16930#issuecomment-2707873968
 python -m pytest -p cudf.pandas \
     --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \
-    --numprocesses=1 \
+    --numprocesses=0 \
     -k "profiler" \
     ./python/cudf/cudf_pandas_tests/

@@ -133,15 +133,19 @@ for version in "${versions[@]}"; do
         --numprocesses=8 \
         --dist=worksteal \
         -k "not profiler" \
+        -m "not serial" \
         --cov-config=./python/cudf/.coveragerc \
         --cov=cudf \
         --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \
         --cov-report=term \
         ./python/cudf/cudf_pandas_tests/

+    # NOTE: We don't currently run serial tests (only 1 as of 2025-07-25)
+    # with multiple versions of pandas.
+
     python -m pytest -p cudf.pandas \
         --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \
-        --numprocesses=1 \
+        --numprocesses=0 \
         -k "profiler" \
         ./python/cudf/cudf_pandas_tests/
 done

From a6de6709336ecf199170bb47404240faf0cda78f Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Mon, 28 Jul 2025 17:31:33 -0400
Subject: [PATCH 016/366] Improve readability when printing pylibcudf enums
 (#19451)

This PR improves readability when printing public enums in pylibcudf by
setting `__str__ = __repr__`.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/19451
---
 docs/cudf/source/pylibcudf/developer_docs.md       | 16 ++++++++++++++++
 python/pylibcudf/pylibcudf/aggregation.pyx         |  8 ++++++++
 python/pylibcudf/pylibcudf/binaryop.pyx            |  4 +++-
 python/pylibcudf/pylibcudf/copying.pyx             |  9 ++++++++-
 python/pylibcudf/pylibcudf/datetime.pyx            |  3 +++
 python/pylibcudf/pylibcudf/expressions.pyx         |  4 ++++
 python/pylibcudf/pylibcudf/io/json.pyx             |  4 ++++
 python/pylibcudf/pylibcudf/io/types.pyx            |  6 ++++++
 python/pylibcudf/pylibcudf/labeling.pyx            |  4 +++-
 python/pylibcudf/pylibcudf/lists.pyx               |  3 +++
 python/pylibcudf/pylibcudf/reduce.pyx              |  4 +++-
 python/pylibcudf/pylibcudf/replace.pyx             |  4 +++-
 python/pylibcudf/pylibcudf/round.pyx               |  4 +++-
 python/pylibcudf/pylibcudf/stream_compaction.pyx   |  4 +++-
 python/pylibcudf/pylibcudf/strings/char_types.pyx  |  4 +++-
 python/pylibcudf/pylibcudf/strings/combine.pyx     |  5 ++++-
 python/pylibcudf/pylibcudf/strings/regex_flags.pyx |  4 +++-
 python/pylibcudf/pylibcudf/strings/side_type.pyx   |  4 +++-
 python/pylibcudf/pylibcudf/strings/translate.pyx   |  4 +++-
 python/pylibcudf/pylibcudf/types.pyx               | 11 +++++++++++
 python/pylibcudf/pylibcudf/unary.pyx               |  4 +++-
 21 files changed, 100 insertions(+), 13 deletions(-)

diff --git a/docs/cudf/source/pylibcudf/developer_docs.md b/docs/cudf/source/pylibcudf/developer_docs.md
index e33c0b9834c..105dbb61a1b 100644
--- a/docs/cudf/source/pylibcudf/developer_docs.md
+++ b/docs/cudf/source/pylibcudf/developer_docs.md
@@ -225,6 +225,22 @@ from pylibcudf.libcudf.copying import \
     out_of_bounds_policy as OutOfBoundsPolicy  # no-cython-lint
 ```

+### Enum string representations
+
+By default, Cython's `cpdef enum class` generates a valid Python `Enum` type for
+each C++ enum. However, the default `__str__` implementation for these enums is not
+very informative. It returns the underlying value (e.g., `11` instead of `<TypeId.BOOL8: 11>`).
+
+To improve the developer experience, we manually set `__str__ = __repr__` for all public
+enums. This ensures that printing an enum from Python returns a meaningful name like:
+
+```python
+>>> from pylibcudf.types import TypeId
+>>> print(TypeId.INT32)
+<TypeId.INT32: 3>
+```
+
+
 ### Handling overloaded functions in libcudf

 As a C++ library, libcudf makes extensive use of function overloading. For
 example, both of the following functions exist in libcudf:

diff --git a/python/pylibcudf/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx
index 123754fd1ce..9bef36e5c06 100644
--- a/python/pylibcudf/pylibcudf/aggregation.pyx
+++ b/python/pylibcudf/pylibcudf/aggregation.pyx
@@ -917,3 +917,11 @@ cpdef bool is_valid_aggregation(DataType source, Aggregation agg):
         True if the aggregation is supported.
""" return cpp_is_valid_aggregation(source.c_obj, agg.kind()) + +Kind.__str__ = Kind.__repr__ +BitwiseOp.__str__ = BitwiseOp.__repr__ +CorrelationType.__str__ = CorrelationType.__repr__ +EWMHistory.__str__ = EWMHistory.__repr__ +RankMethod.__str__ = RankMethod.__repr__ +RankPercentage.__str__ = RankPercentage.__repr__ +UdfType.__str__ = UdfType.__repr__ diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index b7b4ecc6e83..c6827431646 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from cython.operator import dereference @@ -114,3 +114,5 @@ cpdef bool is_supported_operation( rhs.c_obj, op ) + +BinaryOperator.__str__ = BinaryOperator.__repr__ diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index fb8b6f9890e..3b0ba0d9555 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from cython.operator import dereference @@ -29,6 +29,8 @@ from pylibcudf.libcudf.copying import \ mask_allocation_policy as MaskAllocationPolicy # no-cython-lint from pylibcudf.libcudf.copying import \ out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint +from pylibcudf.libcudf.copying import \ + sample_with_replacement as SampleWithReplacement # no-cython-lint from .column cimport Column from .scalar cimport Scalar @@ -39,6 +41,7 @@ from .utils cimport _as_vector __all__ = [ "MaskAllocationPolicy", "OutOfBoundsPolicy", + "SampleWithReplacement", "allocate_like", "boolean_mask_scatter", "copy_if_else", @@ -589,3 +592,7 @@ cpdef Scalar get_element(Column input_column, size_type index): c_output = cpp_copying.get_element(input_column.view(), index) return Scalar.from_libcudf(move(c_output)) + +OutOfBoundsPolicy.__str__ = OutOfBoundsPolicy.__repr__ +MaskAllocationPolicy.__str__ = MaskAllocationPolicy.__repr__ +SampleWithReplacement.__str__ = SampleWithReplacement.__repr__ diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 15aee4c3e9e..da736755848 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -299,3 +299,6 @@ cpdef Column days_in_month(Column input): with nogil: result = cpp_days_in_month(input.view()) return Column.from_libcudf(move(result)) + +DatetimeComponent.__str__ = DatetimeComponent.__repr__ +RoundingFrequency.__str__ = RoundingFrequency.__repr__ diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx index 22586eee6d1..ec4db948126 100644 --- a/python/pylibcudf/pylibcudf/expressions.pyx +++ b/python/pylibcudf/pylibcudf/expressions.pyx @@ -480,3 +480,7 @@ def to_expression(str expr, tuple column_names): {name: ColumnReference(i) for i, name in enumerate(column_names)} ) return visitor.visit(ast.parse(expr)) + + +ASTOperator.__str__ = ASTOperator.__repr__ +TableReference.__str__ = TableReference.__repr__ diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index b91d011adcd..97ab5faf9a0 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -34,6 +34,8 @@ from pylibcudf.libcudf.io.types cimport ( table_with_metadata, ) +from pylibcudf.libcudf.io.json import json_recovery_mode_t as 
JsonRecoveryModeType # no-cython-lint + from pylibcudf.libcudf.types cimport data_type, size_type from pylibcudf.libcudf.column.column cimport column, column_contents @@ -1087,3 +1089,5 @@ cpdef bool is_supported_write_json(DataType type): For details, see :cpp:func:`is_supported_write_json`. """ return cpp_is_supported_write_json(type.c_obj) + +JsonRecoveryModeType.__str__ = JsonRecoveryModeType.__repr__ diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 659f3a948ab..af57af6e694 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -661,3 +661,9 @@ cdef class SinkInfo: self.c_obj = sink_info(paths) __hash__ = None + +ColumnEncoding.__str__ = ColumnEncoding.__repr__ +CompressionType.__str__ = CompressionType.__repr__ +DictionaryPolicy.__str__ = DictionaryPolicy.__repr__ +QuoteStyle.__str__ = QuoteStyle.__repr__ +StatisticsFreq.__str__ = StatisticsFreq.__repr__ diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index cae1830f6b9..0d93463cc7e 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -53,3 +53,5 @@ cpdef Column label_bins( ) return Column.from_libcudf(move(c_result)) + +Inclusive.__str__ = Inclusive.__repr__ diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index d7e237ac474..eee5a43f6a8 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -677,3 +677,6 @@ cpdef Column distinct(Column input, null_equality nulls_equal, nan_equality nans nans_equal, ) return Column.from_libcudf(move(c_result)) + +ConcatenateNullPolicy.__str__ = ConcatenateNullPolicy.__repr__ +DuplicateFindOption.__str__ = DuplicateFindOption.__repr__ diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index 1d6ffd9de10..1fa10dcd376 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport unique_ptr @@ -102,3 +102,5 @@ cpdef tuple minmax(Column col): Scalar.from_libcudf(move(result.first)), Scalar.from_libcudf(move(result.second)), ) + +ScanType.__str__ = ScanType.__repr__ diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index 51be2b29277..d84f814a5ee 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from cython.operator import dereference @@ -200,3 +200,5 @@ cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False): if not inplace: return Column.from_libcudf(move(c_result)) + +ReplacePolicy.__str__ = ReplacePolicy.__repr__ diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index 09e5a9cc3bc..024cf47a224 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -47,3 +47,5 @@ cpdef Column round( ) return Column.from_libcudf(move(c_result)) + +RoundingMethod.__str__ = RoundingMethod.__repr__ diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index 6e403ca1b07..8f308e3b29e 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -327,3 +327,5 @@ cpdef size_type distinct_count( return cpp_stream_compaction.distinct_count( source.view(), null_handling, nan_handling ) + +DuplicateKeepOption.__str__ = DuplicateKeepOption.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index 0af4a1f9c37..119daa911f6 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -92,3 +92,5 @@ cpdef Column filter_characters_of_type( ) return Column.from_libcudf(move(c_result)) + +StringCharacterTypes.__str__ = StringCharacterTypes.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index dc1e72c799b..da78c81c0c0 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column @@ -228,3 +228,6 @@ cpdef Column join_list_elements( else: raise ValueError("separator must be a Column or a Scalar") return Column.from_libcudf(move(c_result)) + +OutputIfEmptyList.__str__ = OutputIfEmptyList.__repr__ +SeparatorOnNulls.__str__ = SeparatorOnNulls.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx index 65b504e0dc7..4395c9595bb 100644 --- a/python/pylibcudf/pylibcudf/strings/regex_flags.pyx +++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx @@ -1,6 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.libcudf.strings.regex_flags import \ regex_flags as RegexFlags # no-cython-lint __all__ = ["RegexFlags"] + +RegexFlags.__str__ = RegexFlags.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index 87db4206a9c..1b3c7336ef0 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,5 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint __all__ = ["SideType"] + +SideType.__str__ = SideType.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index ba1e8dc5d27..9edb6a3a76f 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -117,3 +117,5 @@ cpdef Column filter_characters( dereference(c_replacement), ) return Column.from_libcudf(move(c_result)) + +FilterType.__str__ = FilterType.__repr__ diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx index 5e269f73f8b..6e3eb19be1a 100644 --- a/python/pylibcudf/pylibcudf/types.pyx +++ b/python/pylibcudf/pylibcudf/types.pyx @@ -305,3 +305,14 @@ def _from_arrow(obj: pa.DataType) -> DataType: SIZE_TYPE = DataType(type_to_id[size_type]()) SIZE_TYPE_ID = SIZE_TYPE.id() + +TypeId.__str__ = TypeId.__repr__ +NanPolicy.__str__ = NanPolicy.__repr__ +NullPolicy.__str__ = NullPolicy.__repr__ +Interpolation.__str__ = Interpolation.__repr__ +MaskState.__str__ = MaskState.__repr__ +NanEquality.__str__ = NanEquality.__repr__ +NullEquality.__str__ = NullEquality.__repr__ +NullOrder.__str__ = NullOrder.__repr__ +Order.__str__ = Order.__repr__ +Sorted.__str__ = Sorted.__repr__ diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx index b738ab53d1b..3915ed8274a 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -184,3 +184,5 @@ cpdef bool is_supported_cast(DataType from_, DataType to): """ with nogil: return cpp_unary.is_supported_cast(from_.c_obj, to.c_obj) + +UnaryOperator.__str__ = UnaryOperator.__repr__ From 1eba62d3e3749fdc165dcd9c8d270a54e77f3458 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 28 Jul 2025 20:08:33 -0400 Subject: [PATCH 017/366] Prefer `Column.astype` over `plc.unary.cast` in the fill null unary function expression (#19479) Contributes to https://github.com/rapidsai/cudf/issues/19476 by replacing the direct call to `plc.unary.cast` with `Column.astype` in the `fill_null` unary function expression. 
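For context, a minimal sketch of a query that exercises this code path
(assuming the Polars GPU engine is available): the integer fill value `0` has
a different dtype than the Float64 column, so it must be cast before
`replace_nulls` runs, and that cast now goes through `Column.astype` rather
than a raw `plc.unary.cast`:

```python
import polars as pl

# Fill value dtype (int) differs from the column dtype (Float64), so the
# engine casts the fill value to the column dtype before replacing nulls.
q = pl.LazyFrame({"a": [None, 1.5, 2.5]}).select(pl.col("a").fill_null(0))
print(q.collect(engine="gpu"))
```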
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/19479 --- .../cudf_polars/dsl/expressions/unary.py | 46 ++++++++----------- .../cudf_polars/dsl/utils/aggregations.py | 8 ++++ .../cudf_polars/cudf_polars/testing/plugin.py | 2 - .../cudf_polars/cudf_polars/utils/dtypes.py | 10 ++-- python/cudf_polars/tests/test_drop_nulls.py | 7 ++- python/cudf_polars/tests/test_groupby.py | 8 ++++ 6 files changed, 48 insertions(+), 33 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 9147e4a6dbe..64c4cae2378 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -9,7 +9,7 @@ import pylibcudf as plc -from cudf_polars.containers import Column, DataType +from cudf_polars.containers import Column from cudf_polars.dsl.expressions.base import ExecutionContext, Expr from cudf_polars.dsl.expressions.literal import Literal from cudf_polars.utils import dtypes @@ -311,19 +311,20 @@ def do_evaluate( column = self.children[0].evaluate(df, context=context) if column.null_count == 0: return column - if isinstance(self.children[1], Literal): - arg = plc.Scalar.from_py( - self.children[1].value, self.children[1].dtype.plc - ) + fill_value = self.children[1] + if isinstance(fill_value, Literal): + arg = plc.Scalar.from_py(fill_value.value, fill_value.dtype.plc) else: - evaluated = self.children[1].evaluate(df, context=context) + evaluated = fill_value.evaluate(df, context=context) arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj if isinstance(arg, plc.Scalar) and dtypes.can_cast( - column.obj.type(), arg.type() + column.dtype.plc, arg.type() ): # pragma: no cover - arg = plc.unary.cast( - plc.Column.from_scalar(arg, 1), column.obj.type() - ).to_scalar() + arg = ( + Column(plc.Column.from_scalar(arg, 1), dtype=fill_value.dtype) + .astype(column.dtype) + .obj.to_scalar() + ) return Column(plc.replace.replace_nulls(column.obj, arg), dtype=self.dtype) elif self.name == "as_struct": children = [ @@ -358,7 +359,7 @@ def do_evaluate( ) elif self.name in self._OP_MAPPING: column = self.children[0].evaluate(df, context=context) - if column.obj.type().id() != self.dtype.id(): + if column.dtype.plc.id() != self.dtype.id(): arg = plc.unary.cast(column.obj, self.dtype.plc) else: arg = column.obj @@ -369,7 +370,7 @@ def do_evaluate( elif self.name in UnaryFunction._supported_cum_aggs: column = self.children[0].evaluate(df, context=context) plc_col = column.obj - col_type = column.obj.type() + col_type = column.dtype.plc # cum_sum casts # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention # Bool -> UInt32 @@ -380,26 +381,19 @@ def do_evaluate( self.name == "cum_sum" and col_type.id() in { - plc.types.TypeId.INT8, - plc.types.TypeId.UINT8, - plc.types.TypeId.INT16, - plc.types.TypeId.UINT16, + plc.TypeId.INT8, + plc.TypeId.UINT8, + plc.TypeId.INT16, + plc.TypeId.UINT16, } ) or ( self.name == "cum_prod" and plc.traits.is_integral(col_type) and plc.types.size_of(col_type) <= 4 ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.INT64) - ) - elif ( - self.name == "cum_sum" - and column.obj.type().id() == plc.types.TypeId.BOOL8 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.UINT32) - ) + plc_col = plc.unary.cast(plc_col, 
plc.DataType(plc.TypeId.INT64)) + elif self.name == "cum_sum" and column.dtype.plc.id() == plc.TypeId.BOOL8: + plc_col = plc.unary.cast(plc_col, plc.DataType(plc.TypeId.UINT32)) if self.name == "cum_sum": agg = plc.aggregation.sum() elif self.name == "cum_prod": diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py index fef9ddfb428..a8280c4c3bc 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py @@ -151,6 +151,14 @@ def decompose_single_agg( # agg. replace_nulls(col, 0, is_top=is_top), ) + elif agg.name == "mean": + post_agg_col: expr.Expr = expr.Col( + DataType(pl.Float64), name + ) # libcudf promotes to float64 + if agg.dtype.plc.id() == plc.TypeId.FLOAT32: + # Cast back to float32 to match Polars + post_agg_col = expr.Cast(agg.dtype, post_agg_col) + return [(named_expr, True)], named_expr.reconstruct(post_agg_col) else: return [(named_expr, True)], named_expr.reconstruct( expr.Col(agg.dtype, name) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index e5aca6766e2..f1d7820c809 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -137,7 +137,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "https://github.com/rapidsai/cudf/issues/19408", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_shorthand_quantile": "libcudf quantiles are round to nearest ties to even, polars quantiles are round to nearest ties away from zero", - "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", @@ -162,7 +161,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_cse.py::test_nested_cache_no_panic_16553": "Needs https://github.com/rapidsai/cudf/issues/18630", - "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", "tests/unit/test_predicates.py::test_predicate_pushdown_split_pushable": "Casting that raises not supported on GPU", "tests/unit/io/test_scan.py::test_async_read_21945[scan_type0]": "Debug output on stderr doesn't match", diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 350aae54238..a38652f84d9 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ 
b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -21,18 +21,20 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: """ - Can we cast (via :func:`~.pylibcudf.unary.cast`) between two datatypes. + Determine whether a cast between two datatypes is supported by cudf-polars. Parameters ---------- - from_ + from_ : pylibcudf.DataType Source datatype - to + + to : pylibcudf.DataType Target datatype Returns ------- - True if casting is supported, False otherwise + bool + True if the cast is supported, False otherwise. """ to_is_empty = to.id() == plc.TypeId.EMPTY from_is_empty = from_.id() == plc.TypeId.EMPTY diff --git a/python/cudf_polars/tests/test_drop_nulls.py b/python/cudf_polars/tests/test_drop_nulls.py index 5dfe9f66a97..46a8007a805 100644 --- a/python/cudf_polars/tests/test_drop_nulls.py +++ b/python/cudf_polars/tests/test_drop_nulls.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -46,6 +46,11 @@ def test_fill_null(null_data, value): assert_gpu_result_equal(q) +def test_fill_null_with_string(): + q = pl.LazyFrame({"a": [None, "a"]}).select(pl.col("a").fill_null("b")) + assert_gpu_result_equal(q) + + @pytest.mark.parametrize( "strategy", ["forward", "backward", "min", "max", "mean", "zero", "one"] ) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index ab66700f2f6..f60a46b52d6 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -303,3 +303,11 @@ def test_groupby_null_count_raises(df: pl.LazyFrame): def test_groupby_unsupported_non_pointwise_boolean_function(df: pl.LazyFrame, expr): q = df.group_by("key1").agg(expr) assert_ir_translation_raises(q, NotImplementedError) + + +def test_groupby_mean_type_promotion(df: pl.LazyFrame) -> None: + df = df.with_columns(pl.col("float").cast(pl.Float32)) + + q = df.group_by("key1").agg(pl.col("float").mean()) + + assert_gpu_result_equal(q, check_row_order=False) From 6ed64e88bef407fda6dbb92212eee6759dcb1e5a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 28 Jul 2025 20:25:15 -0700 Subject: [PATCH 018/366] Add primitive row dispatch support for semi/anti join and cudf::contains (#19518) This PR is a follow-up to #19361, which was reverted due to a NaN handling bug and incorrect CG size used in explicit instantiations. This revised PR addresses those issues and retargets the work for the 25.10 release. 
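For reference, a small sketch of an operation that routes through
`cudf::contains` / the semi-join path with primitive keys, where the new
dispatch applies (assuming the cudf Python API; floating-point keys
deliberately keep the generic path because the primitive row operators do not
handle NaN equality):

```python
import cudf

# Integer keys are primitive-row-op compatible and contain no floating-point
# columns, so the specialized (CG size 1) probing path can be taken.
left = cudf.DataFrame({"k": [1, 2, 3, 4]})
right = cudf.DataFrame({"k": [2, 4, 6]})
print(left.merge(right, on="k", how="leftsemi"))  # rows of left with k present in right
```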
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/19518 --- cpp/CMakeLists.txt | 3 + .../cudf/table/primitive_row_operators.cuh | 13 + cpp/src/search/contains_table.cu | 307 ++++-------------- cpp/src/search/contains_table_impl.cu | 126 +++++++ cpp/src/search/contains_table_impl.cuh | 268 +++++++++++++++ cpp/src/search/contains_table_impl_nested.cu | 94 ++++++ .../search/contains_table_impl_primitive.cu | 42 +++ cpp/tests/search/search_test.cpp | 27 ++ 8 files changed, 641 insertions(+), 239 deletions(-) create mode 100644 cpp/src/search/contains_table_impl.cu create mode 100644 cpp/src/search/contains_table_impl.cuh create mode 100644 cpp/src/search/contains_table_impl_nested.cu create mode 100644 cpp/src/search/contains_table_impl_primitive.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2b42e243033..653c61fcb96 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -686,6 +686,9 @@ add_library( src/search/contains_column.cu src/search/contains_scalar.cu src/search/contains_table.cu + src/search/contains_table_impl.cu + src/search/contains_table_impl_nested.cu + src/search/contains_table_impl_primitive.cu src/search/search_ordered.cu src/sort/is_sorted.cu src/sort/rank.cu diff --git a/cpp/include/cudf/table/primitive_row_operators.cuh b/cpp/include/cudf/table/primitive_row_operators.cuh index 1659101cb2b..3016422938e 100644 --- a/cpp/include/cudf/table/primitive_row_operators.cuh +++ b/cpp/include/cudf/table/primitive_row_operators.cuh @@ -143,6 +143,19 @@ class row_equality_comparator { rhs_row_index); } + /** + * @brief Compares the specified rows for equality. + * + * @param lhs_index The index of the first row to compare (in the lhs table) + * @param rhs_index The index of the second row to compare (in the rhs table) + * @return Boolean indicating if both rows are equal + */ + __device__ bool operator()(cudf::experimental::row::lhs_index_type lhs_index, + cudf::experimental::row::rhs_index_type rhs_index) const + { + return (*this)(static_cast(lhs_index), static_cast(rhs_index)); + } + private: cudf::nullate::DYNAMIC _has_nulls; table_device_view _lhs; diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index 245d9471550..b5d364a2590 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -14,177 +14,25 @@ * limitations under the License. 
*/ -#include "join/join_common_utils.cuh" +#include "contains_table_impl.cuh" -#include -#include #include -#include #include +#include #include #include -#include +#include #include #include #include #include -#include -#include -#include +#include namespace cudf::detail { -namespace { - -using cudf::experimental::row::lhs_index_type; -using cudf::experimental::row::rhs_index_type; - -/** - * @brief An hasher adapter wrapping both haystack hasher and needles hasher - */ -template -struct hasher_adapter { - hasher_adapter(HaystackHasher const& haystack_hasher, NeedleHasher const& needle_hasher) - : _haystack_hasher{haystack_hasher}, _needle_hasher{needle_hasher} - { - } - - __device__ constexpr auto operator()(lhs_index_type idx) const noexcept - { - return _needle_hasher(static_cast(idx)); - } - - __device__ constexpr auto operator()(rhs_index_type idx) const noexcept - { - return _haystack_hasher(static_cast(idx)); - } - - private: - HaystackHasher const _haystack_hasher; - NeedleHasher const _needle_hasher; -}; - -/** - * @brief An comparator adapter wrapping both self comparator and two table comparator - */ -template -struct comparator_adapter { - comparator_adapter(SelfEqual const& self_equal, TwoTableEqual const& two_table_equal) - : _self_equal{self_equal}, _two_table_equal{two_table_equal} - { - } - - __device__ constexpr auto operator()(rhs_index_type lhs_index, - rhs_index_type rhs_index) const noexcept - { - auto const lhs = static_cast(lhs_index); - auto const rhs = static_cast(rhs_index); - - return _self_equal(lhs, rhs); - } - - __device__ constexpr auto operator()(lhs_index_type lhs_index, - rhs_index_type rhs_index) const noexcept - { - return _two_table_equal(lhs_index, rhs_index); - } - - private: - SelfEqual const _self_equal; - TwoTableEqual const _two_table_equal; -}; - -/** - * @brief Build a row bitmask for the input table. - * - * The output bitmask will have invalid bits corresponding to the input rows having nulls (at - * any nested level) and vice versa. - * - * @param input The input table - * @param stream CUDA stream used for device memory operations and kernel launches - * @return A pair of pointer to the output bitmask and the buffer containing the bitmask - */ -std::pair build_row_bitmask(table_view const& input, - rmm::cuda_stream_view stream) -{ - auto const nullable_columns = get_nullable_columns(input); - CUDF_EXPECTS(nullable_columns.size() > 0, - "The input table has nulls thus it should have nullable columns."); - - // If there are more than one nullable column, we compute `bitmask_and` of their null masks. - // Otherwise, we have only one nullable column and can use its null mask directly. 
- if (nullable_columns.size() > 1) { - auto row_bitmask = - cudf::detail::bitmask_and( - table_view{nullable_columns}, stream, cudf::get_current_device_resource_ref()) - .first; - auto const row_bitmask_ptr = static_cast(row_bitmask.data()); - return std::pair(std::move(row_bitmask), row_bitmask_ptr); - } - - return std::pair(rmm::device_buffer{0, stream}, nullable_columns.front().null_mask()); -} - -/** - * @brief Invokes the given `func` with desired comparators based on the specified `compare_nans` - * parameter - * - * @tparam HasNested Flag indicating whether there are nested columns in haystack or needles - * @tparam Hasher Type of device hash function - * @tparam Func Type of the helper function doing `contains` check - * - * @param compare_nulls Control whether nulls should be compared as equal or not - * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not - * @param haystack_has_nulls Flag indicating whether haystack has nulls or not - * @param has_any_nulls Flag indicating whether there are nested nulls is either haystack or needles - * @param self_equal Self table comparator - * @param two_table_equal Two table comparator - * @param d_hasher Device hash functor - * @param func The input functor to invoke - */ -template -void dispatch_nan_comparator( - null_equality compare_nulls, - nan_equality compare_nans, - bool haystack_has_nulls, - bool has_any_nulls, - cudf::experimental::row::equality::self_comparator self_equal, - cudf::experimental::row::equality::two_table_comparator two_table_equal, - Hasher const& d_hasher, - Func&& func) -{ - // Distinguish probing scheme CG sizes between nested and flat types for better performance - auto const probing_scheme = [&]() { - if constexpr (HasNested) { - return cuco::linear_probing<4, Hasher>{d_hasher}; - } else { - return cuco::linear_probing<1, Hasher>{d_hasher}; - } - }(); - - if (compare_nans == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - auto const d_self_equal = self_equal.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_equal_comparator{}); - auto const d_two_table_equal = two_table_equal.equal_to( - nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_equal_comparator{}); - func(d_self_equal, d_two_table_equal, probing_scheme); - } else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - auto const d_self_equal = self_equal.equal_to( - nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_unequal_comparator{}); - auto const d_two_table_equal = two_table_equal.equal_to( - nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_unequal_comparator{}); - func(d_self_equal, d_two_table_equal, probing_scheme); - } -} - -} // namespace - rmm::device_uvector contains(table_view const& haystack, table_view const& needles, null_equality compare_nulls, @@ -203,94 +51,75 @@ rmm::device_uvector contains(table_view const& haystack, auto const preprocessed_haystack = cudf::experimental::row::equality::preprocessed_table::create(haystack, stream); - auto const haystack_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_haystack); - auto const d_haystack_hasher = haystack_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); - auto const needle_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_needles); - auto const d_needle_hasher = needle_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); - auto const 
d_hasher = hasher_adapter{d_haystack_hasher, d_needle_hasher}; - - auto const self_equal = cudf::experimental::row::equality::self_comparator(preprocessed_haystack); - auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( - preprocessed_needles, preprocessed_haystack); - // The output vector. auto contained = rmm::device_uvector(needles.num_rows(), stream, mr); - auto const haystack_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, cuda::proclaim_return_type([] __device__(auto idx) { - return rhs_index_type{idx}; - })); - auto const needles_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, cuda::proclaim_return_type([] __device__(auto idx) { - return lhs_index_type{idx}; - })); - - auto const helper_func = - [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) { - auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; - - auto set = cuco::static_set{ - cuco::extent{compute_hash_table_size(haystack.num_rows())}, - cuco::empty_key{rhs_index_type{-1}}, - d_equal, - probing_scheme, - {}, - {}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { - auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); - auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; - - // If the haystack table has nulls but they are compared unequal, don't insert them. - // Otherwise, it was known to cause performance issue: - // - https://github.com/rapidsai/cudf/pull/6943 - // - https://github.com/rapidsai/cudf/pull/8277 - set.insert_if_async(haystack_iter, - haystack_iter + haystack.num_rows(), - thrust::counting_iterator(0), // stencil - row_is_valid{row_bitmask_ptr}, - stream.value()); - } else { - set.insert_async(haystack_iter, haystack_iter + haystack.num_rows(), stream.value()); - } - - if (needles_has_nulls && compare_nulls == null_equality::UNEQUAL) { - auto const bitmask_buffer_and_ptr = build_row_bitmask(needles, stream); - auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; - set.contains_if_async(needles_iter, - needles_iter + needles.num_rows(), - thrust::counting_iterator(0), // stencil - row_is_valid{row_bitmask_ptr}, - contained.begin(), - stream.value()); - } else { - set.contains_async( - needles_iter, needles_iter + needles.num_rows(), contained.begin(), stream.value()); - } - }; - - if (cudf::detail::has_nested_columns(haystack)) { - dispatch_nan_comparator(compare_nulls, - compare_nans, - haystack_has_nulls, - has_any_nulls, - self_equal, - two_table_equal, - d_hasher, - helper_func); + // Only use primitive row operators for non-floating-point types since they don't handle NaN + // equality + auto const has_floating_point = + std::any_of(haystack.begin(), haystack.end(), [](auto const& col) { + return cudf::is_floating_point(col.type()); + }); + if (cudf::is_primitive_row_op_compatible(haystack) && !has_floating_point) { + auto const d_haystack_hasher = + cudf::row::primitive::row_hasher{nullate::DYNAMIC{has_any_nulls}, preprocessed_haystack}; + auto const d_needle_hasher = + cudf::row::primitive::row_hasher{nullate::DYNAMIC{has_any_nulls}, preprocessed_needles}; + auto const d_hasher = hasher_adapter{d_haystack_hasher, d_needle_hasher}; + auto const d_self_equal = cudf::row::primitive::row_equality_comparator{ + nullate::DYNAMIC{has_any_nulls}, preprocessed_haystack, preprocessed_haystack, compare_nulls}; + auto const 
d_two_table_equal = cudf::row::primitive::row_equality_comparator{ + nullate::DYNAMIC{has_any_nulls}, preprocessed_needles, preprocessed_haystack, compare_nulls}; + auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; + perform_contains(haystack, + needles, + haystack_has_nulls, + needles_has_nulls, + compare_nulls, + d_equal, + cuco::linear_probing<1, decltype(d_hasher)>{d_hasher}, + contained, + stream); } else { - dispatch_nan_comparator(compare_nulls, - compare_nans, - haystack_has_nulls, - has_any_nulls, - self_equal, - two_table_equal, - d_hasher, - helper_func); + auto const haystack_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_haystack); + auto const d_haystack_hasher = haystack_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); + auto const needle_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_needles); + auto const d_needle_hasher = needle_hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); + auto const d_hasher = hasher_adapter{d_haystack_hasher, d_needle_hasher}; + + auto const self_equal = + cudf::experimental::row::equality::self_comparator(preprocessed_haystack); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_needles, preprocessed_haystack); + + if (cudf::detail::has_nested_columns(haystack)) { + dispatch_nan_comparator(haystack, + needles, + compare_nulls, + compare_nans, + haystack_has_nulls, + needles_has_nulls, + has_any_nulls, + self_equal, + two_table_equal, + d_hasher, + contained, + stream); + } else { + dispatch_nan_comparator(haystack, + needles, + compare_nulls, + compare_nans, + haystack_has_nulls, + needles_has_nulls, + has_any_nulls, + self_equal, + two_table_equal, + d_hasher, + contained, + stream); + } } - return contained; } diff --git a/cpp/src/search/contains_table_impl.cu b/cpp/src/search/contains_table_impl.cu new file mode 100644 index 00000000000..dcbd4855b73 --- /dev/null +++ b/cpp/src/search/contains_table_impl.cu @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "contains_table_impl.cuh" + +#include +#include +#include + +#include + +namespace cudf::detail { + +using cudf::experimental::row::lhs_index_type; +using cudf::experimental::row::rhs_index_type; + +/** + * @brief Build a row bitmask for the input table. + * + * The output bitmask will have invalid bits corresponding to the input rows having nulls (at + * any nested level) and vice versa. 
+ * + * @param input The input table + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A pair of pointer to the output bitmask and the buffer containing the bitmask + */ +std::pair build_row_bitmask(table_view const& input, + rmm::cuda_stream_view stream) +{ + auto const nullable_columns = get_nullable_columns(input); + CUDF_EXPECTS(nullable_columns.size() > 0, + "The input table has nulls thus it should have nullable columns."); + + // If there are more than one nullable column, we compute `bitmask_and` of their null masks. + // Otherwise, we have only one nullable column and can use its null mask directly. + if (nullable_columns.size() > 1) { + auto row_bitmask = + cudf::detail::bitmask_and( + table_view{nullable_columns}, stream, cudf::get_current_device_resource_ref()) + .first; + auto const row_bitmask_ptr = static_cast(row_bitmask.data()); + return std::pair(std::move(row_bitmask), row_bitmask_ptr); + } + + return std::pair(rmm::device_buffer{0, stream}, nullable_columns.front().null_mask()); +} + +// Explicit instantiations for non-nested types (HasNested=false) +using hasher_adapter_t = hasher_adapter< + cudf::experimental::row::hash::device_row_hasher, + cudf::experimental::row::hash::device_row_hasher>; + +template void dispatch_nan_comparator( + table_view const& haystack, + table_view const& needles, + null_equality compare_nulls, + nan_equality compare_nans, + bool haystack_has_nulls, + bool needles_has_nulls, + bool has_any_nulls, + cudf::experimental::row::equality::self_comparator self_equal, + cudf::experimental::row::equality::two_table_comparator two_table_equal, + hasher_adapter_t const& d_hasher, + rmm::device_uvector& contained, + rmm::cuda_stream_view stream); + +// For HasNested=false (non-nested columns) with nan_equal_comparator +using nan_equal_self_comparator = cudf::experimental::row::equality::device_row_comparator< + false, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using nan_equal_two_table_comparator = + cudf::experimental::row::equality::strong_index_comparator_adapter; + +using nan_equal_comparator_adapter = + comparator_adapter; + +template void perform_contains(table_view const& haystack, + table_view const& needles, + bool haystack_has_nulls, + bool needles_has_nulls, + null_equality compare_nulls, + nan_equal_comparator_adapter const& d_equal, + cuco::linear_probing<1, hasher_adapter_t> const& probing_scheme, + rmm::device_uvector& contained, + rmm::cuda_stream_view stream); + +// For HasNested=false (non-nested columns) with nan_unequal_comparator +using nan_unequal_self_comparator = cudf::experimental::row::equality::device_row_comparator< + false, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::physical_equality_comparator>; + +using nan_unequal_two_table_comparator = + cudf::experimental::row::equality::strong_index_comparator_adapter; + +using nan_unequal_comparator_adapter = + comparator_adapter; + +template void perform_contains(table_view const& haystack, + table_view const& needles, + bool haystack_has_nulls, + bool needles_has_nulls, + null_equality compare_nulls, + nan_unequal_comparator_adapter const& d_equal, + cuco::linear_probing<1, hasher_adapter_t> const& probing_scheme, + rmm::device_uvector& contained, + rmm::cuda_stream_view stream); + +} // namespace cudf::detail diff --git a/cpp/src/search/contains_table_impl.cuh b/cpp/src/search/contains_table_impl.cuh new file mode 100644 index 00000000000..dedf79fc0f7 
--- /dev/null +++ b/cpp/src/search/contains_table_impl.cuh @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "join/join_common_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::detail { + +using cudf::experimental::row::lhs_index_type; +using cudf::experimental::row::rhs_index_type; + +/** + * @brief A hasher adapter wrapping both the haystack hasher and the needles hasher + */ +template <typename HaystackHasher, typename NeedleHasher> +struct hasher_adapter { + hasher_adapter(HaystackHasher const& haystack_hasher, NeedleHasher const& needle_hasher) + : _haystack_hasher{haystack_hasher}, _needle_hasher{needle_hasher} + { + } + + __device__ constexpr auto operator()(lhs_index_type idx) const noexcept + { + return _needle_hasher(static_cast<size_type>(idx)); + } + + __device__ constexpr auto operator()(rhs_index_type idx) const noexcept + { + return _haystack_hasher(static_cast<size_type>(idx)); + } + + private: + HaystackHasher const _haystack_hasher; + NeedleHasher const _needle_hasher; +}; + +/** + * @brief A comparator adapter wrapping both the self comparator and the two-table comparator + */ +template <typename SelfEqual, typename TwoTableEqual> +struct comparator_adapter { + comparator_adapter(SelfEqual const& self_equal, TwoTableEqual const& two_table_equal) + : _self_equal{self_equal}, _two_table_equal{two_table_equal} + { + } + + __device__ constexpr auto operator()(rhs_index_type lhs_index, + rhs_index_type rhs_index) const noexcept + { + auto const lhs = static_cast<size_type>(lhs_index); + auto const rhs = static_cast<size_type>(rhs_index); + + return _self_equal(lhs, rhs); + } + + __device__ constexpr auto operator()(lhs_index_type lhs_index, + rhs_index_type rhs_index) const noexcept + { + return _two_table_equal(lhs_index, rhs_index); + } + + private: + SelfEqual const _self_equal; + TwoTableEqual const _two_table_equal; +}; + +/** + * @brief Build a row bitmask for the input table. + * + * The output bitmask will have invalid bits corresponding to the input rows having nulls (at + * any nested level) and vice versa.
+ * + * @param input The input table + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A pair of the buffer containing the output bitmask and a pointer to that bitmask + */ +std::pair<rmm::device_buffer, bitmask_type const*> build_row_bitmask(table_view const& input, + rmm::cuda_stream_view stream); + +/** + * @brief Helper function to perform the contains operation using a hash set + * + * @tparam Comparator Type of the equality comparator + * @tparam ProbingScheme Type of the probing scheme + * + * @param haystack The haystack table view + * @param needles The needles table view + * @param haystack_has_nulls Flag indicating whether haystack has nulls + * @param needles_has_nulls Flag indicating whether needles has nulls + * @param compare_nulls Control whether nulls should be compared as equal or not + * @param d_equal The equality comparator + * @param probing_scheme The probing scheme for the hash set + * @param contained The output vector to store results + * @param stream CUDA stream used for device memory operations and kernel launches + */ +template <typename Comparator, typename ProbingScheme> +void perform_contains(table_view const& haystack, + table_view const& needles, + bool haystack_has_nulls, + bool needles_has_nulls, + null_equality compare_nulls, + Comparator const& d_equal, + ProbingScheme const& probing_scheme, + rmm::device_uvector<bool>& contained, + rmm::cuda_stream_view stream) +{ + auto const haystack_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, cuda::proclaim_return_type<rhs_index_type>([] __device__(auto idx) { + return rhs_index_type{idx}; + })); + + auto const needles_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, cuda::proclaim_return_type<lhs_index_type>([] __device__(auto idx) { + return lhs_index_type{idx}; + })); + + auto set = cuco::static_set{ + cuco::extent{compute_hash_table_size(haystack.num_rows())}, + cuco::empty_key{rhs_index_type{-1}}, + d_equal, + probing_scheme, + {}, + {}, + cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}, + stream.value()}; + + if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + + // If the haystack table has nulls but they are compared unequal, don't insert them.
+ // Otherwise, this was known to cause performance issues: + // - https://github.com/rapidsai/cudf/pull/6943 + // - https://github.com/rapidsai/cudf/pull/8277 + set.insert_if_async(haystack_iter, + haystack_iter + haystack.num_rows(), + thrust::counting_iterator<size_type>(0), // stencil + row_is_valid{row_bitmask_ptr}, + stream.value()); + } else { + set.insert_async(haystack_iter, haystack_iter + haystack.num_rows(), stream.value()); + } + + if (needles_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(needles, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + set.contains_if_async(needles_iter, + needles_iter + needles.num_rows(), + thrust::counting_iterator<size_type>(0), // stencil + row_is_valid{row_bitmask_ptr}, + contained.begin(), + stream.value()); + } else { + set.contains_async( + needles_iter, needles_iter + needles.num_rows(), contained.begin(), stream.value()); + } +} + +/** + * @brief Invokes perform_contains with the desired comparators based on the specified + * `compare_nans` parameter + * + * @tparam HasNested Flag indicating whether there are nested columns in haystack or needles + * @tparam Hasher Type of the device hash function + * + * @param haystack The haystack table view + * @param needles The needles table view + * @param compare_nulls Control whether nulls should be compared as equal or not + * @param compare_nans Control whether floating-point NaN values should be compared as equal or not + * @param haystack_has_nulls Flag indicating whether haystack has nulls or not + * @param needles_has_nulls Flag indicating whether needles has nulls or not + * @param has_any_nulls Flag indicating whether there are nested nulls in either haystack or needles + * @param self_equal Self table comparator + * @param two_table_equal Two-table comparator + * @param d_hasher Device hash functor + * @param contained The output vector to store results + * @param stream CUDA stream used for device memory operations and kernel launches + */ +template <bool HasNested, typename Hasher> +void dispatch_nan_comparator( + table_view const& haystack, + table_view const& needles, + null_equality compare_nulls, + nan_equality compare_nans, + bool haystack_has_nulls, + bool needles_has_nulls, + bool has_any_nulls, + cudf::experimental::row::equality::self_comparator self_equal, + cudf::experimental::row::equality::two_table_comparator two_table_equal, + Hasher const& d_hasher, + rmm::device_uvector<bool>& contained, + rmm::cuda_stream_view stream) +{ + // Distinguish probing scheme CG sizes between nested and flat types for better performance + auto const probing_scheme = [&]() { + if constexpr (HasNested) { + return cuco::linear_probing<4, Hasher>{d_hasher}; + } else { + return cuco::linear_probing<1, Hasher>{d_hasher}; + } + }(); + + if (compare_nans == nan_equality::ALL_EQUAL) { + using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + auto const d_self_equal = self_equal.equal_to( + nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_equal_comparator{}); + auto const d_two_table_equal = two_table_equal.equal_to( + nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_equal_comparator{}); + auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; + perform_contains(haystack, + needles, + haystack_has_nulls, + needles_has_nulls, + compare_nulls, + d_equal, + probing_scheme, + contained, + stream); + } else { + using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; +
auto const d_self_equal = self_equal.equal_to( + nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, nan_unequal_comparator{}); + auto const d_two_table_equal = two_table_equal.equal_to( + nullate::DYNAMIC{has_any_nulls}, compare_nulls, nan_unequal_comparator{}); + auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal}; + perform_contains(haystack, + needles, + haystack_has_nulls, + needles_has_nulls, + compare_nulls, + d_equal, + probing_scheme, + contained, + stream); + } +} + +} // namespace cudf::detail diff --git a/cpp/src/search/contains_table_impl_nested.cu b/cpp/src/search/contains_table_impl_nested.cu new file mode 100644 index 00000000000..e90870508af --- /dev/null +++ b/cpp/src/search/contains_table_impl_nested.cu @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "contains_table_impl.cuh" + +#include +#include +#include + +namespace cudf::detail { + +// Explicit instantiations to reduce build time +using hasher_adapter_t = hasher_adapter< + cudf::experimental::row::hash::device_row_hasher, + cudf::experimental::row::hash::device_row_hasher>; + +template void dispatch_nan_comparator<true>( + table_view const& haystack, + table_view const& needles, + null_equality compare_nulls, + nan_equality compare_nans, + bool haystack_has_nulls, + bool needles_has_nulls, + bool has_any_nulls, + cudf::experimental::row::equality::self_comparator self_equal, + cudf::experimental::row::equality::two_table_comparator two_table_equal, + hasher_adapter_t const& d_hasher, + rmm::device_uvector<bool>& contained, + rmm::cuda_stream_view stream); + +// Explicit instantiations for perform_contains with nested types (experimental row operations) + +// For HasNested=true (nested columns) with nan_equal_comparator +using nan_equal_self_comparator_nested = cudf::experimental::row::equality::device_row_comparator< + true, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using nan_equal_two_table_comparator_nested = + cudf::experimental::row::equality::strong_index_comparator_adapter< + nan_equal_self_comparator_nested>; + +using nan_equal_comparator_adapter_nested = + comparator_adapter<nan_equal_self_comparator_nested, nan_equal_two_table_comparator_nested>; + +template void perform_contains(table_view const& haystack, + table_view const& needles, + bool haystack_has_nulls, + bool needles_has_nulls, + null_equality compare_nulls, + nan_equal_comparator_adapter_nested const& d_equal, + cuco::linear_probing<4, hasher_adapter_t> const& probing_scheme, + rmm::device_uvector<bool>& contained, + rmm::cuda_stream_view stream); + +// For HasNested=true (nested columns) with nan_unequal_comparator +using nan_unequal_self_comparator_nested = cudf::experimental::row::equality::device_row_comparator< + true, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::physical_equality_comparator>; + +using nan_unequal_two_table_comparator_nested = + cudf::experimental::row::equality::strong_index_comparator_adapter< +
nan_unequal_self_comparator_nested>; + +using nan_unequal_comparator_adapter_nested = + comparator_adapter<nan_unequal_self_comparator_nested, nan_unequal_two_table_comparator_nested>; + +template void perform_contains(table_view const& haystack, + table_view const& needles, + bool haystack_has_nulls, + bool needles_has_nulls, + null_equality compare_nulls, + nan_unequal_comparator_adapter_nested const& d_equal, + cuco::linear_probing<4, hasher_adapter_t> const& probing_scheme, + rmm::device_uvector<bool>& contained, + rmm::cuda_stream_view stream); + +} // namespace cudf::detail diff --git a/cpp/src/search/contains_table_impl_primitive.cu b/cpp/src/search/contains_table_impl_primitive.cu new file mode 100644 index 00000000000..9d925321509 --- /dev/null +++ b/cpp/src/search/contains_table_impl_primitive.cu @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "contains_table_impl.cuh" + +#include + +namespace cudf::detail { + +// Explicit instantiation for perform_contains with primitive row operations +using primitive_hasher_adapter_type = + hasher_adapter<cudf::row::primitive::row_hasher<>, cudf::row::primitive::row_hasher<>>; + +using primitive_comparator_adapter_type = + comparator_adapter; + +template void perform_contains( + table_view const& haystack, + table_view const& needles, + bool haystack_has_nulls, + bool needles_has_nulls, + null_equality compare_nulls, + primitive_comparator_adapter_type const& d_equal, + cuco::linear_probing<1, primitive_hasher_adapter_type> const& probing_scheme, + rmm::device_uvector<bool>& contained, + rmm::cuda_stream_view stream); + +} // namespace cudf::detail diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp index ea566fcbd38..5788fd1d5d6 100644 --- a/cpp/tests/search/search_test.cpp +++ b/cpp/tests/search/search_test.cpp @@ -20,10 +20,15 @@ #include #include +#include #include +#include + +#include #include +#include #include struct SearchTest : public cudf::test::BaseFixture {}; @@ -1819,6 +1824,28 @@ TEST_F(SearchTest, multi_contains_empty_input_set_string) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect); } +TEST_F(SearchTest, multi_contains_primitive_nan_unequal_bug) +{ + auto nan_val = std::numeric_limits<float>::quiet_NaN(); + + fixed_width_column_wrapper<float> haystack{1.0f, nan_val, 3.0f}; + fixed_width_column_wrapper<float> needles{nan_val}; + + auto result = cudf::detail::contains(cudf::table_view{{haystack}}, + cudf::table_view{{needles}}, + cudf::null_equality::EQUAL, + cudf::nan_equality::UNEQUAL, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + thrust::host_vector<bool> result_host(result.size()); + CUDF_CUDA_TRY(cudaMemcpy( + result_host.data(), result.data(), result.size() * sizeof(bool), cudaMemcpyDeviceToHost)); + + // With nan_equality::UNEQUAL, NaN should not match NaN + EXPECT_FALSE(result_host[0]); +} + template <typename T> struct FixedPointTestAllReps : public cudf::test::BaseFixture {}; From 6b06d2003dd3facd4495d01c563d96a694ef6422 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 29 Jul 2025
06:11:31 -0400 Subject: [PATCH 019/366] Fix value counts expression when the column has nulls (#19524) Closes #19523. This is really a one-line fix. We need to pass `null_handling=plc.types.NullPolicy.INCLUDE` to `GroupBy` in the value counts expression implementation. This ensures that we group by null keys too. This PR also replaces `query` with `q` to match the convention used in other cudf-polars tests. Authors: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19524 --- .../cudf_polars/dsl/expressions/unary.py | 6 +- .../tests/expressions/test_struct.py | 70 ++++++++++--------- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 64c4cae2378..440a2efa9cc 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -257,9 +257,9 @@ def do_evaluate( ) for child in self.children ] - (keys_table, (counts_table,)) = plc.groupby.GroupBy(df.table).aggregate( - gb_requests - ) + (keys_table, (counts_table,)) = plc.groupby.GroupBy( + df.table, null_handling=plc.types.NullPolicy.INCLUDE + ).aggregate(gb_requests) if sort: sort_indices = plc.sorting.stable_sorted_order( counts_table, diff --git a/python/cudf_polars/tests/expressions/test_struct.py b/python/cudf_polars/tests/expressions/test_struct.py index ea82019ecc9..2e4686b9079 100644 --- a/python/cudf_polars/tests/expressions/test_struct.py +++ b/python/cudf_polars/tests/expressions/test_struct.py @@ -27,8 +27,8 @@ def test_field_getitem(request, ldf): reason="not supported until polars 1.31", ) ) - query = ldf.select(pl.col("a").struct[0]) - assert_gpu_result_equal(query) + q = ldf.select(pl.col("a").struct[0]) + assert_gpu_result_equal(q) @pytest.mark.parametrize("fields", [("b",), ("b", "d"), ("^b.*|f.*$",)]) def test_field(request, ldf, fields): reason="not supported until polars 1.31", ) ) - query = ldf.select(pl.col("a").struct.field(*fields)) - assert_gpu_result_equal(query) + q = ldf.select(pl.col("a").struct.field(*fields)) + assert_gpu_result_equal(q) def test_unnest(request, ldf): reason="not supported until polars 1.31", ) ) - query = ldf.select(pl.col("a").struct.unnest()) - assert_gpu_result_equal(query) + q = ldf.select(pl.col("a").struct.unnest()) + assert_gpu_result_equal(q) def test_json_encode(request, ldf): reason="not supported until polars 1.31", ) ) - query = ldf.select(pl.col("a").struct.json_encode()) - assert_gpu_result_equal(query) + q = ldf.select(pl.col("a").struct.json_encode()) + assert_gpu_result_equal(q) ldf_newlines = pl.LazyFrame({"a": [{"b": "c\nd", "d": "\r\nz"}]}) - query = ldf_newlines.select(pl.col("a").struct.json_encode()) - assert_gpu_result_equal(query) + q = ldf_newlines.select(pl.col("a").struct.json_encode()) + assert_gpu_result_equal(q) def test_rename_fields(request, ldf): reason="not supported until polars 1.31", ) ) - query = ldf.select( - pl.col("a").struct.rename_fields(["1", "2", "3"]).struct.unnest() - ) - assert_gpu_result_equal(query) + q = ldf.select(pl.col("a").struct.rename_fields(["1", "2", "3"]).struct.unnest()) + assert_gpu_result_equal(q) def 
test_with_fields(ldf): - query = ldf.select( + q = ldf.select( pl.col("a").struct.with_fields(pl.field("b").str.len_chars()).struct.unnest() ) - assert_ir_translation_raises(query, NotImplementedError) + assert_ir_translation_raises(q, NotImplementedError) @pytest.mark.parametrize( @@ -101,47 +99,51 @@ def test_prefix_suffix_fields(request, ldf, expr): reason="not supported until polars 1.31", ) ) - query = ldf.select(expr("foo").struct.unnest()) - assert_gpu_result_equal(query) + q = ldf.select(expr("foo").struct.unnest()) + assert_gpu_result_equal(q) def test_map_field_names(ldf): - query = ldf.select(pl.col("a").name.map_fields(lambda x: x.upper()).struct.unnest()) - assert_ir_translation_raises(query, NotImplementedError) + q = ldf.select(pl.col("a").name.map_fields(lambda x: x.upper()).struct.unnest()) + assert_ir_translation_raises(q, NotImplementedError) @pytest.mark.parametrize("name", [None, "my_count"]) @pytest.mark.parametrize("normalize", [True, False]) def test_value_counts(ldf, name, normalize): # sort=True since order is non-deterministic - query = ldf.select( - pl.col("a").value_counts(sort=True, name=name, normalize=normalize) - ) - assert_gpu_result_equal(query) + q = ldf.select(pl.col("a").value_counts(sort=True, name=name, normalize=normalize)) + assert_gpu_result_equal(q) def test_value_counts_normalize_div_by_zero(): ldf = pl.LazyFrame({"a": []}, schema={"a": pl.Int64()}) - query = ldf.select(pl.col("a").value_counts(normalize=True)) - assert_gpu_result_equal(query) + q = ldf.select(pl.col("a").value_counts(normalize=True)) + assert_gpu_result_equal(q) def test_groupby_value_counts_notimplemented(): lgb = pl.LazyFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]}).group_by("a") value_counts_expr = pl.col("b").value_counts() - query = lgb.agg(value_counts_expr) - assert_ir_translation_raises(query, NotImplementedError) + q = lgb.agg(value_counts_expr) + assert_ir_translation_raises(q, NotImplementedError) - query = lgb.agg(value_counts_expr.first()) - assert_ir_translation_raises(query, NotImplementedError) + q = lgb.agg(value_counts_expr.first()) + assert_ir_translation_raises(q, NotImplementedError) def test_struct(ldf): - query = ldf.select(pl.struct(pl.all())) - assert_gpu_result_equal(query) + q = ldf.select(pl.struct(pl.all())) + assert_gpu_result_equal(q) def test_nested_struct(): ldf = pl.LazyFrame({"a": [{"x": {"i": 0, "j": 0}, "y": {"i": 0, "k": 1}}]}) - query = ldf.select(pl.struct(pl.all())) - assert_gpu_result_equal(query) + q = ldf.select(pl.struct(pl.all())) + assert_gpu_result_equal(q) + + +def test_value_counts_with_nulls(ldf): + ldf_with_nulls = ldf.select(c=pl.Series(["x", None, "y", "x", None, "x"])) + q = ldf_with_nulls.select(pl.col("c").value_counts(sort=True)) + assert_gpu_result_equal(q) From 4d79b06a058a954e49d2326211790bf5e00cf6b6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 29 Jul 2025 14:53:46 +0100 Subject: [PATCH 020/366] Fix clang-tools version pinning (#19529) We specify this in two places, use the exact pin rather than fuzzy pin in both. 
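As background (an illustration, not part of the patch): in conda match specs a single `=` is a fuzzy pin that accepts any version starting with the given string, while `==` requires the exact version. A minimal Python sketch of that distinction, using made-up version strings:

```python
# Sketch of conda version-pin semantics, not cudf code: "clang-tools=20.1.4"
# behaves like the pattern "20.1.4*" (prefix match), whereas
# "clang-tools==20.1.4" matches only the exact version.
def fuzzy_pin_matches(version: str, pinned: str) -> bool:
    # "=X" is shorthand for "X*" in conda match specs
    return version.startswith(pinned)

def exact_pin_matches(version: str, pinned: str) -> bool:
    return version == pinned

assert fuzzy_pin_matches("20.1.4", "20.1.4")
assert fuzzy_pin_matches("20.1.4.post1", "20.1.4")      # fuzzy also accepts this
assert not exact_pin_matches("20.1.4.post1", "20.1.4")  # exact does not
```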
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19529 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 1 - conda/environments/all_cuda-129_arch-x86_64.yaml | 1 - dependencies.yaml | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index c536a7d662c..15af26470a4 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -12,7 +12,6 @@ dependencies: - c-compiler - cachetools - certifi -- clang-tools=20.1.4 - clang-tools==20.1.4 - clang==20.1.4 - cmake>=3.30.4 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index bf9e857f6bc..e606a1cd4c3 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -12,7 +12,6 @@ dependencies: - c-compiler - cachetools - certifi -- clang-tools=20.1.4 - clang-tools==20.1.4 - clang==20.1.4 - cmake>=3.30.4 diff --git a/dependencies.yaml b/dependencies.yaml index dfcfb40d7a9..a03f5ac2c3f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -563,7 +563,7 @@ dependencies: - output_types: conda packages: - clang==20.1.4 - - clang-tools=20.1.4 + - clang-tools==20.1.4 clang_tidy: common: - output_types: conda From 67bc0734626d73b6bf950dbe2296f28c2300c578 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 29 Jul 2025 07:58:58 -0700 Subject: [PATCH 021/366] Add cudf_polars unit test for `is_in([])` expr (#19525) closes https://github.com/rapidsai/cudf/issues/18853 Appears it has been fixed in the meantime Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19525 --- python/cudf_polars/tests/expressions/test_booleanfunction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 94a6f50fcfe..200b9571c7f 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -191,6 +191,7 @@ def test_boolean_horizontal(expr, has_nulls, wide): marks=pytest.mark.xfail(reason="Need to support implode agg"), ), pl.col("a").is_in([1, 2, 3]), + pl.col("a").is_in([]), pl.col("a").is_in([3, 4, 2]), pl.col("c").is_in([10, None, 11]), ], From c6efdfc1ea36fbbb8bebf01bd8c6d6132b294408 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 29 Jul 2025 13:17:25 -0400 Subject: [PATCH 022/366] [FEA] Add chunked Parquet sink support using the libcudf writer (#19015) Closes https://github.com/rapidsai/cudf/issues/18969. This PR adds support for chunked writing to cudf-polars using the libcudf writer. Chunked writing is not turned on by default in cudf-polars and is controlled via the `n_output_chunks` in the GPU engine config. 
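For example (mirroring the tests added in this PR), the chunked writer is enabled through the engine's Parquet options; the output file path below is just a placeholder:

```python
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})

# Opt in to libcudf's ChunkedParquetWriter and split the output table into
# 4 chunks; with chunked=False (or n_output_chunks=1) the regular
# write_parquet path is used instead.
engine = pl.GPUEngine(
    raise_on_fail=True,
    parquet_options={"chunked": True, "n_output_chunks": 4},
)
lf.sink_parquet("out.parquet", engine=engine)
```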
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19015 --- python/cudf_polars/cudf_polars/dsl/ir.py | 124 ++++++++++++++---- .../cudf_polars/cudf_polars/dsl/translate.py | 1 + .../cudf_polars/experimental/io.py | 24 +--- .../cudf_polars/testing/asserts.py | 6 +- .../cudf_polars/cudf_polars/utils/config.py | 15 ++- python/cudf_polars/tests/test_config.py | 7 +- python/cudf_polars/tests/test_sink.py | 21 ++- python/pylibcudf/pylibcudf/io/parquet.pxd | 4 + python/pylibcudf/pylibcudf/io/parquet.pyi | 2 + python/pylibcudf/pylibcudf/io/parquet.pyx | 44 ++++++- 10 files changed, 195 insertions(+), 53 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 16fef477ab2..28b20c835a0 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -668,18 +668,33 @@ def read_csv_header( class Sink(IR): """Sink a dataframe to a file.""" - __slots__ = ("cloud_options", "kind", "options", "path") - _non_child = ("schema", "kind", "path", "options", "cloud_options") + __slots__ = ("cloud_options", "kind", "options", "parquet_options", "path") + _non_child = ( + "schema", + "kind", + "path", + "parquet_options", + "options", + "cloud_options", + ) kind: str + """The type of file to write to. Eg. Parquet, CSV, etc.""" path: str + """The path to write to""" + parquet_options: ParquetOptions + """GPU-specific configuration options""" + cloud_options: dict[str, Any] | None + """Cloud-related authentication options, currently ignored.""" options: dict[str, Any] + """Sink options from Polars""" def __init__( self, schema: Schema, kind: str, path: str, + parquet_options: ParquetOptions, options: dict[str, Any], cloud_options: dict[str, Any], df: IR, @@ -687,10 +702,11 @@ def __init__( self.schema = schema self.kind = kind self.path = path + self.parquet_options = parquet_options self.options = options self.cloud_options = cloud_options self.children = (df,) - self._non_child_args = (schema, kind, path, options) + self._non_child_args = (schema, kind, path, parquet_options, options) if self.cloud_options is not None and any( self.cloud_options.get(k) is not None for k in ("config", "credential_provider") @@ -783,6 +799,7 @@ def get_hashable(self) -> Hashable: schema_hash, self.kind, self.path, + self.parquet_options, json.dumps(self.options), json.dumps(self.cloud_options), ) # pragma: no cover @@ -821,6 +838,85 @@ def _write_json(cls, target: plc.io.SinkInfo, df: DataFrame) -> None: ) plc.io.json.write_json(options) + @staticmethod + def _make_parquet_metadata(df: DataFrame) -> plc.io.types.TableInputMetadata: + """Create TableInputMetadata and set column names.""" + metadata = plc.io.types.TableInputMetadata(df.table) + for i, name in enumerate(df.column_names): + metadata.column_metadata[i].set_name(name) + return metadata + + @staticmethod + def _apply_parquet_writer_options( + builder: plc.io.parquet.ChunkedParquetWriterOptionsBuilder + | plc.io.parquet.ParquetWriterOptionsBuilder, + options: dict[str, Any], + ) -> ( + plc.io.parquet.ChunkedParquetWriterOptionsBuilder + | plc.io.parquet.ParquetWriterOptionsBuilder + ): + """Apply writer options to the builder.""" + compression = options.get("compression") + if compression and compression != "Uncompressed": + compression_type = getattr( + plc.io.types.CompressionType, compression.upper() + ) + builder = builder.compression(compression_type) + + if 
(data_page_size := options.get("data_page_size")) is not None: + builder = builder.max_page_size_bytes(data_page_size) + + if (row_group_size := options.get("row_group_size")) is not None: + builder = builder.row_group_size_rows(row_group_size) + + return builder + + @classmethod + def _write_parquet( + cls, + target: plc.io.SinkInfo, + parquet_options: ParquetOptions, + options: dict[str, Any], + df: DataFrame, + ) -> None: + metadata: plc.io.types.TableInputMetadata = cls._make_parquet_metadata(df) + + builder: ( + plc.io.parquet.ChunkedParquetWriterOptionsBuilder + | plc.io.parquet.ParquetWriterOptionsBuilder + ) + + if ( + parquet_options.chunked + and parquet_options.n_output_chunks != 1 + and df.table.num_rows() != 0 + ): + builder = plc.io.parquet.ChunkedParquetWriterOptions.builder( + target + ).metadata(metadata) + builder = cls._apply_parquet_writer_options(builder, options) + writer_options = builder.build() + writer = plc.io.parquet.ChunkedParquetWriter.from_options(writer_options) + + # TODO: Can be based on a heuristic that estimates chunk size + # from the input table size and available GPU memory. + num_chunks = parquet_options.n_output_chunks + table_chunks = plc.copying.split( + df.table, + [i * df.table.num_rows() // num_chunks for i in range(1, num_chunks)], + ) + for chunk in table_chunks: + writer.write(chunk) + writer.close([]) + + else: + builder = plc.io.parquet.ParquetWriterOptions.builder( + target, df.table + ).metadata(metadata) + builder = cls._apply_parquet_writer_options(builder, options) + writer_options = builder.build() + plc.io.parquet.write_parquet(writer_options) + @classmethod @nvtx_annotate_cudf_polars(message="Sink") def do_evaluate( @@ -828,6 +924,7 @@ def do_evaluate( schema: Schema, kind: str, path: str, + parquet_options: ParquetOptions, options: dict[str, Any], df: DataFrame, ) -> DataFrame: @@ -838,27 +935,8 @@ def do_evaluate( Path(path).parent.mkdir(parents=True, exist_ok=True) if kind == "Csv": cls._write_csv(target, options, df) - elif kind == "Parquet": - metadata = plc.io.types.TableInputMetadata(df.table) - for i, name in enumerate(df.column_names): - metadata.column_metadata[i].set_name(name) - - builder = plc.io.parquet.ParquetWriterOptions.builder(target, df.table) - compression = options["compression"] - if compression != "Uncompressed": - builder.compression( - getattr(plc.io.types.CompressionType, compression.upper()) - ) - - writer_options = builder.metadata(metadata).build() - if options["data_page_size"] is not None: - writer_options.set_max_page_size_bytes(options["data_page_size"]) - if options["row_group_size"] is not None: - writer_options.set_row_group_size_rows(options["row_group_size"]) - - plc.io.parquet.write_parquet(writer_options) - + cls._write_parquet(target, parquet_options, options, df) elif kind == "Json": cls._write_json(target, df) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 50e8dd8690f..6db8085fab4 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -523,6 +523,7 @@ def _(node: pl_ir.Sink, translator: Translator, schema: Schema) -> ir.IR: schema=schema, kind=sink_kind, path=file["target"], + parquet_options=translator.config_options.parquet_options, options=options, cloud_options=cloud_options, df=translator.translate_ir(n=node.input), diff --git a/python/cudf_polars/cudf_polars/experimental/io.py b/python/cudf_polars/cudf_polars/experimental/io.py index 
92d0e5b9cf1..e45b6aae470 100644 --- a/python/cudf_polars/cudf_polars/experimental/io.py +++ b/python/cudf_polars/cudf_polars/experimental/io.py @@ -416,12 +416,13 @@ def _sink_to_directory( schema: Schema, kind: str, path: str, + parquet_options: ParquetOptions, options: dict[str, Any], df: DataFrame, ready: None, ) -> DataFrame: """Sink a partition to a new file.""" - return Sink.do_evaluate(schema, kind, path, options, df) + return Sink.do_evaluate(schema, kind, path, parquet_options, options, df) def _sink_to_parquet_file( @@ -434,23 +435,11 @@ def _sink_to_parquet_file( """Sink a partition to an open Parquet file.""" # Set up a new chunked Parquet writer if necessary. if writer is None: - metadata = plc.io.types.TableInputMetadata(df.table) - for i, name in enumerate(df.column_names): - metadata.column_metadata[i].set_name(name) - + metadata = Sink._make_parquet_metadata(df) sink = plc.io.types.SinkInfo([path]) - builder = plc.io.parquet.ChunkedParquetWriterOptions.builder(sink) - compression = options["compression"] - if compression != "Uncompressed": - builder.compression( - getattr(plc.io.types.CompressionType, compression.upper()) - ) - - if options["data_page_size"] is not None: - builder.max_page_size_bytes(options["data_page_size"]) - if options["row_group_size"] is not None: - builder.row_group_size_rows(options["row_group_size"]) - + builder = Sink._apply_parquet_writer_options( + plc.io.parquet.ChunkedParquetWriterOptions.builder(sink), options + ) writer_options = builder.metadata(metadata).build() writer = plc.io.parquet.ChunkedParquetWriter.from_options(writer_options) @@ -565,6 +554,7 @@ def _directory_sink_graph( sink.schema, sink.kind, f"{sink.path}/part.{str(i).zfill(width)}.{suffix}", + sink.parquet_options, sink.options, (child_name, i), setup_name, diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 19b4f2e16b9..256e0a6267f 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -13,7 +13,7 @@ from polars.testing.asserts import assert_frame_equal from cudf_polars.dsl.translate import Translator -from cudf_polars.utils.config import StreamingFallbackMode +from cudf_polars.utils.config import ConfigOptions, StreamingFallbackMode if TYPE_CHECKING: from cudf_polars.typing import OptimizationArgs @@ -381,9 +381,9 @@ def assert_sink_result_equal( # the multi-partition executor might produce multiple files, one per partition. if ( isinstance(engine, GPUEngine) - and engine.config["executor"] == "streaming" + and ConfigOptions.from_polars_engine(engine).executor.name == "streaming" and gpu_path.is_dir() - ): + ): # pragma: no cover result = read_fn(gpu_path.joinpath("*"), **read_kwargs) else: result = read_fn(gpu_path, **read_kwargs) diff --git a/python/cudf_polars/cudf_polars/utils/config.py b/python/cudf_polars/cudf_polars/utils/config.py index 0aa97fcd650..578dfd0694e 100644 --- a/python/cudf_polars/cudf_polars/utils/config.py +++ b/python/cudf_polars/cudf_polars/utils/config.py @@ -176,9 +176,11 @@ class ParquetOptions: Parameters ---------- chunked - Whether to use libcudf's ``ChunkedParquetReader`` to read the parquet - dataset in chunks. This is useful when reading very large parquet - files. + Whether to use libcudf's ``ChunkedParquetReader`` or ``ChunkedParquetWriter`` + to read/write the parquet dataset in chunks. This is useful when reading/writing + very large parquet files. 
+ n_output_chunks + Split the dataframe in ``n_output_chunks`` when using libcudf's ``ChunkedParquetWriter``. chunk_read_limit Limit on total number of bytes to be returned per read, or 0 if there is no limit. @@ -206,6 +208,11 @@ class ParquetOptions: f"{_env_prefix}__CHUNKED", _bool_converter, default=True ) ) + n_output_chunks: int = dataclasses.field( + default_factory=_make_default_factory( + f"{_env_prefix}__N_OUTPUT_CHUNKS", int, default=1 + ) + ) chunk_read_limit: int = dataclasses.field( default_factory=_make_default_factory( f"{_env_prefix}__CHUNK_READ_LIMIT", int, default=0 @@ -230,6 +237,8 @@ class ParquetOptions: def __post_init__(self) -> None: # noqa: D105 if not isinstance(self.chunked, bool): raise TypeError("chunked must be a bool") + if not isinstance(self.n_output_chunks, int): + raise TypeError("n_output_chunks must be an int") if not isinstance(self.chunk_read_limit, int): raise TypeError("chunk_read_limit must be an int") if not isinstance(self.pass_read_limit, int): diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index c7ef911f239..52651fbe5c8 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -134,14 +134,16 @@ def test_parquet_options(executor: str) -> None: ) ) assert config.parquet_options.chunked is True + assert config.parquet_options.n_output_chunks == 1 config = ConfigOptions.from_polars_engine( pl.GPUEngine( executor=executor, - parquet_options={"chunked": False}, + parquet_options={"chunked": False, "n_output_chunks": 16}, ) ) assert config.parquet_options.chunked is False + assert config.parquet_options.n_output_chunks == 16 def test_validate_streaming_executor_shuffle_method(rapidsmpf_available) -> None: @@ -299,6 +301,7 @@ def test_executor_from_env(monkeypatch: pytest.MonkeyPatch) -> None: def test_parquet_options_from_env(monkeypatch: pytest.MonkeyPatch) -> None: with monkeypatch.context() as m: m.setenv("CUDF_POLARS__PARQUET_OPTIONS__CHUNKED", "0") + m.setenv("CUDF_POLARS__PARQUET_OPTIONS__N_OUTPUT_CHUNKS", "2") m.setenv("CUDF_POLARS__PARQUET_OPTIONS__CHUNK_READ_LIMIT", "100") m.setenv("CUDF_POLARS__PARQUET_OPTIONS__PASS_READ_LIMIT", "200") m.setenv("CUDF_POLARS__PARQUET_OPTIONS__MAX_FOOTER_SAMPLES", "0") @@ -308,6 +311,7 @@ def test_parquet_options_from_env(monkeypatch: pytest.MonkeyPatch) -> None: engine = pl.GPUEngine() config = ConfigOptions.from_polars_engine(engine) assert config.parquet_options.chunked is False + assert config.parquet_options.n_output_chunks == 2 assert config.parquet_options.chunk_read_limit == 100 assert config.parquet_options.pass_read_limit == 200 assert config.parquet_options.max_footer_samples == 0 @@ -401,6 +405,7 @@ def test_cardinality_factor_compat() -> None: "option", [ "chunked", + "n_output_chunks", "chunk_read_limit", "pass_read_limit", "max_footer_samples", diff --git a/python/cudf_polars/tests/test_sink.py b/python/cudf_polars/tests/test_sink.py index a21adbda833..81267ea2ade 100644 --- a/python/cudf_polars/tests/test_sink.py +++ b/python/cudf_polars/tests/test_sink.py @@ -78,7 +78,11 @@ def test_sink_ndjson(df, tmp_path): @pytest.mark.parametrize("mkdir", [True, False]) @pytest.mark.parametrize("data_page_size", [None, 256_000]) @pytest.mark.parametrize("row_group_size", [None, 1_000]) -def test_sink_parquet(df, tmp_path, mkdir, data_page_size, row_group_size): +@pytest.mark.parametrize("is_chunked", [False, True]) +@pytest.mark.parametrize("n_output_chunks", [1, 4, 8]) +def test_sink_parquet( + df, tmp_path, 
mkdir, data_page_size, row_group_size, is_chunked, n_output_chunks +): assert_sink_result_equal( df, tmp_path / "out.parquet", @@ -87,6 +91,10 @@ def test_sink_parquet(df, tmp_path, mkdir, data_page_size, row_group_size): "data_page_size": data_page_size, "row_group_size": row_group_size, }, + engine=pl.GPUEngine( + raise_on_fail=True, + parquet_options={"chunked": is_chunked, "n_output_chunks": n_output_chunks}, + ), ) @@ -131,3 +139,14 @@ def test_sink_csv_nested_data(tmp_path): pl.exceptions.ComputeError, match="CSV format does not support nested data" ): lf.sink_csv(path, engine=pl.GPUEngine()) + + +def test_chunked_sink_empty_table_to_parquet(tmp_path): + assert_sink_result_equal( + pl.LazyFrame(), + tmp_path / "out.parquet", + engine=pl.GPUEngine( + raise_on_fail=True, + parquet_options={"chunked": True, "n_output_chunks": 2}, + ), + ) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index b9710689ac6..2a925b23f6e 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -152,6 +152,10 @@ cdef class ParquetWriterOptionsBuilder: cpdef ParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled) + cpdef ParquetWriterOptionsBuilder row_group_size_rows(self, size_type val) + + cpdef ParquetWriterOptionsBuilder max_page_size_bytes(self, size_t val) + cpdef ParquetWriterOptions build(self) cpdef memoryview write_parquet(ParquetWriterOptions options, Stream stream = *) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi index 1543ae875a7..ab7e82ce1c7 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyi +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -80,6 +80,8 @@ class ParquetWriterOptionsBuilder: def dictionary_policy(self, val: DictionaryPolicy) -> Self: ... def utc_timestamps(self, enabled: bool) -> Self: ... def write_arrow_schema(self, enabled: bool) -> Self: ... + def row_group_size_rows(self, val: int) -> Self: ... + def max_page_size_bytes(self, val: int) -> Self: ... def build(self) -> ParquetWriterOptions: ... def write_parquet( diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index 1774370523c..42803d8d8fb 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -45,15 +45,17 @@ from pylibcudf.utils cimport _get_stream __all__ = [ "ChunkedParquetReader", + "ChunkedParquetWriterOptions", + "ChunkedParquetWriterOptionsBuilder", + "ParquetChunkedWriter", + "ParquetReaderOptions", + "ParquetReaderOptionsBuilder", "ParquetWriterOptions", "ParquetWriterOptionsBuilder", + "is_supported_write_parquet", + "merge_row_group_metadata", "read_parquet", "write_parquet", - "ParquetReaderOptions", - "ParquetReaderOptionsBuilder", - "ChunkedParquetWriterOptions", - "ChunkedParquetWriterOptionsBuilder" - "merge_row_group_metadata", ] @@ -917,6 +919,38 @@ cdef class ParquetWriterOptionsBuilder: self.c_obj.write_arrow_schema(enabled) return self + cpdef ParquetWriterOptionsBuilder row_group_size_rows(self, size_type val): + """ + Sets the maximum row group size, in rows. + + Parameters + ---------- + val : size_type + Maximum row group size, in rows to set + + Returns + ------- + Self + """ + self.c_obj.row_group_size_rows(val) + return self + + cpdef ParquetWriterOptionsBuilder max_page_size_bytes(self, size_t val): + """ + Sets the maximum uncompressed page size, in bytes. 
+ + Parameters + ---------- + val : size_t + Maximum uncompressed page size, in bytes to set + + Returns + ------- + Self + """ + self.c_obj.max_page_size_bytes(val) + return self + cpdef ParquetWriterOptions build(self): """ Create a ParquetWriterOptions from the set options. From ca872a38539292b73130d4d3432da5b40842d9ad Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 29 Jul 2025 12:26:04 -0500 Subject: [PATCH 023/366] Add missing import of pyarrow.parquet when reading specified row_groups. (#19509) Closes #19508. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19509 --- python/cudf/cudf/io/parquet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 352e0e6bfe8..5e14065c08f 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -16,7 +16,6 @@ import numpy as np import pandas as pd -import pyarrow as pa import pylibcudf as plc @@ -2291,8 +2290,10 @@ def _process_metadata( range_index_meta = index_col[0] if row_groups is not None: + import pyarrow.parquet as pq + per_file_metadata = [ - pa.parquet.read_metadata( + pq.read_metadata( # Pyarrow cannot read directly from bytes io.BytesIO(s) if isinstance(s, bytes) else s ) From 67dae5d252e37f689a8a4e9506ec25db39d3b410 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 29 Jul 2025 14:27:21 -0400 Subject: [PATCH 024/366] Expose `nvtext::letter_type` to python (#19520) This PR exposes `nvtext::letter_type` enum and improves readability when printing it. See also #19451 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/19520 --- .../pylibcudf/libcudf/CMakeLists.txt | 2 ++ .../pylibcudf/libcudf/nvtext/CMakeLists.txt | 23 +++++++++++++++++++ .../pylibcudf/libcudf/nvtext/stemmer.pyx | 0 python/pylibcudf/pylibcudf/nvtext/stemmer.pyx | 8 +++++-- 4 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 python/pylibcudf/pylibcudf/libcudf/nvtext/CMakeLists.txt create mode 100644 python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pyx diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 0182cf89ca5..7a6f3d0f7da 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -23,6 +23,8 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) + +add_subdirectory(nvtext) add_subdirectory(io) add_subdirectory(lists) add_subdirectory(strings) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/nvtext/CMakeLists.txt new file mode 100644 index 00000000000..ac447646c4d --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/CMakeLists.txt @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources stemmer.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_nvtext_ +) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx index c9e4f1274e4..a64e414b850 100644 --- a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -12,7 +12,9 @@ from pylibcudf.libcudf.nvtext.stemmer cimport ( ) from pylibcudf.libcudf.types cimport size_type -__all__ = ["is_letter", "porter_stemmer_measure"] +from pylibcudf.libcudf.nvtext.stemmer import letter_type as LetterType # no-cython-lint + +__all__ = ["is_letter", "porter_stemmer_measure", "LetterType"] cpdef Column is_letter( Column input, @@ -75,3 +77,5 @@ cpdef Column porter_stemmer_measure(Column input): c_result = cpp_porter_stemmer_measure(input.view()) return Column.from_libcudf(move(c_result)) + +LetterType.__str__ = LetterType.__repr__ From 595c27ff4c66d60f7d6b8d9e881b3df5da6483c9 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 30 Jul 2025 09:42:11 -0700 Subject: [PATCH 025/366] Make nvCOMP ZLIB (de)compression available by default (#19528) closes https://github.com/rapidsai/cudf/issues/19489 nvCOMP ZLIB (de)compression has now been well-tested. Marking this as stable, which means that it's available by default (without the env var). Also expanded test coverage for the ZLIB decompression/compression and updated the user guide. 
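As a sketch of the user-facing effect (assuming the cudf Python ORC API and its `"ZLIB"` compression option; the file path is a placeholder), a ZLIB-compressed ORC round trip now goes through nvCOMP without setting any environment variable:

```python
import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# With this change, ZLIB (de)compression uses nvCOMP by default; previously
# the nvCOMP DEFLATE path had to be enabled via LIBCUDF_NVCOMP_POLICY.
df.to_orc("data.orc", compression="ZLIB")
assert cudf.read_orc("data.orc").equals(df)
```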
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/19528 --- cpp/src/io/comp/nvcomp_adapter.cpp | 12 +++------- cpp/tests/io/comp/comp_test.cpp | 10 +++++---- cpp/tests/io/orc_chunked_reader_test.cu | 1 + cpp/tests/io/orc_test.cpp | 2 ++ docs/cudf/source/user_guide/io/io.md | 30 ++++++++++++------------- 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 50f486d2f12..2887072fce6 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -522,13 +522,7 @@ std::optional is_compression_disabled_impl(compression_type compres feature_status_parameters params) { switch (compression) { - case compression_type::DEFLATE: { - if (not params.are_all_integrations_enabled) { - return "DEFLATE compression is experimental, you can enable it through " - "`LIBCUDF_NVCOMP_POLICY` environment variable."; - } - return std::nullopt; - } + case compression_type::DEFLATE: case compression_type::LZ4: case compression_type::SNAPPY: case compression_type::ZSTD: @@ -544,14 +538,14 @@ std::optional is_decompression_disabled_impl(compression_type compr feature_status_parameters params) { switch (compression) { - case compression_type::DEFLATE: case compression_type::GZIP: { if (not params.are_all_integrations_enabled) { - return "DEFLATE decompression is experimental, you can enable it through " + return "GZIP decompression is experimental, you can enable it through " "`LIBCUDF_NVCOMP_POLICY` environment variable."; } return std::nullopt; } + case compression_type::DEFLATE: case compression_type::LZ4: case compression_type::SNAPPY: case compression_type::ZSTD: { diff --git a/cpp/tests/io/comp/comp_test.cpp b/cpp/tests/io/comp/comp_test.cpp index 3dd1bcf1b56..70b6e78364b 100644 --- a/cpp/tests/io/comp/comp_test.cpp +++ b/cpp/tests/io/comp/comp_test.cpp @@ -340,8 +340,9 @@ TEST_F(NvcompConfigTest, Compression) auto const& comp_disabled = nvcomp::is_compression_disabled; EXPECT_FALSE(comp_disabled(compression_type::DEFLATE, {true, true})); - // all integrations enabled required - EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {false, true})); + EXPECT_FALSE(comp_disabled(compression_type::DEFLATE, {false, true})); + // stable integrations enabled required + EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {false, false})); EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {true, true})); EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {false, true})); @@ -360,8 +361,9 @@ TEST_F(NvcompConfigTest, Decompression) auto const& decomp_disabled = nvcomp::is_decompression_disabled; EXPECT_FALSE(decomp_disabled(compression_type::DEFLATE, {true, true})); - // all integrations enabled required - EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {false, true})); + EXPECT_FALSE(decomp_disabled(compression_type::DEFLATE, {false, true})); + // stable integrations enabled required + EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {false, false})); EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {true, true})); EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {false, true})); diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index d8d1d48551f..a0deb6c80e9 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1536,6 +1536,7 @@ 
INSTANTIATE_TEST_CASE_P(Nvcomp, ::testing::Values(cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, + cudf::io::compression_type::ZLIB, cudf::io::compression_type::ZSTD))); INSTANTIATE_TEST_CASE_P(DeviceInternal, diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index dbec125d22a..b0b83f7f419 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -2311,6 +2311,7 @@ INSTANTIATE_TEST_CASE_P(Nvcomp, ::testing::Values(cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, + cudf::io::compression_type::ZLIB, cudf::io::compression_type::ZSTD))); INSTANTIATE_TEST_CASE_P(DeviceInternal, @@ -2332,6 +2333,7 @@ INSTANTIATE_TEST_CASE_P(Nvcomp, ::testing::Values(cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, + cudf::io::compression_type::ZLIB, cudf::io::compression_type::ZSTD))); INSTANTIATE_TEST_CASE_P(DeviceInternal, diff --git a/docs/cudf/source/user_guide/io/io.md b/docs/cudf/source/user_guide/io/io.md index 600a97d6e16..b38c563b87e 100644 --- a/docs/cudf/source/user_guide/io/io.md +++ b/docs/cudf/source/user_guide/io/io.md @@ -153,21 +153,21 @@ If no value is set, behavior will be the same as the "STABLE" option. .. table:: Current policy for nvCOMP use for different types :widths: 20 20 20 20 20 20 20 20 20 20 - +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ - | | CSV | Parquet | JSON | ORC | AVRO | - +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ - | Compression Type | Writer | Reader | Writer | Reader | Writer¹ | Reader | Writer | Reader | Reader | - +=======================+========+========+==============+==============+=========+========+==============+==============+========+ - | Snappy | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | - +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ - | ZSTD | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | - +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ - | DEFLATE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | Experimental | Experimental | ❌ | - +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ - | LZ4 | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | - +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ - | GZIP | ❌ | ❌ | Experimental | Experimental | ❌ | ❌ | ❌ | ❌ | ❌ | - +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------------+ + | | CSV | Parquet | JSON | ORC | AVRO | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------------+ + | Compression Type | Writer | Reader | Writer | Reader | Writer¹ | Reader | Writer | Reader | Reader | + +=======================+========+========+==============+==============+=========+========+==============+==============+==============+ + | 
Snappy | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | Stable | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------------+ + | ZSTD | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------------+ + | DEFLATE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | Stable | Stable | ❌ | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------------+ + | LZ4 | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------------+ + | GZIP | ❌ | ❌ | ❌ | Experimental | ❌ | ❌ | ❌ | ❌ | ❌ | + +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------------+ ``` From 34e01b2ee68c65f851e5abe509b14d1c44302fa9 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 30 Jul 2025 11:39:41 -0700 Subject: [PATCH 026/366] Add nvtx ranges and minor fix for `lists` types in the next-gen parquet reader (#19493) Contributes to #19469 This PR introduces fixes to multiple minor bugs and edge cases in the next-gen parquet reader. These bugs were discovered while working on a new example called `hybrid_scan` in PR #19469. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/19493 --- .../experimental/hybrid_scan_helpers.cpp | 19 ++- .../parquet/experimental/hybrid_scan_impl.cpp | 71 +++++++++--- .../parquet/experimental/hybrid_scan_impl.hpp | 4 +- .../experimental/hybrid_scan_preprocess.cu | 2 + .../parquet/experimental/page_index_filter.cu | 109 ++++++++---------- .../io/experimental/hybrid_scan_common.cpp | 2 +- .../io/experimental/hybrid_scan_test.cpp | 75 ++++++++++++ cpp/tests/io/parquet_common.cpp | 6 +- 8 files changed, 198 insertions(+), 90 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index ccf1e60823b..3932e9bc15c 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -90,10 +90,13 @@ aggregate_reader_metadata::aggregate_reader_metadata(cudf::host_span #include +#include #include #include #include @@ -167,6 +169,8 @@ std::vector> hybrid_scan_reader_impl::filter_row_groups_w parquet_reader_options const& options, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); @@ -191,6 +195,8 @@ hybrid_scan_reader_impl::secondary_filters_byte_ranges( cudf::host_span const> row_group_indices, parquet_reader_options const& options) { + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Filter expression must not be empty"); @@ -224,6 +230,8 @@ hybrid_scan_reader_impl::filter_row_groups_with_dictionary_pages( parquet_reader_options const& options, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); + 
CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); @@ -294,6 +302,8 @@ std::vector> hybrid_scan_reader_impl::filter_row_groups_w CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); + CUDF_FUNC_RANGE(); + select_columns(read_columns_mode::FILTER_COLUMNS, options); table_metadata metadata; @@ -322,6 +332,8 @@ hybrid_scan_reader_impl::filter_data_pages_with_stats( CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); + CUDF_FUNC_RANGE(); + select_columns(read_columns_mode::FILTER_COLUMNS, options); table_metadata metadata; @@ -349,6 +361,8 @@ std::pair, std::vector> hybrid_scan_reader_impl::get_input_column_chunk_byte_ranges( cudf::host_span const> row_group_indices) const { + CUDF_FUNC_RANGE(); + // Descriptors for all the chunks that make up the selected columns auto const num_input_columns = _input_columns.size(); auto const num_row_groups = @@ -430,6 +444,8 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); + CUDF_FUNC_RANGE(); + reset_internal_state(); table_metadata metadata; @@ -450,7 +466,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( prepare_data(row_group_indices, std::move(column_chunk_buffers), data_page_mask, options); - return read_chunk_internal(read_columns_mode::FILTER_COLUMNS, row_mask); + return read_chunk_internal(read_mode::READ_ALL, read_columns_mode::FILTER_COLUMNS, row_mask); } table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( @@ -464,6 +480,8 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( CUDF_EXPECTS(row_mask.null_count() == 0, "Row mask must not have any nulls when materializing payload column"); + CUDF_FUNC_RANGE(); + reset_internal_state(); initialize_options(row_group_indices, options, stream); @@ -477,7 +495,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( prepare_data(row_group_indices, std::move(column_chunk_buffers), data_page_mask, options); - return read_chunk_internal(read_columns_mode::PAYLOAD_COLUMNS, row_mask); + return read_chunk_internal(read_mode::READ_ALL, read_columns_mode::PAYLOAD_COLUMNS, row_mask); } void hybrid_scan_reader_impl::reset_internal_state() @@ -495,6 +513,8 @@ void hybrid_scan_reader_impl::reset_internal_state() _input_pass_read_limit = 0; _output_chunk_read_limit = 0; _strings_to_categorical = false; + _reader_column_schema.reset(); + _expr_conv = named_to_reference_converter(std::nullopt, table_metadata{}); } void hybrid_scan_reader_impl::initialize_options( @@ -542,7 +562,7 @@ void hybrid_scan_reader_impl::prepare_data( template table_with_metadata hybrid_scan_reader_impl::read_chunk_internal( - read_columns_mode read_columns_mode, RowMaskView row_mask) + read_mode mode, read_columns_mode read_columns_mode, RowMaskView row_mask) { // If `_output_metadata` has been constructed, just copy it over. auto out_metadata = _output_metadata ? 
table_metadata{*_output_metadata} : table_metadata{}; @@ -555,8 +575,7 @@ table_with_metadata hybrid_scan_reader_impl::read_chunk_internal( // Copy number of total input row groups and number of surviving row groups from predicate // pushdown. out_metadata.num_input_row_groups = _file_itm_data.num_input_row_groups; - // Copy the number surviving row groups from each predicate pushdown only if the filter has - // value. + // Copy the number surviving row groups from each predicate pushdown only if the filter has value if (_expr_conv.get_converted_expr().has_value()) { out_metadata.num_row_groups_after_stats_filter = _file_itm_data.surviving_row_groups.after_stats_filter; @@ -566,6 +585,13 @@ table_with_metadata hybrid_scan_reader_impl::read_chunk_internal( // no work to do (this can happen on the first pass if we have no rows to read) if (!has_more_work()) { + // Check if number of rows per source should be included in output metadata. + if (include_output_num_rows_per_source()) { + // Empty dataframe case: Simply initialize to a list of zeros + out_metadata.num_rows_per_source = + std::vector(_file_itm_data.num_rows_per_source.size(), 0); + } + // Finalize output return finalize_output(read_columns_mode, out_metadata, out_columns, row_mask); } @@ -575,10 +601,10 @@ table_with_metadata hybrid_scan_reader_impl::read_chunk_internal( auto const& read_info = subpass.output_chunk_read_info[subpass.current_output_chunk]; // Allocate memory buffers for the output columns. - allocate_columns(read_mode::READ_ALL, read_info.skip_rows, read_info.num_rows); + allocate_columns(mode, read_info.skip_rows, read_info.num_rows); // Parse data into the output buffers. - decode_page_data(read_mode::READ_ALL, read_info.skip_rows, read_info.num_rows); + decode_page_data(mode, read_info.skip_rows, read_info.num_rows); // Create the final output cudf columns. for (size_t i = 0; i < _output_buffers.size(); ++i) { @@ -597,13 +623,30 @@ table_with_metadata hybrid_scan_reader_impl::read_chunk_internal( } // Only construct `out_metadata` if `_output_metadata` has not been cached. if (!_output_metadata) { - cudf::io::column_name_info& col_name = out_metadata.schema_info[i]; + column_name_info& col_name = out_metadata.schema_info[i]; out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream)); } else { out_columns.emplace_back(make_column(_output_buffers[i], nullptr, metadata, _stream)); } } + out_columns = + cudf::structs::detail::enforce_null_consistency(std::move(out_columns), _stream, _mr); + + // Check if number of rows per source should be included in output metadata. + if (include_output_num_rows_per_source()) { + // For chunked reading, compute the output number of rows per source + if (mode == read_mode::CHUNKED_READ) { + out_metadata.num_rows_per_source = + calculate_output_num_rows_per_source(read_info.skip_rows, read_info.num_rows); + } + // Simply move the number of rows per file if reading all at once + else { + // Move is okay here as we are reading in one go. + out_metadata.num_rows_per_source = std::move(_file_itm_data.num_rows_per_source); + } + } + // Add empty columns if needed. Filter output columns based on filter. 
return finalize_output(read_columns_mode, out_metadata, out_columns, row_mask); } @@ -619,19 +662,17 @@ table_with_metadata hybrid_scan_reader_impl::finalize_output( // read) for (size_t i = out_columns.size(); i < _output_buffers.size(); ++i) { if (!_output_metadata) { - cudf::io::column_name_info& col_name = out_metadata.schema_info[i]; - out_columns.emplace_back( - cudf::io::detail::empty_like(_output_buffers[i], &col_name, _stream, _mr)); + column_name_info& col_name = out_metadata.schema_info[i]; + out_columns.emplace_back(io::detail::empty_like(_output_buffers[i], &col_name, _stream, _mr)); } else { - out_columns.emplace_back( - cudf::io::detail::empty_like(_output_buffers[i], nullptr, _stream, _mr)); + out_columns.emplace_back(io::detail::empty_like(_output_buffers[i], nullptr, _stream, _mr)); } } if (!_output_metadata) { populate_metadata(out_metadata); // Finally, save the output table metadata into `_output_metadata` for reuse next time. - _output_metadata = std::make_unique(out_metadata); + _output_metadata = std::make_unique(out_metadata); } // advance output chunk/subpass/pass info for non-empty tables if and only if we are in bounds @@ -685,6 +726,8 @@ table_with_metadata hybrid_scan_reader_impl::finalize_output( void hybrid_scan_reader_impl::set_page_mask(cudf::host_span const> data_page_mask) { + CUDF_FUNC_RANGE(); + auto const& pass = _pass_itm_data; auto const& chunks = pass->chunks; diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index 7ebe4df865d..c6519200252 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -365,13 +365,15 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * This function is called internally and expects all preprocessing steps have already been done. 
* * @tparam RowMaskView View type of the row mask column + * @param[in] mode Read mode indicating if we are reading all at once or chunk by chunk * @param[in] read_columns_mode Read mode indicating if we are reading filter or payload columns * @param[in,out] row_mask Boolean column indicating which rows need to be read after page-pruning * for filter columns, or after materialize step for payload columns * @return The output table along with columns' metadata */ template - table_with_metadata read_chunk_internal(read_columns_mode read_columns_mode, + table_with_metadata read_chunk_internal(read_mode mode, + read_columns_mode read_columns_mode, RowMaskView row_mask); private: diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu b/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu index 94b58f89f92..9a707e6577d 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu +++ b/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu @@ -314,6 +314,8 @@ void hybrid_scan_reader_impl::update_row_mask(cudf::column_view in_row_mask, cudf::mutable_column_view out_row_mask, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); + auto const total_rows = static_cast(in_row_mask.size()); CUDF_EXPECTS(total_rows == out_row_mask.size(), diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index e3f1f043047..42a27e3af48 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -702,8 +703,12 @@ std::vector> aggregate_reader_metadata::compute_data_page_mask auto const has_page_index = compute_has_page_index(per_file_metadata, row_group_indices, output_column_schemas); - CUDF_EXPECTS(has_page_index, - "Data page mask computation requires the Parquet page index for all output columns"); + + // TODO: Don't use page pruning in case of lists and structs until we support them + if (not has_page_index) { + CUDF_LOG_WARN("Encountered missing Parquet page index for one or more output columns"); + return {}; // An empty data page mask indicates all pages are required + } // Compute page row counts, offsets, and column chunk page offsets for each column std::vector> page_row_counts; @@ -768,12 +773,7 @@ std::vector> aggregate_reader_metadata::compute_data_page_mask auto data_page_mask = std::vector>(); data_page_mask.reserve(num_columns); - std::atomic total_surviving_pages{0}; - - // Tasks to compute data page mask for each column - std::vector>> data_page_mask_tasks; - data_page_mask_tasks.reserve(num_columns); - auto streams = cudf::detail::fork_streams(stream, num_columns); + auto total_surviving_pages = size_t{0}; // For all columns, look up which pages contain at least one required row. i.e. 
// !validity_it[row_idx] or is_row_required[row_idx] satisfies, and add its byte range to the @@ -782,60 +782,49 @@ std::vector> aggregate_reader_metadata::compute_data_page_mask thrust::counting_iterator(0), thrust::counting_iterator(num_columns), [&](auto const col_idx) { - data_page_mask_tasks.emplace_back( - cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] { - // Construct a row indices mapping based on page row counts and offsets - auto const total_pages_in_this_column = page_row_counts[col_idx].size(); - - auto const page_indices = make_page_indices_async( - page_row_counts[col_idx], page_row_offsets[col_idx], total_rows, streams[col_idx]); - - // Device vector to hold page indices with at least one required row - rmm::device_uvector select_page_indices(total_rows, streams[col_idx], mr); - - // Copy page indices with at least one required row - auto const filtered_pages_end_iter = thrust::copy_if( - rmm::exec_policy_nosync(streams[col_idx]), - page_indices.begin(), - page_indices.end(), - thrust::counting_iterator(0), - select_page_indices.begin(), - is_row_required_fn{row_mask.nullable(), row_mask.null_mask(), row_mask.data()}); - - // Remove duplicate page indices across (presorted) rows - auto const filtered_uniq_page_end_iter = - thrust::unique(rmm::exec_policy_nosync(streams[col_idx]), - select_page_indices.begin(), - filtered_pages_end_iter); - - // Number of final filtered pages for this column - size_t const num_surviving_pages_this_column = - thrust::distance(select_page_indices.begin(), filtered_uniq_page_end_iter); - - total_surviving_pages.fetch_add(num_surviving_pages_this_column); - - // Copy the filtered page indices for this column to host - auto host_select_page_indices = cudf::detail::make_host_vector( - cudf::device_span{select_page_indices.data(), - num_surviving_pages_this_column}, - streams[col_idx]); - - // Vector to data page mask the this column - auto valid_pages = std::vector(total_pages_in_this_column, false); - std::for_each(host_select_page_indices.begin(), - host_select_page_indices.end(), - [&](auto const page_idx) { valid_pages[page_idx] = true; }); - - return valid_pages; - })); + // Construct a row indices mapping based on page row counts and offsets + auto const total_pages_in_this_column = page_row_counts[col_idx].size(); + + auto const page_indices = make_page_indices_async( + page_row_counts[col_idx], page_row_offsets[col_idx], total_rows, stream); + + // Device vector to hold page indices with at least one required row + rmm::device_uvector select_page_indices(total_rows, stream, mr); + + // Copy page indices with at least one required row + auto const filtered_pages_end_iter = thrust::copy_if( + rmm::exec_policy_nosync(stream), + page_indices.begin(), + page_indices.end(), + thrust::counting_iterator(0), + select_page_indices.begin(), + is_row_required_fn{row_mask.nullable(), row_mask.null_mask(), row_mask.data()}); + + // Remove duplicate page indices across (presorted) rows + auto const filtered_uniq_page_end_iter = thrust::unique( + rmm::exec_policy_nosync(stream), select_page_indices.begin(), filtered_pages_end_iter); + + // Number of final filtered pages for this column + size_t const num_surviving_pages_this_column = + thrust::distance(select_page_indices.begin(), filtered_uniq_page_end_iter); + + total_surviving_pages += num_surviving_pages_this_column; + + // Copy the filtered page indices for this column to host + auto host_select_page_indices = cudf::detail::make_host_vector( + 
cudf::device_span{select_page_indices.data(),
+                            num_surviving_pages_this_column},
+        stream);
+
+      // Vector for the data page mask of this column
+      auto valid_pages = std::vector(total_pages_in_this_column, false);
+      std::for_each(host_select_page_indices.begin(),
+                    host_select_page_indices.end(),
+                    [&](auto const page_idx) { valid_pages[page_idx] = true; });
+
+      data_page_mask.push_back(std::move(valid_pages));
     });

-  // Collect results from all tasks
-  std::transform(data_page_mask_tasks.begin(),
-                 data_page_mask_tasks.end(),
-                 std::back_inserter(data_page_mask),
-                 [](auto& task) { return std::move(task).get(); });
-
   // Total number of input pages across all columns
   auto const total_pages = std::accumulate(
     page_row_counts.cbegin(),
diff --git a/cpp/tests/io/experimental/hybrid_scan_common.cpp b/cpp/tests/io/experimental/hybrid_scan_common.cpp
index c2bdca7c0e2..1224d9d4988 100644
--- a/cpp/tests/io/experimental/hybrid_scan_common.cpp
+++ b/cpp/tests/io/experimental/hybrid_scan_common.cpp
@@ -89,7 +89,7 @@ cudf::test::strings_column_wrapper constant_strings(cudf::size_type value)
   auto elements =
     thrust::make_transform_iterator(thrust::make_constant_iterator(value), [](auto i) {
-      std::array buf;
+      std::array buf{};
       snprintf(buf.data(), buf.size(), "%04d", i);
       return std::string(buf.data());
     });
diff --git a/cpp/tests/io/experimental/hybrid_scan_test.cpp b/cpp/tests/io/experimental/hybrid_scan_test.cpp
index 7b62a2efa6f..316e55a02eb 100644
--- a/cpp/tests/io/experimental/hybrid_scan_test.cpp
+++ b/cpp/tests/io/experimental/hybrid_scan_test.cpp
@@ -362,3 +362,78 @@ TEST_F(HybridScanTest, PruneDataPagesOnlyAndScanAllColumns)
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected->select({1, 2}), read_payload_table->view());
   }
 }
+
+TEST_F(HybridScanTest, MaterializeListPayloadColumn)
+{
+  srand(0xc0ffee);
+  using T = uint32_t;
+
+  // Parquet buffer
+  std::vector parquet_buffer;
+  {
+    auto constexpr num_rows = num_ordered_rows;
+    // int32_t column
+    auto col0 = testdata::ascending();
+    // string column
+    auto col1 = testdata::ascending();
+    // list column
+    auto bools_iter =
+      cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
+    auto bools_col =
+      cudf::test::fixed_width_column_wrapper(bools_iter, bools_iter + num_rows);
+    auto offsets_iter = thrust::counting_iterator(0);
+    auto offsets_col =
+      cudf::test::fixed_width_column_wrapper(offsets_iter, offsets_iter + num_rows + 1);
+    auto col2 = cudf::make_lists_column(
+      num_rows, offsets_col.release(), bools_col.release(), 0, rmm::device_buffer{});
+
+    // Input table
+    auto table = cudf::table_view{{col0, col1, *col2}};
+    cudf::io::table_input_metadata expected_metadata(table);
+    expected_metadata.column_metadata[0].set_name("col0");
+    expected_metadata.column_metadata[1].set_name("col1");
+    expected_metadata.column_metadata[2].set_name("col2");
+
+    // Write to parquet buffer
+    cudf::io::parquet_writer_options out_opts =
+      cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&parquet_buffer}, table)
+        .metadata(std::move(expected_metadata))
+        .row_group_size_rows(page_size_for_ordered_tests)
+        .max_page_size_rows(page_size_for_ordered_tests / 5)
+        .compression(cudf::io::compression_type::AUTO)
+        .dictionary_policy(cudf::io::dictionary_policy::ALWAYS)
+        .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN);
+    cudf::io::write_parquet(out_opts);
+  }
+
+  // Filtering AST - table[0] < 100
+  auto constexpr num_filter_columns = 1;
+  auto literal_value = cudf::numeric_scalar(100);
+  auto literal = cudf::ast::literal(literal_value);
+  auto col_ref_0 = cudf::ast::column_name_reference("col0");
+  auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, literal);
+
+  auto stream = cudf::get_default_stream();
+  auto mr = cudf::get_current_device_resource_ref();
+  auto aligned_mr = rmm::mr::aligned_resource_adaptor(
+    cudf::get_current_device_resource(), bloom_filter_alignment);
+
+  // Read parquet using the hybrid scan reader
+  auto [read_filter_table, read_payload_table, read_filter_meta, read_payload_meta, row_mask] =
+    hybrid_scan(parquet_buffer, filter_expression, num_filter_columns, {}, stream, mr, aligned_mr);
+
+  CUDF_EXPECTS(read_filter_table->num_rows() == read_payload_table->num_rows(),
+               "Filter and payload tables should have the same number of rows");
+
+  // Check equivalence (equal without checking nullability) with the parquet file read with the
+  // original reader
+  {
+    cudf::io::parquet_reader_options const options =
+      cudf::io::parquet_reader_options::builder(
+        cudf::io::source_info(cudf::host_span(parquet_buffer.data(), parquet_buffer.size())))
+        .filter(filter_expression);
+    auto [expected_tbl, expected_meta] = cudf::io::read_parquet(options, stream);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({0}), read_filter_table->view());
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({1, 2}), read_payload_table->view());
+  }
+}
diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp
index 4a6a79c8f2a..aa0f5fafe4e 100644
--- a/cpp/tests/io/parquet_common.cpp
+++ b/cpp/tests/io/parquet_common.cpp
@@ -479,7 +479,7 @@ std::enable_if_t, cudf::test::strings_colum
 ascending()
 {
   auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    std::array buf;
+    std::array buf{};
     snprintf(buf.data(), buf.size(), "%09d", i);
     return std::string(buf.data());
   });
@@ -491,7 +491,7 @@ std::enable_if_t, cudf::test::strings_colum
 descending()
 {
   auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    std::array buf;
+    std::array buf{};
     snprintf(buf.data(), buf.size(), "%09d", static_cast(num_ordered_rows - i));
     return std::string(buf.data());
   });
@@ -503,7 +503,7 @@ std::enable_if_t, cudf::test::strings_colum
 unordered()
 {
   auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    std::array buf;
+    std::array buf{};
     snprintf(buf.data(), buf.size(), "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i));
     return std::string(buf.data());
   });

From aafc3f87f0415112970fb66155ef7574cefb3d44 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 30 Jul 2025 12:11:20 -0700
Subject: [PATCH 027/366] Derive and use page mask at subpass level for
 chunked reads (#19515)

Contributes to #19526

This PR implements deriving and using a page mask at the subpass level in the Parquet reader. This is to enable chunked reading in the next-gen reader.

Note: PR #19526 implements chunking in the next-gen parquet reader and tests this feature, so no new tests here.
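For reference, here is a minimal standalone sketch of the gather step this patch performs in `set_subpass_page_mask` (simplified host-side code, not the exact cudf implementation; per the diff, `page_src_index` maps each subpass page to its position in the pass):

```cpp
#include <cstddef>
#include <vector>

// Gather the subpass page mask from the pass-level page mask using the
// subpass -> pass page index map (names are illustrative only).
std::vector<bool> gather_subpass_page_mask(std::vector<bool> const& pass_page_mask,
                                           std::vector<std::size_t> const& page_src_index)
{
  std::vector<bool> subpass_page_mask(page_src_index.size());
  for (std::size_t i = 0; i < page_src_index.size(); ++i) {
    subpass_page_mask[i] = pass_page_mask[page_src_index[i]];
  }
  return subpass_page_mask;
}
```

When a pass contains only a single subpass, the pass-level mask can simply be moved over instead of gathered; the `set_subpass_page_mask` implementation in the diff below takes exactly that fast path.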
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19515 --- .../experimental/hybrid_scan_chunking.cu | 6 +-- .../parquet/experimental/hybrid_scan_impl.cpp | 49 +++++++++++++------ .../parquet/experimental/hybrid_scan_impl.hpp | 4 +- cpp/src/io/parquet/reader_impl.cpp | 15 +++--- cpp/src/io/parquet/reader_impl.hpp | 12 ++++- cpp/src/io/parquet/reader_impl_chunking.cu | 35 ++++++++++++- 6 files changed, 91 insertions(+), 30 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu index c54410de7fe..71f0533de31 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu +++ b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu @@ -50,6 +50,9 @@ void hybrid_scan_reader_impl::handle_chunking( if (!_pass_itm_data) { // setup the next pass setup_next_pass(std::move(column_chunk_buffers), options); + + // Must be called as soon as we create the pass + set_pass_page_mask(data_page_mask); } auto& pass = *_pass_itm_data; @@ -81,9 +84,6 @@ void hybrid_scan_reader_impl::handle_chunking( } } - // Must be called before `setup_next_subpass()` to select pages to decompress - set_page_mask(data_page_mask); - // setup the next sub pass setup_next_subpass(read_mode::READ_ALL); } diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index cbb794c0b90..48aa1d6c40c 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -504,7 +504,8 @@ void hybrid_scan_reader_impl::reset_internal_state() _file_preprocessed = false; _has_page_index = false; _pass_itm_data.reset(); - _page_mask.clear(); + _pass_page_mask.clear(); + _subpass_page_mask.clear(); _output_metadata.reset(); _options.timestamp_type = cudf::data_type{}; _options.num_rows = std::nullopt; @@ -724,19 +725,20 @@ table_with_metadata hybrid_scan_reader_impl::finalize_output( } } -void hybrid_scan_reader_impl::set_page_mask(cudf::host_span const> data_page_mask) +void hybrid_scan_reader_impl::set_pass_page_mask( + cudf::host_span const> data_page_mask) { CUDF_FUNC_RANGE(); auto const& pass = _pass_itm_data; auto const& chunks = pass->chunks; - _page_mask = cudf::detail::make_empty_host_vector(pass->pages.size(), _stream); + _pass_page_mask = cudf::detail::make_empty_host_vector(pass->pages.size(), _stream); auto const num_columns = _input_columns.size(); // Handle the empty page mask case if (data_page_mask.empty()) { - std::fill(_page_mask.begin(), _page_mask.end(), true); + std::fill(_pass_page_mask.begin(), _pass_page_mask.end(), true); return; } @@ -744,23 +746,38 @@ void hybrid_scan_reader_impl::set_page_mask(cudf::host_span co thrust::counting_iterator(0), thrust::counting_iterator(_input_columns.size()), [&](auto col_idx) { - auto const& col_page_mask = data_page_mask[col_idx]; - size_t num_inserted_pages = 0; + auto const& col_page_mask = data_page_mask[col_idx]; + size_t num_inserted_data_pages = 0; for (size_t chunk_idx = col_idx; chunk_idx < chunks.size(); chunk_idx += num_columns) { - if (chunks[chunk_idx].num_dict_pages > 0) { _page_mask.push_back(true); } - // Sanitize the column's page mask and insert - CUDF_EXPECTS(col_page_mask.size() >= num_inserted_pages + chunks[chunk_idx].num_data_pages, - "Encountered invalid data page mask size"); - _page_mask.insert( - 
_page_mask.end(), - col_page_mask.begin() + num_inserted_pages, - col_page_mask.begin() + num_inserted_pages + chunks[chunk_idx].num_data_pages); - num_inserted_pages += chunks[chunk_idx].num_data_pages; + // Insert a true value for each dictionary page + if (chunks[chunk_idx].num_dict_pages > 0) { _pass_page_mask.push_back(true); } + + // Number of data pages in this column chunk + auto const num_data_pages_this_col_chunk = chunks[chunk_idx].num_data_pages; + + // Make sure we have enough page mask for this column chunk + CUDF_EXPECTS( + col_page_mask.size() >= num_inserted_data_pages + num_data_pages_this_col_chunk, + "Encountered invalid data page mask size"); + + // Insert page mask for this column chunk + _pass_page_mask.insert( + _pass_page_mask.end(), + col_page_mask.begin() + num_inserted_data_pages, + col_page_mask.begin() + num_inserted_data_pages + num_data_pages_this_col_chunk); + + // Update the number of inserted data pages + num_inserted_data_pages += num_data_pages_this_col_chunk; } - CUDF_EXPECTS(num_inserted_pages == col_page_mask.size(), + // Make sure we inserted exactly the number of data pages for this column + CUDF_EXPECTS(num_inserted_data_pages == col_page_mask.size(), "Encountered mismatch in number of data pages and page mask size"); }); + + // Make sure we inserted exactly the number of pages for this pass + CUDF_EXPECTS(_pass_page_mask.size() == pass->pages.size(), + "Encountered mismatch in number of pass pages and page mask size"); } } // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index c6519200252..b2c0d40a7d1 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -217,11 +217,11 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { rmm::cuda_stream_view stream); /** - * @brief Set the mask for pages + * @brief Set the page mask for the pass pages * * @param data_page_mask Input data page mask from page-pruning step */ - void set_page_mask(cudf::host_span const> data_page_mask); + void set_pass_page_mask(cudf::host_span const> data_page_mask); /** * @brief Fill a BOOL8 row mask column with the specified value diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 854856a1b40..96aa416d27b 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -213,15 +213,15 @@ void reader_impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num_ } } - // Use the `_page_mask` from page pruning stage, if non empty, otherwise set all pages in this - // subpass to be decoded + // Use the `_subpass_page_mask` derived from `_page_mask` if non empty, otherwise set all pages in + // this subpass to be decoded auto host_page_mask = [&]() { - if (_page_mask.empty()) { + if (_subpass_page_mask.empty()) { auto page_mask = cudf::detail::make_host_vector(subpass.pages.size(), _stream); std::fill(page_mask.begin(), page_mask.end(), true); return page_mask; } else { - return _page_mask; + return _subpass_page_mask; } }(); @@ -527,7 +527,9 @@ void reader_impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num_ } reader_impl::reader_impl() - : _options{}, _page_mask{cudf::detail::make_host_vector(0, cudf::get_default_stream())} + : _options{}, + _pass_page_mask{cudf::detail::make_host_vector(0, cudf::get_default_stream())}, + _subpass_page_mask{cudf::detail::make_host_vector(0, 
cudf::get_default_stream())} { } @@ -557,7 +559,8 @@ reader_impl::reader_impl(std::size_t chunk_read_limit, options.get_num_rows(), options.get_row_groups()}, _sources{std::move(sources)}, - _page_mask{cudf::detail::make_host_vector(0, _stream)}, + _pass_page_mask{cudf::detail::make_host_vector(0, _stream)}, + _subpass_page_mask{cudf::detail::make_host_vector(0, _stream)}, _output_chunk_read_limit{chunk_read_limit}, _input_pass_read_limit{pass_read_limit} { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 3848eac8f6d..9312257a7ec 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -177,6 +177,11 @@ class reader_impl { */ void setup_next_subpass(read_mode mode); + /** + * @brief Copies over the relevant page mask information for the subpass + */ + void set_subpass_page_mask(); + /** * @brief Read a chunk of data and return an output table. * @@ -417,8 +422,11 @@ class reader_impl { // _output_buffers associated schema indices std::vector _output_column_schemas; - // Page mask for filtering out data pages - cudf::detail::host_vector _page_mask; + // Page mask for filtering out pass data pages + cudf::detail::host_vector _pass_page_mask; + + // Page mask for filtering out subpass data pages + cudf::detail::host_vector _subpass_page_mask; // _output_buffers associated metadata std::unique_ptr _output_metadata; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 0a46fb1877c..4b42ef36436 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -23,6 +23,7 @@ #include +#include #include #include @@ -337,6 +338,9 @@ void reader_impl::setup_next_subpass(read_mode mode) std::transform( h_spans.begin(), h_spans.end(), subpass.column_page_count.begin(), get_span_size{}); + // Set the page mask information for the subpass + set_subpass_page_mask(); + // decompress the data pages in this subpass; also decompress the dictionary pages in this pass, // if this is the first subpass in the pass if (pass.has_compressed_data) { @@ -344,7 +348,7 @@ void reader_impl::setup_next_subpass(read_mode mode) decompress_page_data(pass.chunks, is_first_subpass ? 
pass.pages : host_span{},
                         subpass.pages,
-                        _page_mask,
+                        _subpass_page_mask,
                         _stream,
                         _mr);
@@ -664,4 +668,33 @@ void reader_impl::compute_output_chunks_for_subpass()
     c_info, subpass.pages, subpass.skip_rows, subpass.num_rows, _output_chunk_read_limit, _stream);
 }

+void reader_impl::set_subpass_page_mask()
+{
+  auto const& pass    = _pass_itm_data;
+  auto const& subpass = pass->subpass;
+
+  // Create a host vector to store the subpass page mask
+  _subpass_page_mask = cudf::detail::make_host_vector(subpass->pages.size(), _stream);
+
+  // Fill with all true if no pass level page mask is available
+  if (_pass_page_mask.empty()) {
+    std::fill(_subpass_page_mask.begin(), _subpass_page_mask.end(), true);
+    return;
+  }
+
+  // If this is the only subpass, move the pass level page mask data as is
+  if (subpass->single_subpass) {
+    std::move(_pass_page_mask.begin(), _pass_page_mask.end(), _subpass_page_mask.begin());
+    return;
+  }
+
+  // Use the subpass-to-pass page index map to gather the subpass page mask from the pass level
+  // page mask
+  auto const host_page_src_index =
+    cudf::detail::make_host_vector(subpass->page_src_index, _stream);
+  thrust::gather(thrust::seq,
+                 host_page_src_index.begin(),
+                 host_page_src_index.end(),
+                 _pass_page_mask.begin(),
+                 _subpass_page_mask.begin());
+}
+
 } // namespace cudf::io::parquet::detail

From 5652d7e3cf1438f133cf28cfda0076a9d7d12205 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Wed, 30 Jul 2025 12:16:46 -0700
Subject: [PATCH 028/366] Fix logic for number of unique values generated by
 data profile in benchmarks (#19540)

This PR clarifies and corrects the usage of cardinality in the data generator, and restores the previous distribution of the data generated for cardinality `c = 0`.

When `c` is a positive integer less than the total number of values to be generated, the number of unique elements in the output is at most `c`. Otherwise, if the cardinality is greater than or equal to the total number of values required, `n`, then all `n` generated values are distinct, i.e. the output contains exactly `n` unique elements.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/19540
---
 cpp/benchmarks/common/generate_input.cu       | 2 +-
 cpp/benchmarks/common/generate_input.hpp      | 9 ++++++---
 cpp/benchmarks/join/generate_input_tables.cuh | 7 ++++---
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index 55c5133fa03..0488a59099d 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -549,7 +549,7 @@ struct create_rand_col_fn {
                                              thrust::minstd_rand& engine,
                                              cudf::size_type num_rows)
   {
-    if (profile.get_cardinality() == 0 || profile.get_cardinality() >= num_rows) {
+    if (profile.get_cardinality() >= num_rows) {
       return create_distinct_rows_column(profile, engine, num_rows);
     }
     return create_random_column(profile, engine, num_rows);
diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp
index 57834fd11d2..b900acc2f20 100644
--- a/cpp/benchmarks/common/generate_input.hpp
+++ b/cpp/benchmarks/common/generate_input.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -238,8 +238,11 @@ class data_profile {
   double bool_probability_true = 0.5;
   std::optional null_probability = 0.01;
-  cudf::size_type cardinality = 2000;
-  cudf::size_type avg_run_length = 4;
+  cudf::size_type cardinality =
+    2000;  /// Upper bound on the number of unique values generated if `0 <= cardinality < n`, where
+           /// `n` is the total number of values to be generated. If `cardinality >= n`, `n` unique
+           /// values of the requested data type are generated.
+  cudf::size_type avg_run_length = 4;

 public:
   template , std::unique_ptr> generate_i
   static_cast(build_table_numrows / multiplicity);
   double const null_probability = Nullable ? 0.3 : 0;
-  auto const profile =
-    data_profile{data_profile_builder().null_probability(null_probability).cardinality(0)};
+  auto const profile = data_profile{data_profile_builder()
+                                      .null_probability(null_probability)
+                                      .cardinality(unique_rows_build_table_numrows + 1)};
   auto unique_rows_build_table =
     create_random_table(key_types, row_count{unique_rows_build_table_numrows + 1}, profile, 1);
@@ -227,7 +228,7 @@
   auto probe_cols = probe_table->release();
   for (auto i = 0; i < num_payload_cols; i++) {
     build_cols.emplace_back(cudf::sequence(build_table_numrows, *init));
-    probe_cols.emplace_back(cudf::sequence(build_table_numrows, *init));
+    probe_cols.emplace_back(cudf::sequence(probe_table_numrows, *init));
   }

   return std::pair{std::make_unique(std::move(build_cols)),

From 5baa96b84657c01026c7c558bd55d13aacaa75e0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 30 Jul 2025 15:50:11 -0700
Subject: [PATCH 029/366] Construct more cuDF classic Columns with pylibcudf
 instead of using Buffers (#19535)

Towards https://github.com/rapidsai/cudf/issues/18726

To eventually back a cuDF column with a pylibcudf Column, we should always be constructing a cuDF Column from a pylibcudf Column.
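For illustration, a hedged C++ sketch of the pattern this targets, using the public libcudf factory `cudf::make_column_from_scalar` that `plc.Column.from_scalar` presumably wraps (the helper name is hypothetical): it materializes an all-null column of a given size from an invalid scalar, which is what the new `plc.Column.from_scalar(plc.Scalar.from_py(None, ...), row_count)` calls express in Python.

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/types.hpp>

#include <memory>

// Hypothetical helper (not from this PR): build an all-null INT32 column of
// `row_count` rows by replicating an invalid (null) scalar.
std::unique_ptr<cudf::column> make_all_null_int32(cudf::size_type row_count)
{
  cudf::numeric_scalar<int32_t> null_scalar{0, /*is_valid=*/false};
  return cudf::make_column_from_scalar(null_scalar, row_count);
}
```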
Currently there are places where we construct a cuDF column from the custom Buffer class via an RMM object, so converting this to just use pylibcudf instead Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19535 --- python/cudf/cudf/core/column/column.py | 26 ++++-- python/cudf/cudf/core/column/numerical.py | 4 +- python/cudf/cudf/core/multiindex.py | 4 +- .../cudf/pandas/scripts/conftest-patch.py | 91 +------------------ python/cudf/cudf/tests/test_multiindex.py | 9 ++ 5 files changed, 34 insertions(+), 100 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3eb2e5dbeb3..ac2d9d12752 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2573,19 +2573,25 @@ def column_empty( elif isinstance(dtype, CategoricalDtype): data = None children = ( - cudf.core.column.NumericalColumn( - data=as_buffer( - rmm.DeviceBuffer(size=row_count * SIZE_TYPE_DTYPE.itemsize) - ), - size=None, - dtype=SIZE_TYPE_DTYPE, + ColumnBase.from_pylibcudf( + plc.Column.from_scalar( + plc.Scalar.from_py( + None, dtype_to_pylibcudf_type(SIZE_TYPE_DTYPE) + ), + row_count, + ) ), ) - elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): - data = as_buffer(rmm.DeviceBuffer(size=0)) - children = (as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE),) else: - data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) + col = ColumnBase.from_pylibcudf( + plc.Column.from_scalar( + plc.Scalar.from_py(None, dtype_to_pylibcudf_type(dtype)), + row_count, + ) + )._with_type_metadata(dtype) + if for_numba: + col = col.set_mask(None) + return col if row_count > 0 and not for_numba: mask = as_buffer( diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 05120c6e687..2292b92e68e 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -15,7 +15,7 @@ import cudf from cudf.api.types import is_scalar from cudf.core._internals import binaryop -from cudf.core.buffer import acquire_spill_lock, as_buffer +from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column, column_empty from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype @@ -307,7 +307,7 @@ def nans_to_nulls(self: Self) -> Self: mask, _ = plc.transform.nans_to_nulls( self.to_pylibcudf(mode="read") ) - return self.set_mask(as_buffer(mask)) + return self.set_mask(mask) def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase: if isinstance(other, ColumnBase): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e69e45bf733..f3bdfce4321 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1654,7 +1654,9 @@ def droplevel(self, level=-1) -> Self | Index: new_data.pop(self._data.names[i]) if len(new_data) == 1: - return _index_from_data(new_data) + return Index._from_column( + next(iter(new_data.values())), name=new_names[0] + ) else: mi = type(self)._from_data(new_data) mi.names = new_names diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 3ef660fa614..9520f09945e 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -1551,7 
+1551,6 @@ def pytest_unconfigure(config): "tests/base/test_misc.py::test_memory_usage[datetime-tz]", "tests/base/test_misc.py::test_memory_usage[datetime]", "tests/base/test_misc.py::test_memory_usage[dict-series]", - "tests/base/test_misc.py::test_memory_usage[empty]", "tests/base/test_misc.py::test_memory_usage[float-series]", "tests/base/test_misc.py::test_memory_usage[float16-series]", "tests/base/test_misc.py::test_memory_usage[float32-series]", @@ -1582,7 +1581,6 @@ def pytest_unconfigure(config): "tests/base/test_misc.py::test_memory_usage[series-with-complex64-index]", "tests/base/test_misc.py::test_memory_usage[series-with-datetime-index]", "tests/base/test_misc.py::test_memory_usage[series-with-datetime-tz-index]", - "tests/base/test_misc.py::test_memory_usage[series-with-empty-index]", "tests/base/test_misc.py::test_memory_usage[series-with-float32-index]", "tests/base/test_misc.py::test_memory_usage[series-with-float64-index]", "tests/base/test_misc.py::test_memory_usage[series-with-int16-index]", @@ -3541,124 +3539,88 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_unary_ufunc_dunder_equivalence[uint8-absolute]", "tests/extension/test_arrow.py::TestArrowArray::test_unary_ufunc_dunder_equivalence[uint8-negative]", "tests/extension/test_arrow.py::TestArrowArray::test_unary_ufunc_dunder_equivalence[uint8-positive]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[bool-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[bool-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[bool-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[bool-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[bool-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[bool-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[double-frame-index1]", + "tests/extension/test_arrow.py::TestArrowArray::test_unstack[decimal128(7, 3)-frame-index2]", + "tests/extension/test_arrow.py::TestArrowArray::test_unstack[decimal128(7, 3)-frame-index3]", + "tests/extension/test_arrow.py::TestArrowArray::test_unstack[decimal128(7, 3)-series-index2]", + "tests/extension/test_arrow.py::TestArrowArray::test_unstack[decimal128(7, 3)-series-index3]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[double-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[double-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[double-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[double-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[double-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ms]-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ms]-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ms]-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ms]-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ms]-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ms]-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ns]-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ns]-frame-index2]", 
"tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ns]-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ns]-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ns]-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[ns]-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[s]-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[s]-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[s]-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[s]-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[s]-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[s]-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[us]-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[us]-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[us]-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[us]-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[us]-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[duration[us]-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[float-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[float-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[float-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[float-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[float-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[float-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int16-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int16-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int16-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int16-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int16-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int16-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int32-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int32-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int32-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int32-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int32-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int32-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int64-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int64-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int64-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int64-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int64-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int64-series-index3]", - 
"tests/extension/test_arrow.py::TestArrowArray::test_unstack[int8-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int8-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int8-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int8-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int8-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[int8-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ms]-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ms]-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ms]-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ms]-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ms]-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ms]-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ns]-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ns]-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ns]-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ns]-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ns]-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[ns]-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[s]-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[s]-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[s]-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[s]-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[s]-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[s]-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[us]-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[us]-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[us]-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[us]-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[us]-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[timestamp[us]-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint16-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint16-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint16-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint16-series-index1]", 
"tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint16-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint16-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint32-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint32-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint32-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint32-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint32-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint32-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint64-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint64-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint64-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint64-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint64-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint64-series-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint8-frame-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint8-frame-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint8-frame-index3]", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint8-series-index1]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint8-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint8-series-index3]", "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_and", @@ -4170,70 +4132,48 @@ def pytest_unconfigure(config): "tests/extension/test_masked.py::TestMaskedArrays::test_to_numpy[UInt32Dtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_to_numpy[UInt64Dtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_to_numpy[UInt8Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[BooleanDtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[BooleanDtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[BooleanDtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[BooleanDtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[BooleanDtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[BooleanDtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float32Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float32Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float32Dtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float32Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float32Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float32Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float64Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float64Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float64Dtype-frame-index3]", - 
"tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float64Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float64Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Float64Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int16Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int16Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int16Dtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int16Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int16Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int16Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int32Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int32Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int32Dtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int32Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int32Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int32Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int64Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int64Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int64Dtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int64Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int64Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int64Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int8Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int8Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int8Dtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int8Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int8Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[Int8Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt16Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt16Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt16Dtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt16Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt16Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt16Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt32Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt32Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt32Dtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt32Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt32Dtype-series-index2]", 
"tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt32Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt64Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt64Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt64Dtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt64Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt64Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt64Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt8Dtype-frame-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt8Dtype-frame-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt8Dtype-frame-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt8Dtype-series-index1]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt8Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt8Dtype-series-index3]", "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[BooleanDtype]", @@ -4383,10 +4323,8 @@ def pytest_unconfigure(config): "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_unique[object--Series]", "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_unique[object-unique-]", "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_unique[object-unique-Series]", - "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_unstack[float-frame-index1]", "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_unstack[float-frame-index2]", "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_unstack[float-frame-index3]", - "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_unstack[float-series-index1]", "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_unstack[float-series-index2]", "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_unstack[float-series-index3]", "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_value_counts[float-data_missing-True]", @@ -4923,16 +4861,12 @@ def pytest_unconfigure(config): "tests/extension/test_string.py::TestStringArray::test_unstack[python-True-series-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[python-True-series-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[python-True-series-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-False-frame-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-False-frame-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-False-frame-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-False-series-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-False-series-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-False-series-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-frame-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-frame-index2]", 
"tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-frame-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-series-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-series-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[pyarrow]-True-series-index3]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-False-frame-index0]", @@ -4951,28 +4885,20 @@ def pytest_unconfigure(config): "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-frame-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-frame-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-frame-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-series-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-series-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-series-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-frame-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-frame-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-frame-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-series-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-series-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-series-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-frame-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-frame-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-frame-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-series-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-series-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-series-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-frame-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-frame-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-frame-index3]", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-series-index1]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-series-index2]", "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-series-index3]", 
"tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[pyarrow-False]", @@ -6115,7 +6041,6 @@ def pytest_unconfigure(config): "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_stack_preserve_categorical_dtype[True-True]", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_stack_unstack[False]", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_stack_unstack[True]", - "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_bool", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_long_index", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_mixed_extension_types[0]", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_mixed_extension_types[1]", @@ -6154,11 +6079,8 @@ def pytest_unconfigure(config): "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_unstack_wrong_level_name[True-unstack]", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_categorical_columns", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_preserve_types", - "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_unobserved_keys[False]", - "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_unobserved_keys[True]", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_with_level_has_nan", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_with_missing_int_cast_to_float", - "tests/frame/test_stack_unstack.py::test_unstack_non_slice_like_blocks", "tests/frame/test_subclass.py::TestDataFrameSubclassing::test_asof", "tests/frame/test_subclass.py::TestDataFrameSubclassing::test_equals_subclass", "tests/frame/test_subclass.py::TestDataFrameSubclassing::test_frame_subclassing_and_slicing", @@ -7030,7 +6952,6 @@ def pytest_unconfigure(config): "tests/groupby/aggregate/test_aggregate.py::test_groupby_aggregate_empty_key_empty_return", "tests/groupby/aggregate/test_aggregate.py::test_groupby_aggregation_multi_level_column", "tests/groupby/aggregate/test_aggregate.py::test_multiindex_custom_func[0]", - "tests/groupby/aggregate/test_aggregate.py::test_nonagg_agg", "tests/groupby/aggregate/test_aggregate.py::test_order_aggregate_multiple_funcs", "tests/groupby/aggregate/test_cython.py::test_cython_agg_EA_known_dtypes[data1-prod-large_int-False]", "tests/groupby/aggregate/test_cython.py::test_cython_agg_EA_known_dtypes[data1-prod-large_int-True]", @@ -8728,7 +8649,6 @@ def pytest_unconfigure(config): "tests/indexes/test_old_base.py::TestBase::test_getitem_2d_deprecated[simple_index4]", "tests/indexes/test_old_base.py::TestBase::test_memory_usage[bool-dtype]", "tests/indexes/test_old_base.py::TestBase::test_memory_usage[categorical]", - "tests/indexes/test_old_base.py::TestBase::test_memory_usage[empty]", "tests/indexes/test_old_base.py::TestBase::test_memory_usage[float32]", "tests/indexes/test_old_base.py::TestBase::test_memory_usage[float64]", "tests/indexes/test_old_base.py::TestBase::test_memory_usage[int16]", @@ -8942,7 +8862,6 @@ def pytest_unconfigure(config): "tests/indexing/multiindex/test_loc.py::test_loc_getitem_drops_levels_for_one_row_dataframe", "tests/indexing/multiindex/test_loc.py::test_loc_getitem_duplicates_multiindex_missing_indexers[indexer6-pos6]", "tests/indexing/multiindex/test_loc.py::test_loc_getitem_index_differently_ordered_slice_none_duplicates[indexer0]", - 
"tests/indexing/multiindex/test_loc.py::test_loc_getitem_int", "tests/indexing/multiindex/test_loc.py::test_loc_getitem_int_raises_exception", "tests/indexing/multiindex/test_loc.py::test_loc_with_mi_indexer", "tests/indexing/multiindex/test_multiindex.py::TestMultiIndexBasic::test_rename_multiindex_with_duplicates", @@ -8973,7 +8892,6 @@ def pytest_unconfigure(config): "tests/indexing/multiindex/test_slice.py::TestMultiIndexSlicers::test_per_axis_per_level_setitem", "tests/indexing/multiindex/test_sorted.py::TestMultiIndexSorted::test_argsort_with_na", "tests/indexing/multiindex/test_sorted.py::TestMultiIndexSorted::test_frame_getitem_not_sorted", - "tests/indexing/multiindex/test_sorted.py::TestMultiIndexSorted::test_series_getitem_not_sorted", "tests/indexing/test_at.py::test_at_timezone", "tests/indexing/test_categorical.py::TestCategoricalIndex::test_ix_categorical_index", "tests/indexing/test_categorical.py::TestCategoricalIndex::test_loc_getitem_with_non_string_categories[False-idx_values17]", @@ -11200,7 +11118,6 @@ def pytest_unconfigure(config): "tests/resample/test_time_grouper.py::test_aggregate_normal[std]", "tests/resample/test_time_grouper.py::test_aggregate_normal[sum]", "tests/resample/test_time_grouper.py::test_aggregate_normal[var]", - "tests/resample/test_time_grouper.py::test_apply", "tests/resample/test_time_grouper.py::test_apply_iteration", "tests/resample/test_time_grouper.py::test_repr", "tests/resample/test_time_grouper.py::test_upsample_sum[prod-method_args4-expected_values4]", diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 4c1f5259c61..d274166d0d2 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2188,3 +2188,12 @@ def test_from_arrays_infer_names(idx): expected = pd.MultiIndex.from_arrays(arrays) result = cudf.MultiIndex.from_arrays(arrays) assert_eq(result, expected) + + +def test_multiindex_droplevel_single_level_none_names(): + data = [(1, 2), (3, 4)] + pidx = pd.MultiIndex.from_tuples(data, names=[None, None]) + gidx = cudf.MultiIndex.from_tuples(data, names=[None, None]) + result = gidx.droplevel(0) + expected = pidx.droplevel(0) + assert_eq(result, expected) From 7fddc5ff47b79cc59dea97908b81e452da73cfb0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 30 Jul 2025 15:51:26 -0700 Subject: [PATCH 030/366] Move test_series.py to new cudf classic test directory structure (#19485) Towards https://github.com/rapidsai/cudf/issues/9999 This splits `test_series.py` into a directory structure like * `test/series/indexing/*` for loc/iloc/`__getitem__`/`__setitem__` * `test/series/methods/*` for `Series.some_method` * `test/series/test_constructors.py` * `test/series/test_binops.py` Additionally * Adding more shared fixtures in conftest.py where applicable * Further simplify tests * Removed existing stub test files that are no longer applicable Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19485 --- python/cudf/cudf/tests/conftest.py | 71 + python/cudf/cudf/tests/reshape/__init__.py | 0 python/cudf/cudf/tests/reshape/test_concat.py | 184 + .../cudf/tests/series/indexing/__init__.py | 0 .../tests/series/indexing/test_setitem.py | 73 + .../cudf/tests/series/methods/__init__.py | 0 .../series/methods/test_add_prefix_suffix.py | 25 + .../cudf/tests/series/methods/test_astype.py | 103 + 
.../tests/series/methods/test_autocorr.py | 34 + .../cudf/tests/series/methods/test_between.py | 62 + .../tests/series/methods/test_contains.py | 19 + .../cudf/tests/series/methods/test_count.py | 28 + .../tests/series/methods/test_describe.py | 60 + .../cudf/tests/series/methods/test_diff.py | 61 + .../tests/series/methods/test_digitize.py | 36 + .../cudf/tests/series/methods/test_drop.py | 162 + .../tests/series/methods/test_duplicated.py | 29 + .../cudf/tests/series/methods/test_equals.py | 19 + .../cudf/tests/series/methods/test_explode.py | 31 + .../tests/series/methods/test_factorize.py | 31 + .../cudf/tests/series/methods/test_fillna.py | 40 + .../tests/series/methods/test_hash_values.py | 47 + .../cudf/tests/series/methods/test_isin.py | 170 + .../tests/series/methods/test_isna_notnull.py | 61 + .../tests/series/methods/test_memory_usage.py | 21 + .../cudf/tests/series/methods/test_mode.py | 70 + .../series/methods/test_nans_to_nulls.py | 12 + .../series/methods/test_nlargest_nsmallest.py | 18 + .../cudf/tests/series/methods/test_nunique.py | 25 + .../cudf/tests/series/methods/test_pipe.py | 64 + .../cudf/tests/series/methods/test_reindex.py | 19 + .../cudf/tests/series/methods/test_rename.py | 37 + .../tests/series/methods/test_reset_index.py | 139 + .../cudf/tests/series/methods/test_round.py | 149 + .../tests/series/methods/test_sort_index.py | 46 + .../cudf/tests/series/methods/test_squeeze.py | 21 + .../cudf/tests/series/methods/test_to_cupy.py | 43 + .../cudf/tests/series/methods/test_to_dict.py | 18 + .../tests/series/methods/test_to_pandas.py | 154 + .../cudf/tests/series/methods/test_tolist.py | 20 + .../tests/series/methods/test_transpose.py | 28 + .../tests/series/methods/test_truncate.py | 50 + .../cudf/tests/series/methods/test_update.py | 38 + .../tests/series/methods/test_value_counts.py | 146 + .../cudf/tests/series/methods/test_where.py | 25 + .../cudf/cudf/tests/series/test_accessors.py | 1 - .../cudf/cudf/tests/series/test_attributes.py | 144 +- .../tests/series/test_binary_operations.py | 1 - python/cudf/cudf/tests/series/test_binops.py | 35 + .../cudf/cudf/tests/series/test_categorial.py | 1 - .../cudf/cudf/tests/series/test_combining.py | 1 - .../cudf/tests/series/test_computation.py | 1 - .../cudf/tests/series/test_constructing.py | 12 - .../cudf/tests/series/test_constructors.py | 637 ++++ .../tests/series/test_function_application.py | 1 - .../cudf/cudf/tests/series/test_indexing.py | 1 - .../tests/series/test_io_serialization.py | 1 - python/cudf/cudf/tests/series/test_missing.py | 1 - .../cudf/cudf/tests/series/test_reshaping.py | 1 - .../cudf/cudf/tests/series/test_selecting.py | 1 - python/cudf/cudf/tests/series/test_sorting.py | 1 - .../cudf/cudf/tests/series/test_timeseries.py | 1 - python/cudf/cudf/tests/test_series.py | 3111 ----------------- 63 files changed, 3304 insertions(+), 3137 deletions(-) create mode 100644 python/cudf/cudf/tests/reshape/__init__.py create mode 100644 python/cudf/cudf/tests/reshape/test_concat.py create mode 100644 python/cudf/cudf/tests/series/indexing/__init__.py create mode 100644 python/cudf/cudf/tests/series/indexing/test_setitem.py create mode 100644 python/cudf/cudf/tests/series/methods/__init__.py create mode 100644 python/cudf/cudf/tests/series/methods/test_add_prefix_suffix.py create mode 100644 python/cudf/cudf/tests/series/methods/test_astype.py create mode 100644 python/cudf/cudf/tests/series/methods/test_autocorr.py create mode 100644 python/cudf/cudf/tests/series/methods/test_between.py create mode 100644 
python/cudf/cudf/tests/series/methods/test_contains.py create mode 100644 python/cudf/cudf/tests/series/methods/test_count.py create mode 100644 python/cudf/cudf/tests/series/methods/test_describe.py create mode 100644 python/cudf/cudf/tests/series/methods/test_diff.py create mode 100644 python/cudf/cudf/tests/series/methods/test_digitize.py create mode 100644 python/cudf/cudf/tests/series/methods/test_drop.py create mode 100644 python/cudf/cudf/tests/series/methods/test_duplicated.py create mode 100644 python/cudf/cudf/tests/series/methods/test_equals.py create mode 100644 python/cudf/cudf/tests/series/methods/test_explode.py create mode 100644 python/cudf/cudf/tests/series/methods/test_factorize.py create mode 100644 python/cudf/cudf/tests/series/methods/test_fillna.py create mode 100644 python/cudf/cudf/tests/series/methods/test_hash_values.py create mode 100644 python/cudf/cudf/tests/series/methods/test_isin.py create mode 100644 python/cudf/cudf/tests/series/methods/test_isna_notnull.py create mode 100644 python/cudf/cudf/tests/series/methods/test_memory_usage.py create mode 100644 python/cudf/cudf/tests/series/methods/test_mode.py create mode 100644 python/cudf/cudf/tests/series/methods/test_nans_to_nulls.py create mode 100644 python/cudf/cudf/tests/series/methods/test_nlargest_nsmallest.py create mode 100644 python/cudf/cudf/tests/series/methods/test_nunique.py create mode 100644 python/cudf/cudf/tests/series/methods/test_pipe.py create mode 100644 python/cudf/cudf/tests/series/methods/test_reindex.py create mode 100644 python/cudf/cudf/tests/series/methods/test_rename.py create mode 100644 python/cudf/cudf/tests/series/methods/test_reset_index.py create mode 100644 python/cudf/cudf/tests/series/methods/test_round.py create mode 100644 python/cudf/cudf/tests/series/methods/test_sort_index.py create mode 100644 python/cudf/cudf/tests/series/methods/test_squeeze.py create mode 100644 python/cudf/cudf/tests/series/methods/test_to_cupy.py create mode 100644 python/cudf/cudf/tests/series/methods/test_to_dict.py create mode 100644 python/cudf/cudf/tests/series/methods/test_to_pandas.py create mode 100644 python/cudf/cudf/tests/series/methods/test_tolist.py create mode 100644 python/cudf/cudf/tests/series/methods/test_transpose.py create mode 100644 python/cudf/cudf/tests/series/methods/test_truncate.py create mode 100644 python/cudf/cudf/tests/series/methods/test_update.py create mode 100644 python/cudf/cudf/tests/series/methods/test_value_counts.py create mode 100644 python/cudf/cudf/tests/series/methods/test_where.py delete mode 100644 python/cudf/cudf/tests/series/test_accessors.py delete mode 100644 python/cudf/cudf/tests/series/test_binary_operations.py create mode 100644 python/cudf/cudf/tests/series/test_binops.py delete mode 100644 python/cudf/cudf/tests/series/test_categorial.py delete mode 100644 python/cudf/cudf/tests/series/test_combining.py delete mode 100644 python/cudf/cudf/tests/series/test_computation.py delete mode 100644 python/cudf/cudf/tests/series/test_constructing.py create mode 100644 python/cudf/cudf/tests/series/test_constructors.py delete mode 100644 python/cudf/cudf/tests/series/test_function_application.py delete mode 100644 python/cudf/cudf/tests/series/test_indexing.py delete mode 100644 python/cudf/cudf/tests/series/test_io_serialization.py delete mode 100644 python/cudf/cudf/tests/series/test_missing.py delete mode 100644 python/cudf/cudf/tests/series/test_reshaping.py delete mode 100644 python/cudf/cudf/tests/series/test_selecting.py delete mode 100644 
python/cudf/cudf/tests/series/test_sorting.py delete mode 100644 python/cudf/cudf/tests/series/test_timeseries.py delete mode 100644 python/cudf/cudf/tests/test_series.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 1d27963a903..0a57efe2fbd 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -246,6 +246,47 @@ def numeric_types_as_str(request): return request.param +@pytest.fixture( + params=signed_integer_types + + unsigned_integer_types + + float_types + + bool_types +) +def numeric_and_bool_types_as_str(request): + """ + - "int8", "int16", "int32", "int64" + - "uint8", "uint16", "uint32", "uint64" + - "float32", "float64" + - "bool" + """ + return request.param + + +@pytest.fixture(params=datetime_types) +def datetime_types_as_str(request): + """ + - "datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]" + """ + return request.param + + +@pytest.fixture(params=timedelta_types) +def timedelta_types_as_str(request): + """ + - "timedelta64[ns]", "timedelta64[us]", "timedelta64[ms]", "timedelta64[s]" + """ + return request.param + + +@pytest.fixture(params=datetime_types + timedelta_types) +def temporal_types_as_str(request): + """ + - "datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]" + - "timedelta64[ns]", "timedelta64[us]", "timedelta64[ms]", "timedelta64[s]" + """ + return request.param + + @pytest.fixture( params=signed_integer_types + unsigned_integer_types @@ -268,3 +309,33 @@ def all_supported_types_as_str(request): - "bool" """ return request.param + + +@pytest.fixture(params=[True, False]) +def dropna(request): + """Param for `dropna` argument""" + return request.param + + +@pytest.fixture(params=[True, False, None]) +def nan_as_null(request): + """Param for `nan_as_null` argument""" + return request.param + + +@pytest.fixture(params=[True, False]) +def inplace(request): + """Param for `inplace` argument""" + return request.param + + +@pytest.fixture(params=[True, False]) +def ignore_index(request): + """Param for `ignore_index` argument""" + return request.param + + +@pytest.fixture(params=[True, False]) +def ascending(request): + """Param for `ascending` argument""" + return request.param diff --git a/python/cudf/cudf/tests/reshape/__init__.py b/python/cudf/cudf/tests/reshape/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/reshape/test_concat.py b/python/cudf/cudf/tests/reshape/test_concat.py new file mode 100644 index 00000000000..8da43038159 --- /dev/null +++ b/python/cudf/cudf/tests/reshape/test_concat.py @@ -0,0 +1,184 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
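+# Concatenation tests for cudf.Series: plain and indexed numeric/string
+# inputs, mixed-dtype error handling, and repeated concatenation onto
+# existing device buffers.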
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [1, -2, 3, -4], + [1.0, 12.221, 12.34, 13.324, 324.3242], + ], +) +@pytest.mark.parametrize( + "others", + [ + [-10, 11, -12, 13], + [0.1, 0.002, 324.2332, 0.2342], + ], +) +def test_series_concat_basic(data, others, ignore_index): + psr = pd.Series(data) + gsr = cudf.Series(data) + + other_ps = pd.Series(others) + other_gs = cudf.Series(others) + + expected = pd.concat([psr, other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "123"], + ["a"], + ], +) +@pytest.mark.parametrize( + "others", + [ + ["abc", "123"], + ["a"], + ["+", "-", "!", "_", "="], + ], +) +def test_series_concat_basic_str(data, others, ignore_index): + psr = pd.Series(data) + gsr = cudf.Series(data) + + other_ps = pd.Series(others) + other_gs = cudf.Series(others) + + expected = pd.concat([psr, other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + pd.Series(["abc", "123"], index=[10, 20]), + pd.Series(["a"], index=[10]), + ], +) +@pytest.mark.parametrize( + "others", + [ + pd.Series(["abc", "123"], index=[50, 20]), + pd.Series(["a"], index=[11]), + pd.Series(["+", "-", "!", "_", "="], index=[12, 13, 14, 15, 16]), + ], +) +def test_series_concat_series_with_index(data, others, ignore_index): + psr = pd.Series(data) + gsr = cudf.Series(data) + + other_ps = others + other_gs = cudf.from_pandas(others) + + expected = pd.concat([psr, other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) + + assert_eq(expected, actual) + + +def test_series_concat_error_mixed_types(): + gsr = cudf.Series([1, 2, 3, 4]) + other = cudf.Series(["a", "b", "c", "d"]) + + with pytest.raises( + TypeError, + match="cudf does not support mixed types, please type-cast " + "both series to same dtypes.", + ): + cudf.concat([gsr, other]) + + with pytest.raises( + TypeError, + match="cudf does not support mixed types, please type-cast " + "both series to same dtypes.", + ): + cudf.concat([gsr, gsr, other, gsr, other]) + + +@pytest.mark.parametrize( + "data", + [ + pd.Series([-1, 2, -3, 4], index=["a", "b", "c", "d"]), + pd.Series( + [1.0, 12.221, 12.34, 13.324, 324.3242], + index=[ + "float one", + "float two", + "float three", + "float four", + "float five", + ], + ), + ], +) +@pytest.mark.parametrize( + "others", + [ + [ + pd.Series([-10, 11, 12, 13], index=["a", "b", "c", "d"]), + pd.Series([12, 14, -15, 27], index=["d", "e", "z", "x"]), + ], + [ + pd.Series( + [0.1, 0.002, 324.2332, 0.2342], index=["-", "+", "%", "#"] + ), + pd.Series([12, 14, 15, 27], index=["d", "e", "z", "x"]), + ] + * 3, + ], +) +def test_series_concat_list_series_with_index(data, others, ignore_index): + psr = pd.Series(data) + gsr = cudf.Series(data) + + other_ps = others + other_gs = [cudf.from_pandas(obj) for obj in others] + + expected = pd.concat([psr, *other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, *other_gs], ignore_index=ignore_index) + + assert_eq(expected, actual) + + +def test_series_concat_existing_buffers(): + a1 = np.arange(10, dtype=np.float64) + gs = cudf.Series(a1) + + # Add new buffer + a2 = cudf.Series(np.arange(5)) + gs = cudf.concat([gs, a2]) + assert len(gs) == 15 + 
np.testing.assert_equal(gs.to_numpy(), np.hstack([a1, a2.to_numpy()])) + + # Ensure appending to previous buffer + a3 = cudf.Series(np.arange(3)) + gs = cudf.concat([gs, a3]) + assert len(gs) == 18 + a4 = np.hstack([a1, a2.to_numpy(), a3.to_numpy()]) + np.testing.assert_equal(gs.to_numpy(), a4) + + # Appending different dtype + a5 = cudf.Series(np.array([1, 2, 3], dtype=np.int32)) + a6 = cudf.Series(np.array([4.5, 5.5, 6.5], dtype=np.float64)) + gs = cudf.concat([a5, a6]) + np.testing.assert_equal( + gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()]) + ) + gs = cudf.concat([cudf.Series(a6), a5]) + np.testing.assert_equal( + gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) + ) diff --git a/python/cudf/cudf/tests/series/indexing/__init__.py b/python/cudf/cudf/tests/series/indexing/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/series/indexing/test_setitem.py b/python/cudf/cudf/tests/series/indexing/test_setitem.py new file mode 100644 index 00000000000..88a014191bc --- /dev/null +++ b/python/cudf/cudf/tests/series/indexing/test_setitem.py @@ -0,0 +1,73 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +def test_fill_new_category(): + gs = cudf.Series(pd.Categorical(["a", "b", "c"])) + with pytest.raises(TypeError): + gs[0:1] = "d" + + +@pytest.mark.parametrize("dtype", ["int64", "float64"]) +@pytest.mark.parametrize("bool_scalar", [True, False]) +def test_set_bool_error(dtype, bool_scalar): + sr = cudf.Series([1, 2, 3], dtype=dtype) + psr = sr.to_pandas(nullable=True) + + assert_exceptions_equal( + lfunc=sr.__setitem__, + rfunc=psr.__setitem__, + lfunc_args_and_kwargs=([bool_scalar],), + rfunc_args_and_kwargs=([bool_scalar],), + ) + + +@pytest.mark.parametrize( + "data", [[0, 1, 2], ["a", "b", "c"], [0.324, 32.32, 3243.23]] +) +def test_series_setitem_nat_with_non_datetimes(data): + s = cudf.Series(data) + with pytest.raises(TypeError): + s[0] = cudf.NaT + + +def test_series_string_setitem(): + gs = cudf.Series(["abc", "def", "ghi", "xyz", "pqr"]) + ps = gs.to_pandas() + + gs[0] = "NaT" + gs[1] = "NA" + gs[2] = "" + gs[3] = "NaN" + + ps[0] = "NaT" + ps[1] = "NA" + ps[2] = "" + ps[3] = "NaN" + + assert_eq(gs, ps) + + +def test_series_error_nan_non_float_dtypes(): + s = cudf.Series(["a", "b", "c"]) + with pytest.raises(TypeError): + s[0] = np.nan + + s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + with pytest.raises(TypeError): + s[0] = np.nan + + +def test_series_setitem_mixed_bool_dtype(): + s = cudf.Series([True, False, True]) + with pytest.raises(TypeError): + s[0] = 10 diff --git a/python/cudf/cudf/tests/series/methods/__init__.py b/python/cudf/cudf/tests/series/methods/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/series/methods/test_add_prefix_suffix.py b/python/cudf/cudf/tests/series/methods/test_add_prefix_suffix.py new file mode 100644 index 00000000000..1483968d100 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_add_prefix_suffix.py @@ -0,0 +1,25 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
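+# Checks that Series.add_prefix and Series.add_suffix match their pandas
+# counterparts.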
+ + +import cudf +from cudf.testing import assert_eq + + +def test_series_add_prefix(): + cd_s = cudf.Series([1, 2, 3, 4]) + pd_s = cd_s.to_pandas() + + got = cd_s.add_prefix("item_") + expected = pd_s.add_prefix("item_") + + assert_eq(got, expected) + + +def test_series_add_suffix(): + cd_s = cudf.Series([1, 2, 3, 4]) + pd_s = cd_s.to_pandas() + + got = cd_s.add_suffix("_item") + expected = pd_s.add_suffix("_item") + + assert_eq(got, expected) diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py b/python/cudf/cudf/tests/series/methods/test_astype.py new file mode 100644 index 00000000000..6825af8442f --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -0,0 +1,103 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_series_typecast_to_object_error(): + actual = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(TypeError): + actual.astype(object) + with pytest.raises(TypeError): + actual.astype(np.dtype("object")) + new_series = actual.astype("str") + assert new_series[0] == "1970-01-01 00:00:00.000000001" + + +def test_series_typecast_to_object(): + actual = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + with cudf.option_context("mode.pandas_compatible", False): + new_series = actual.astype(object) + assert new_series[0] == "1970-01-01 00:00:00.000000001" + new_series = actual.astype(np.dtype("object")) + assert new_series[0] == "1970-01-01 00:00:00.000000001" + + +@pytest.mark.parametrize( + "dtype", + [ + pd.ArrowDtype(pa.int8()), + pd.ArrowDtype(pa.int16()), + pd.ArrowDtype(pa.int32()), + pd.ArrowDtype(pa.int64()), + pd.ArrowDtype(pa.uint8()), + pd.ArrowDtype(pa.uint16()), + pd.ArrowDtype(pa.uint32()), + pd.ArrowDtype(pa.uint64()), + pd.ArrowDtype(pa.float32()), + pd.ArrowDtype(pa.float64()), + pd.Int8Dtype(), + pd.Int16Dtype(), + pd.Int32Dtype(), + pd.Int64Dtype(), + pd.UInt8Dtype(), + pd.UInt16Dtype(), + pd.UInt32Dtype(), + pd.UInt64Dtype(), + pd.Float32Dtype(), + pd.Float64Dtype(), + ], +) +@pytest.mark.parametrize("klass", [cudf.Series, cudf.DataFrame, cudf.Index]) +@pytest.mark.parametrize("kind", [lambda x: x, str], ids=["obj", "string"]) +def test_astype_pandas_nullable_pandas_compat(dtype, klass, kind): + ser = klass([1, 2, 3]) + with cudf.option_context("mode.pandas_compatible", True): + actual = ser.astype(kind(dtype)) + expected = klass([1, 2, 3], dtype=kind(dtype)) + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "type1", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "type2", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "as_dtype", [lambda x: x, cudf.dtype], ids=["string", "object"] +) +@pytest.mark.parametrize("copy", [True, False]) +def test_empty_astype_always_castable(type1, type2, as_dtype, copy): + ser = cudf.Series([], dtype=as_dtype(type1)) + result = ser.astype(as_dtype(type2), copy=copy) + expected = cudf.Series([], dtype=as_dtype(type2)) + assert_eq(result, expected) + if not copy and cudf.dtype(type1) == cudf.dtype(type2): + assert ser._column is result._column + else: + assert ser._column is not result._column diff --git a/python/cudf/cudf/tests/series/methods/test_autocorr.py 
b/python/cudf/cudf/tests/series/methods/test_autocorr.py new file mode 100644 index 00000000000..4e7b6ad8e19 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_autocorr.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import ( + expect_warning_if, +) + + +@pytest.mark.parametrize( + "data", + [ + [0.25, 0.5, 0.2, -0.05], + [0, 1, 2, np.nan, 4, cudf.NA, 6], + ], +) +@pytest.mark.parametrize("lag", [1, 4]) +def test_autocorr(data, lag): + cudf_series = cudf.Series(data) + psr = cudf_series.to_pandas() + + cudf_corr = cudf_series.autocorr(lag=lag) + + # autocorrelation is undefined (nan) for less than two entries, but pandas + # short-circuits when there are 0 entries and bypasses the numpy function + # call that generates an error. + num_both_valid = (psr.notna() & psr.shift(lag).notna()).sum() + with expect_warning_if(num_both_valid == 1, RuntimeWarning): + pd_corr = psr.autocorr(lag=lag) + + assert_eq(pd_corr, cudf_corr) diff --git a/python/cudf/cudf/tests/series/methods/test_between.py b/python/cudf/cudf/tests/series/methods/test_between.py new file mode 100644 index 00000000000..c3bb486e00c --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_between.py @@ -0,0 +1,62 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture(params=["both", "neither", "left", "right"]) +def inclusive(request): + return request.param + + +@pytest.mark.parametrize( + "data,left,right", + [ + ([0, 1, 2, 3, 4, 5, 10], 0, 5), + ([0, 1, 2, 3, 4, 5, 10], 10, 1), + ([0, 1, 2, 3, 4, 5], [0, 10, 11] * 2, [1, 2, 5] * 2), + (["a", "few", "set", "of", "strings", "xyz", "abc"], "banana", "few"), + (["a", "few", "set", "of", "strings", "xyz", "abc"], "phone", "hello"), + ( + ["a", "few", "set", "of", "strings", "xyz", "abc"], + ["a", "hello", "rapids", "ai", "world", "chars", "strs"], + ["yes", "no", "hi", "bye", "test", "pass", "fail"], + ), + ([0, 1, 2, np.nan, 4, np.nan, 10], 10, 1), + ], +) +def test_series_between(data, left, right, inclusive): + ps = pd.Series(data) + gs = cudf.from_pandas(ps, nan_as_null=False) + + expected = ps.between(left, right, inclusive=inclusive) + actual = gs.between(left, right, inclusive=inclusive) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data,left,right", + [ + ([0, 1, 2, None, 4, 5, 10], 0, 5), + ([0, 1, 2, 3, None, 5, 10], 10, 1), + ([None, 1, 2, 3, 4, None], [0, 10, 11] * 2, [1, 2, 5] * 2), + ( + ["a", "few", "set", None, "strings", "xyz", "abc"], + ["a", "hello", "rapids", "ai", "world", "chars", "strs"], + ["yes", "no", "hi", "bye", "test", "pass", "fail"], + ), + ], +) +def test_series_between_with_null(data, left, right, inclusive): + gs = cudf.Series(data) + ps = gs.to_pandas(nullable=True) + + expected = ps.between(left, right, inclusive=inclusive) + actual = gs.between(left, right, inclusive=inclusive) + + assert_eq(expected, actual.to_pandas(nullable=True)) diff --git a/python/cudf/cudf/tests/series/methods/test_contains.py b/python/cudf/cudf/tests/series/methods/test_contains.py new file mode 100644 index 00000000000..a80502b67e5 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_contains.py @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
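+# Verifies the `in` operator on a Series against pandas; membership is
+# tested against the index, not the values.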
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("data", [[True, False, None], [10, 200, 300]]) +@pytest.mark.parametrize("index", [None, [10, 20, 30]]) +def test_series_contains(data, index): + ps = pd.Series(data, index=index) + gs = cudf.Series(data, index=index) + + assert_eq(1 in ps, 1 in gs) + assert_eq(10 in ps, 10 in gs) + assert_eq(True in ps, True in gs) + assert_eq(False in ps, False in gs) diff --git a/python/cudf/cudf/tests/series/methods/test_count.py b/python/cudf/cudf/tests/series/methods/test_count.py new file mode 100644 index 00000000000..6dd5e6994ff --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_count.py @@ -0,0 +1,28 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_series_count_invalid_param(): + s = cudf.Series([], dtype="float64") + with pytest.raises(TypeError): + s.count(skipna=True) + + +def test_series_dataframe_count_float(): + gs = cudf.Series([1, 2, 3, None, np.nan, 10], nan_as_null=False) + ps = cudf.Series([1, 2, 3, None, np.nan, 10]) + + with cudf.option_context("mode.pandas_compatible", True): + assert_eq(ps.count(), gs.count()) + assert_eq(ps.to_frame().count(), gs.to_frame().count()) + with cudf.option_context("mode.pandas_compatible", False): + assert_eq(gs.count(), gs.to_pandas(nullable=True).count()) + assert_eq( + gs.to_frame().count(), + gs.to_frame().to_pandas(nullable=True).count(), + ) diff --git a/python/cudf/cudf/tests/series/methods/test_describe.py b/python/cudf/cudf/tests/series/methods/test_describe.py new file mode 100644 index 00000000000..2936fa83aba --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_describe.py @@ -0,0 +1,60 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
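+# Compares Series.describe with pandas across numeric, datetime/timedelta,
+# string, boolean, and categorical inputs.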
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_series_describe_numeric(numeric_types_as_str): + ps = pd.Series([0, 1, 2, 3, 1, 2, 3], dtype=numeric_types_as_str) + gs = cudf.from_pandas(ps) + actual = gs.describe() + expected = ps.describe() + + assert_eq(expected, actual, check_dtype=True) + + +def test_series_describe_temporal(temporal_types_as_str, request): + if "ms" in temporal_types_as_str: + request.applymarker( + pytest.mark.xfail( + reason=f"string formatting of {temporal_types_as_str} incorrect in cuDF" + ) + ) + gs = cudf.Series([0, 1, 2, 3, 1, 2, 3], dtype=temporal_types_as_str) + ps = gs.to_pandas() + + expected = ps.describe() + actual = gs.describe() + + assert_eq(expected.astype("str"), actual) + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series(["a", "b", "c", "d", "e", "a"]), + pd.Series([True, False, True, True, False]), + pd.Series([], dtype="str"), + pd.Series(["a", "b", "c", "a"], dtype="category"), + pd.Series(["d", "e", "f"], dtype="category"), + pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"])), + pd.Series( + pd.Categorical( + ["d", "e", "f"], categories=["f", "e", "d"], ordered=True + ) + ), + ], +) +def test_series_describe_other_types(ps): + gs = cudf.from_pandas(ps) + + expected = ps.describe() + actual = gs.describe() + + if len(ps) == 0: + assert_eq(expected.fillna("a").astype("str"), actual.fillna("a")) + else: + assert_eq(expected.astype("str"), actual) diff --git a/python/cudf/cudf/tests/series/methods/test_diff.py b/python/cudf/cudf/tests/series/methods/test_diff.py new file mode 100644 index 00000000000..1706f75a16f --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_diff.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
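+# Compares Series.diff with pandas across numeric dtypes and shift periods,
+# including empty inputs, unsupported dtypes, and datetime/bool/NaN data.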
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+from cudf.testing._utils import (
+    gen_rand,
+)
+
+
+@pytest.mark.parametrize("period", [-20, -1, 0, 1, 2])
+@pytest.mark.parametrize("data_empty", [False, True])
+def test_diff(numeric_types_as_str, period, data_empty):
+    if data_empty:
+        data = None
+    else:
+        if np.dtype(numeric_types_as_str) == np.int8:
+            # to keep data in range
+            data = gen_rand(numeric_types_as_str, 100000, low=-2, high=2)
+        else:
+            data = gen_rand(numeric_types_as_str, 100000)
+
+    gs = cudf.Series(data, dtype=numeric_types_as_str)
+    ps = pd.Series(data, dtype=numeric_types_as_str)
+
+    expected_outcome = ps.diff(period)
+    diffed_outcome = gs.diff(period).astype(expected_outcome.dtype)
+
+    if data_empty:
+        assert_eq(diffed_outcome, expected_outcome, check_index_type=False)
+    else:
+        assert_eq(diffed_outcome, expected_outcome)
+
+
+def test_diff_unsupported_dtypes():
+    gs = cudf.Series(["a", "b", "c", "d", "e"])
+    with pytest.raises(
+        TypeError,
+        match=r"unsupported operand type\(s\)",
+    ):
+        gs.diff()
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        pd.date_range("2020-01-01", "2020-01-06", freq="D"),
+        [True, True, True, False, True, True],
+        [1.0, 2.0, 3.5, 4.0, 5.0, -1.7],
+        [1, 2, 3, 3, 4, 5],
+        [np.nan, None, None, np.nan, np.nan, None],
+    ],
+)
+def test_diff_many_dtypes(data):
+    ps = pd.Series(data)
+    gs = cudf.from_pandas(ps)
+    assert_eq(ps.diff(), gs.diff())
+    assert_eq(ps.diff(periods=2), gs.diff(periods=2))
diff --git a/python/cudf/cudf/tests/series/methods/test_digitize.py b/python/cudf/cudf/tests/series/methods/test_digitize.py
new file mode 100644
index 00000000000..0403a740bab
--- /dev/null
+++ b/python/cudf/cudf/tests/series/methods/test_digitize.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import numpy as np
+import pytest
+
+import cudf
+
+
+@pytest.mark.parametrize("right", [True, False])
+@pytest.mark.parametrize("bins_box", [np.asarray, cudf.Series])
+def test_series_digitize(right, numeric_and_bool_types_as_str, bins_box):
+    num_rows = 20
+    num_bins = 5
+    rng = np.random.default_rng(seed=0)
+    data = rng.integers(0, 100, num_rows).astype(numeric_and_bool_types_as_str)
+    bins = np.unique(
+        np.sort(
+            rng.integers(2, 95, num_bins).astype(numeric_and_bool_types_as_str)
+        )
+    )
+    s = cudf.Series(data)
+    indices = s.digitize(bins_box(bins), right)
+    np.testing.assert_array_equal(
+        np.digitize(data, bins, right), indices.to_numpy()
+    )
+
+
+def test_series_digitize_invalid_bins():
+    rng = np.random.default_rng(seed=0)
+    s = cudf.Series(rng.integers(0, 30, 80), dtype="int32")
+    bins = cudf.Series([2, None, None, 50, 90], dtype="int32")
+
+    with pytest.raises(
+        ValueError, match="`bins` cannot contain null entries."
+    ):
+        s.digitize(bins)
diff --git a/python/cudf/cudf/tests/series/methods/test_drop.py b/python/cudf/cudf/tests/series/methods/test_drop.py
new file mode 100644
index 00000000000..96c7135cd7e
--- /dev/null
+++ b/python/cudf/cudf/tests/series/methods/test_drop.py
@@ -0,0 +1,162 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
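+# Exercises Series.drop by labels, index, and MultiIndex level, with and
+# without inplace=True, and checks error parity with pandas.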
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series(["a"] * 20, index=range(0, 20)), + pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), + pd.Series( + ["b", None] * 5, + index=pd.Index(list(range(10)), dtype="uint64"), + name="BSeries", + ), + ], +) +@pytest.mark.parametrize( + "labels", + [[0], 1, pd.Index([1, 3, 5]), np.array([1, 3, 5], dtype="float32")], +) +def test_series_drop_labels(ps, labels, inplace): + ps = ps.copy() + gs = cudf.from_pandas(ps) + + expected = ps.drop(labels=labels, axis=0, inplace=inplace) + actual = gs.drop(labels=labels, axis=0, inplace=inplace) + + if inplace: + expected = ps + actual = gs + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("data", [["a"] * 20, ["b", None] * 10]) +@pytest.mark.parametrize("index", [[0], 1, pd.Index([1, 3, 5])]) +def test_series_drop_index(data, index, inplace): + ps = pd.Series(data, index=range(0, 20), name="a") + gs = cudf.from_pandas(ps) + + expected = ps.drop(index=index, inplace=inplace) + actual = gs.drop(index=index, inplace=inplace) + + if inplace: + expected = ps + actual = gs + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "index,level", + [ + ("cow", 0), + ("lama", 0), + ("falcon", 0), + ("speed", 1), + ("weight", 1), + ("length", 1), + ( + "cow", + None, + ), + ( + "lama", + None, + ), + ( + "falcon", + None, + ), + ], +) +def test_series_drop_multiindex(index, level, inplace): + ps = pd.Series( + ["a" if i % 2 == 0 else "b" for i in range(0, 10)], + index=pd.MultiIndex( + levels=[ + ["lama", "cow", "falcon"], + ["speed", "weight", "length"], + ], + codes=[ + [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], + [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], + ], + ), + name="abc", + ) + gs = cudf.from_pandas(ps) + + expected = ps.drop(index=index, inplace=inplace, level=level) + actual = gs.drop(index=index, inplace=inplace, level=level) + + if inplace: + expected = ps + actual = gs + + assert_eq(expected, actual) + + +def test_series_drop_edge_inputs(): + gs = cudf.Series([42], name="a") + ps = gs.to_pandas() + + assert_eq(ps.drop(columns=["b"]), gs.drop(columns=["b"])) + + assert_eq(ps.drop(columns="b"), gs.drop(columns="b")) + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), + rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), + ) + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=([], {}), + rfunc_args_and_kwargs=([], {}), + ) + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=(["b"], {"axis": 1}), + rfunc_args_and_kwargs=(["b"], {"axis": 1}), + ) + + +def test_series_drop_raises(): + gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c") + ps = gs.to_pandas() + + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=(["p"],), + rfunc_args_and_kwargs=(["p"],), + ) + + # dtype specified mismatch + assert_exceptions_equal( + lfunc=ps.drop, + rfunc=gs.drop, + lfunc_args_and_kwargs=([3],), + rfunc_args_and_kwargs=([3],), + ) + + expect = ps.drop("p", errors="ignore") + actual = gs.drop("p", errors="ignore") + + assert_eq(actual, expect) diff --git a/python/cudf/cudf/tests/series/methods/test_duplicated.py b/python/cudf/cudf/tests/series/methods/test_duplicated.py new file mode 100644 index 00000000000..47c0b537560 --- /dev/null +++ 
b/python/cudf/cudf/tests/series/methods/test_duplicated.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data,index", + [ + ([1, 2, 3], [10, 11, 12]), + ([1, 2, 3, 1, 1, 2, 3, 2], [10, 20, 23, 24, 25, 26, 27, 28]), + ([1, None, 2, None, 3, None, 3, 1], [5, 6, 7, 8, 9, 10, 11, 12]), + ([np.nan, 1.0, np.nan, 5.4, 5.4, 1.0], ["a", "b", "c", "d", "e", "f"]), + ( + ["lama", "cow", "lama", None, "beetle", "lama", None, None], + [1, 4, 10, 11, 2, 100, 200, 400], + ), + ], +) +@pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("name", [None, "a"]) +def test_series_duplicated(data, index, keep, name): + gs = cudf.Series(data, index=index, name=name) + ps = gs.to_pandas() + + assert_eq(gs.duplicated(keep=keep), ps.duplicated(keep=keep)) diff --git a/python/cudf/cudf/tests/series/methods/test_equals.py b/python/cudf/cudf/tests/series/methods/test_equals.py new file mode 100644 index 00000000000..d2e1db334ab --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_equals.py @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "lhs, rhs", [("a", "a"), ("a", "b"), (1, 1.0), (None, None), (None, "a")] +) +def test_equals_names(lhs, rhs): + lhs = cudf.Series([1, 2], name=lhs) + rhs = cudf.Series([1, 2], name=rhs) + + got = lhs.equals(rhs) + expect = lhs.to_pandas().equals(rhs.to_pandas()) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/series/methods/test_explode.py b/python/cudf/cudf/tests/series/methods/test_explode.py new file mode 100644 index 00000000000..0cb54c98aac --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_explode.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], +) +@pytest.mark.parametrize( + "p_index", + [ + None, + ["ia", "ib", "ic", "id", "ie"], + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] + ), + ], +) +def test_explode(data, ignore_index, p_index): + pdf = pd.Series(data, index=p_index, name="someseries") + gdf = cudf.from_pandas(pdf) + + expect = pdf.explode(ignore_index) + got = gdf.explode(ignore_index) + + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/series/methods/test_factorize.py b/python/cudf/cudf/tests/series/methods/test_factorize.py new file mode 100644 index 00000000000..98cc8187fce --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_factorize.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
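+# Validates Series.factorize labels and categories against pandas for the
+# use_na_sentinel and sort options.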
+
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 2, 3, 2, 1],
+        [1, 2, None, 3, 1, 1],
+        [],
+        ["a", "b", "c", None, "z", "a"],
+    ],
+)
+@pytest.mark.parametrize("use_na_sentinel", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+def test_series_factorize_use_na_sentinel(data, use_na_sentinel, sort):
+    gsr = cudf.Series(data)
+    psr = gsr.to_pandas(nullable=True)
+
+    expected_labels, expected_cats = psr.factorize(
+        use_na_sentinel=use_na_sentinel, sort=sort
+    )
+    actual_labels, actual_cats = gsr.factorize(
+        use_na_sentinel=use_na_sentinel, sort=sort
+    )
+    assert_eq(expected_labels, actual_labels.get())
+    assert_eq(expected_cats, actual_cats.to_pandas(nullable=True))
diff --git a/python/cudf/cudf/tests/series/methods/test_fillna.py b/python/cudf/cudf/tests/series/methods/test_fillna.py
new file mode 100644
index 00000000000..d317ff85596
--- /dev/null
+++ b/python/cudf/cudf/tests/series/methods/test_fillna.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
+
+import numpy as np
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, None, 11, 2.0, np.nan],
+        [np.nan],
+        [None, None, None],
+        [np.nan, 1, 10, 393.32, np.nan],
+    ],
+)
+@pytest.mark.parametrize("nan_as_null", [True, False])
+@pytest.mark.parametrize("fill_value", [1.2, 332, np.nan])
+def test_fillna_with_nan(data, nan_as_null, fill_value):
+    gs = cudf.Series(data, dtype="float64", nan_as_null=nan_as_null)
+    ps = gs.to_pandas()
+
+    expected = ps.fillna(fill_value)
+    actual = gs.fillna(fill_value)
+
+    assert_eq(expected, actual)
+
+
+def test_fillna_categorical_with_non_categorical_raises():
+    ser = cudf.Series([1, None], dtype="category")
+    with pytest.raises(TypeError):
+        ser.fillna(cudf.Series([1, 2]))
+
+
+def test_fillna_categorical_with_different_categories_raises():
+    ser = cudf.Series([1, None], dtype="category")
+    with pytest.raises(TypeError):
+        ser.fillna(cudf.Series([1, 2], dtype="category"))
diff --git a/python/cudf/cudf/tests/series/methods/test_hash_values.py b/python/cudf/cudf/tests/series/methods/test_hash_values.py
new file mode 100644
index 00000000000..9e64e4dfab6
--- /dev/null
+++ b/python/cudf/cudf/tests/series/methods/test_hash_values.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+import hashlib
+
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "method", ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"]
+)
+def test_series_hash_values(method):
+    inputs = cudf.Series(
+        [
+            "",
+            "0",
+            "A 56 character string to test message padding algorithm.",
+            "A 63 character string to test message padding algorithm, again.",
+            "A 64 character string to test message padding algorithm, again!!",
+            (
+                "A very long (greater than 128 bytes/char string) to execute "
+                "a multi hash-step data point in the hash function being "
+                "tested. This string needed to be longer."
+ ), + "All work and no play makes Jack a dull boy", + "!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "\x00\x00\x00\x10\x00\x00\x00\x00", + "\x00\x00\x00\x00", + ] + ) + + def hashlib_compute_digest(data): + hasher = getattr(hashlib, method)() + hasher.update(data.encode("utf-8")) + return hasher.hexdigest() + + hashlib_validation = inputs.to_pandas().apply(hashlib_compute_digest) + validation_results = cudf.Series(hashlib_validation) + hash_values = inputs.hash_values(method=method) + assert_eq(hash_values, validation_results) + + +def test_series_hash_values_invalid_method(): + inputs = cudf.Series(["", "0"]) + with pytest.raises(ValueError): + inputs.hash_values(method="invalid_method") diff --git a/python/cudf/cudf/tests/series/methods/test_isin.py b/python/cudf/cudf/tests/series/methods/test_isin.py new file mode 100644 index 00000000000..dfb7252775a --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_isin.py @@ -0,0 +1,170 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.testing import assert_eq +from cudf.testing._utils import ( + expect_warning_if, +) + + +@pytest.mark.parametrize( + "data", + [ + [], + [0, 12, 14], + [0, 14, 12, 12, 3, 10, 12, 14], + np.random.default_rng(seed=0).integers(-100, 100, 200), + pd.Series([0.0, 1.0, None, 10.0]), + [None, None, None, None], + [np.nan, None, -1, 2, 3], + [1, 2], + ], +) +@pytest.mark.parametrize( + "values", + [ + np.random.default_rng(seed=0).integers(-100, 100, 10), + [], + [np.nan, None, -1, 2, 3], + [1.0, 12.0, None, None, 120], + [0.1, 12.1, 14.1], + [0, 14, 12, 12, 3, 10, 12, 14, None], + [None, None, None], + ["0", "12", "14"], + ["0", "12", "14", "a"], + [1.0, 2.5], + ], +) +def test_isin_numeric(data, values): + rng = np.random.default_rng(seed=0) + index = rng.integers(0, 100, len(data)) + psr = pd.Series(data, index=index) + gsr = cudf.Series.from_pandas(psr, nan_as_null=False) + + expected = psr.isin(values) + got = gsr.isin(values) + + assert_eq(got, expected) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning newly introduced in pandas-2.2.0", +) +@pytest.mark.parametrize( + "data", + [ + [], + pd.Series( + ["2018-01-01", "2019-04-03", None, "2019-12-30"], + dtype="datetime64[ns]", + ), + pd.Series( + [ + "2018-01-01", + "2019-04-03", + None, + "2019-12-30", + "2018-01-01", + "2018-01-01", + ], + dtype="datetime64[ns]", + ), + ], +) +@pytest.mark.parametrize( + "values", + [ + [], + [1514764800000000000, 1577664000000000000], + [ + 1514764800000000000, + 1577664000000000000, + 1577664000000000000, + 1577664000000000000, + 1514764800000000000, + ], + ["2019-04-03", "2019-12-30", "2012-01-01"], + [ + "2012-01-01", + "2012-01-01", + "2012-01-01", + "2019-04-03", + "2019-12-30", + "2012-01-01", + ], + ], +) +def test_isin_datetime(data, values): + psr = pd.Series(data) + gsr = cudf.Series.from_pandas(psr) + + is_len_str = isinstance(next(iter(values), None), str) and len(data) + with expect_warning_if(is_len_str): + got = gsr.isin(values) + with expect_warning_if(is_len_str): + expected = psr.isin(values) + assert_eq(got, expected) + + +@pytest.mark.parametrize( + "data", + [ + [], + ["this", "is", None, "a", "test"], + ["test", "this", "test", "is", None, "test", "a", "test"], + ["0", "12", "14"], + ], +) +@pytest.mark.parametrize( + "values", + [ + [], + ["this", "is"], + [None, None, None], + ["12", "14", 
"19"], + [12, 14, 19], + ["is", "this", "is", "this", "is"], + ], +) +def test_isin_string(data, values): + psr = pd.Series(data) + gsr = cudf.Series.from_pandas(psr) + + got = gsr.isin(values) + expected = psr.isin(values) + assert_eq(got, expected) + + +@pytest.mark.parametrize( + "data", + [ + [], + pd.Series(["a", "b", "c", "c", "c", "d", "e"], dtype="category"), + pd.Series(["a", "b", None, "c", "d", "e"], dtype="category"), + pd.Series([0, 3, 10, 12], dtype="category"), + pd.Series([0, 3, 10, 12, 0, 10, 3, 0, 0, 3, 3], dtype="category"), + ], +) +@pytest.mark.parametrize( + "values", + [ + [], + ["a", "b", None, "f", "words"], + ["0", "12", None, "14"], + [0, 10, 12, None, 39, 40, 1000], + [0, 0, 0, 0, 3, 3, 3, None, 1, 2, 3], + ], +) +def test_isin_categorical(data, values): + psr = pd.Series(data) + gsr = cudf.Series.from_pandas(psr) + + got = gsr.isin(values) + expected = psr.isin(values) + assert_eq(got, expected) diff --git a/python/cudf/cudf/tests/series/methods/test_isna_notnull.py b/python/cudf/cudf/tests/series/methods/test_isna_notnull.py new file mode 100644 index 00000000000..484ee48a5a9 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_isna_notnull.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.errors import MixedTypeError +from cudf.testing import assert_eq + + +@pytest.fixture( + params=[ + pd.Series([0, 1, 2, np.nan, 4, None, 6]), + pd.Series( + [0, 1, 2, np.nan, 4, None, 6], + index=["q", "w", "e", "r", "t", "y", "u"], + name="a", + ), + pd.Series([0, 1, 2, 3, 4]), + pd.Series(["a", "b", "u", "h", "d"]), + pd.Series([None, None, np.nan, None, np.inf, -np.inf]), + pd.Series([], dtype="float64"), + pd.Series( + [pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")] + ), + pd.Series([np.nan]), + pd.Series([None]), + pd.Series(["a", "b", "", "c", None, "e"]), + ] +) +def ps(request): + return request.param + + +def test_series_isnull_isna(ps, nan_as_null): + nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x)) + if nan_as_null is False and ( + nan_contains.any() and not nan_contains.all() and ps.dtype == object + ): + with pytest.raises(MixedTypeError): + cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) + else: + gs = cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) + + assert_eq(ps.isnull(), gs.isnull()) + assert_eq(ps.isna(), gs.isna()) + + +def test_series_notnull_notna(ps, nan_as_null): + nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x)) + if nan_as_null is False and ( + nan_contains.any() and not nan_contains.all() and ps.dtype == object + ): + with pytest.raises(MixedTypeError): + cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) + else: + gs = cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) + + assert_eq(ps.notnull(), gs.notnull()) + assert_eq(ps.notna(), gs.notna()) diff --git a/python/cudf/cudf/tests/series/methods/test_memory_usage.py b/python/cudf/cudf/tests/series/methods/test_memory_usage.py new file mode 100644 index 00000000000..f9c81aaaef9 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_memory_usage.py @@ -0,0 +1,21 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import cudf + + +def test_series_memory_usage(): + sr = cudf.Series([1, 2, 3, 4], dtype="int64") + assert sr.memory_usage() == 32 + + sliced_sr = sr[2:] + assert sliced_sr.memory_usage() == 16 + + sliced_sr[3] = None + assert sliced_sr.memory_usage() == 80 + + sr = cudf.Series(["hello world", "rapids ai", "abc", "z"]) + assert sr.memory_usage() == 44 + + assert sr[3:].memory_usage() == 9 # z + assert sr[:1].memory_usage() == 19 # hello world diff --git a/python/cudf/cudf/tests/series/methods/test_mode.py b/python/cudf/cudf/tests/series/methods/test_mode.py new file mode 100644 index 00000000000..bfd096e9343 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_mode.py @@ -0,0 +1,70 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "gs", + [ + lambda: cudf.Series([1, 2, 3]), + lambda: cudf.Series([None]), + lambda: cudf.Series([4]), + lambda: cudf.Series([2, 3, -1, 0, 1], name="test name"), + lambda: cudf.Series( + [1, 2, 3, None, 2, 1], index=["a", "v", "d", "e", "f", "g"] + ), + lambda: cudf.Series([1, 2, 3, None, 2, 1, None], name="abc"), + lambda: cudf.Series(["ab", "bc", "ab", None, "bc", None, None]), + lambda: cudf.Series([None, None, None, None, None], dtype="str"), + lambda: cudf.Series([None, None, None, None, None]), + lambda: cudf.Series( + [ + 123213, + 23123, + 123123, + 12213123, + 12213123, + 12213123, + 23123, + 2312323123, + None, + None, + ], + dtype="timedelta64[ns]", + ), + lambda: cudf.Series( + [ + None, + 1, + 2, + 3242434, + 3233243, + 1, + 2, + 1023, + None, + 12213123, + None, + 2312323123, + None, + None, + ], + dtype="datetime64[ns]", + ), + lambda: cudf.Series(name="empty series", dtype="float64"), + lambda: cudf.Series( + ["a", "b", "c", " ", "a", "b", "z"], dtype="category" + ), + ], +) +def test_series_mode(gs, dropna): + gs = gs() + ps = gs.to_pandas() + + expected = ps.mode(dropna=dropna) + actual = gs.mode(dropna=dropna) + + assert_eq(expected, actual, check_dtype=False) diff --git a/python/cudf/cudf/tests/series/methods/test_nans_to_nulls.py b/python/cudf/cudf/tests/series/methods/test_nans_to_nulls.py new file mode 100644 index 00000000000..3dc2619ab5d --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_nans_to_nulls.py @@ -0,0 +1,12 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf + + +@pytest.mark.parametrize("value", [1, 1.1]) +def test_nans_to_nulls_noop_copies_column(value): + ser1 = cudf.Series([value]) + ser2 = ser1.nans_to_nulls() + assert ser1._column is not ser2._column diff --git a/python/cudf/cudf/tests/series/methods/test_nlargest_nsmallest.py b/python/cudf/cudf/tests/series/methods/test_nlargest_nsmallest.py new file mode 100644 index 00000000000..329c7f96602 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_nlargest_nsmallest.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
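+# nlargest/nsmallest are numeric-only operations; the test below checks
+# that cudf raises the same TypeError as pandas for string input.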
+ +import pytest + +import cudf +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +@pytest.mark.parametrize("attr", ["nlargest", "nsmallest"]) +def test_series_nlargest_nsmallest_str_error(attr): + gs = cudf.Series(["a", "b", "c", "d", "e"]) + ps = gs.to_pandas() + + assert_exceptions_equal( + getattr(gs, attr), getattr(ps, attr), ([], {"n": 1}), ([], {"n": 1}) + ) diff --git a/python/cudf/cudf/tests/series/methods/test_nunique.py b/python/cudf/cudf/tests/series/methods/test_nunique.py new file mode 100644 index 00000000000..19ff910e316 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_nunique.py @@ -0,0 +1,25 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_nunique_all_null(dropna): + data = [None, None] + pd_ser = pd.Series(data) + cudf_ser = cudf.Series(data) + result = pd_ser.nunique(dropna=dropna) + expected = cudf_ser.nunique(dropna=dropna) + assert result == expected + + +def test_series_nunique(): + cd_s = cudf.Series([1, 3, 5, 7, 7]) + pd_s = cd_s.to_pandas() + + actual = cd_s.nunique() + expected = pd_s.nunique() + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/series/methods/test_pipe.py b/python/cudf/cudf/tests/series/methods/test_pipe.py new file mode 100644 index 00000000000..3f47aa5bf78 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_pipe.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +def test_series_pipe(): + psr = pd.Series([10, 20, 30, 40]) + gsr = cudf.Series([10, 20, 30, 40]) + + def custom_add_func(sr, val): + new_sr = sr + val + return new_sr + + def custom_to_str_func(sr, val): + new_sr = sr.astype("str") + val + return new_sr + + expected = ( + psr.pipe(custom_add_func, 11) + .pipe(custom_add_func, val=12) + .pipe(custom_to_str_func, "rapids") + ) + actual = ( + gsr.pipe(custom_add_func, 11) + .pipe(custom_add_func, val=12) + .pipe(custom_to_str_func, "rapids") + ) + + assert_eq(expected, actual) + + expected = ( + psr.pipe((custom_add_func, "sr"), val=11) + .pipe(custom_add_func, val=1) + .pipe(custom_to_str_func, "rapids-ai") + ) + actual = ( + gsr.pipe((custom_add_func, "sr"), val=11) + .pipe(custom_add_func, val=1) + .pipe(custom_to_str_func, "rapids-ai") + ) + + assert_eq(expected, actual) + + +def test_series_pipe_error(): + psr = pd.Series([10, 20, 30, 40]) + gsr = cudf.Series([10, 20, 30, 40]) + + def custom_add_func(sr, val): + new_sr = sr + val + return new_sr + + assert_exceptions_equal( + lfunc=psr.pipe, + rfunc=gsr.pipe, + lfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), + rfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), + ) diff --git a/python/cudf/cudf/tests/series/methods/test_reindex.py b/python/cudf/cudf/tests/series/methods/test_reindex.py new file mode 100644 index 00000000000..a2bcac2fb3d --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_reindex.py @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
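+# Reindexing is only well-defined for unique index labels; the test below
+# checks that cudf raises the same exception as pandas when the index
+# contains duplicates.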
+ + +import cudf +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +def test_series_duplicate_index_reindex(): + gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1]) + ps = gs.to_pandas() + + assert_exceptions_equal( + gs.reindex, + ps.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/cudf/cudf/tests/series/methods/test_rename.py b/python/cudf/cudf/tests/series/methods/test_rename.py new file mode 100644 index 00000000000..f5705511d5c --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_rename.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_series_unique_pandas_compatibility(): + gs = cudf.Series([10, 11, 12, 11, 10]) + ps = gs.to_pandas() + with cudf.option_context("mode.pandas_compatible", True): + actual = gs.unique() + expected = ps.unique() + assert_eq(actual, expected) + + +@pytest.mark.parametrize("initial_name", [None, "a"]) +@pytest.mark.parametrize("name", [None, "a"]) +def test_series_rename(initial_name, name): + gsr = cudf.Series([1, 2, 3], name=initial_name) + psr = pd.Series([1, 2, 3], name=initial_name) + + assert_eq(gsr, psr) + + actual = gsr.rename(name) + expected = psr.rename(name) + + assert_eq(actual, expected) + + +@pytest.mark.parametrize("index", [lambda x: x * 2, {1: 2}]) +def test_rename_index_not_supported(index): + ser = cudf.Series(range(2)) + with pytest.raises(NotImplementedError): + ser.rename(index=index) diff --git a/python/cudf/cudf/tests/series/methods/test_reset_index.py b/python/cudf/cudf/tests/series/methods/test_reset_index.py new file mode 100644 index 00000000000..a1025a4dfda --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_reset_index.py @@ -0,0 +1,139 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
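+# `drop` is parametrized by the local fixture below; `inplace` is assumed
+# to come from a shared conftest fixture. drop=False with inplace=True is
+# invalid for a Series (it cannot become a DataFrame in place), so those
+# cases are skipped here and exercised in
+# test_reset_index_dup_level_name_exceptions instead.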
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.api.extensions import no_default
+from cudf.testing import assert_eq
+from cudf.testing._utils import (
+    assert_exceptions_equal,
+)
+
+
+@pytest.fixture(params=[True, False])
+def drop(request):
+    """Param for `drop` argument"""
+    return request.param
+
+
+@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]])
+@pytest.mark.parametrize("original_name", [None, "original_ser"])
+@pytest.mark.parametrize("name", [None, "ser", no_default])
+def test_reset_index(level, drop, inplace, original_name, name):
+    midx = pd.MultiIndex.from_tuples(
+        [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None]
+    )
+    ps = pd.Series(range(4), index=midx, name=original_name)
+    gs = cudf.from_pandas(ps)
+
+    if not drop and inplace:
+        pytest.skip(
+            "For exception checks, see "
+            "test_reset_index_dup_level_name_exceptions"
+        )
+
+    expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace)
+
+    got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace)
+    if inplace:
+        expect = ps
+        got = gs
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("level", [None, 0, 1, [None]])
+@pytest.mark.parametrize("original_name", [None, "original_ser"])
+@pytest.mark.parametrize("name", [None, "ser"])
+def test_reset_index_dup_level_name(level, drop, inplace, original_name, name):
+    # midx levels are named [None, None]
+    midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])
+    ps = pd.Series(range(4), index=midx, name=original_name)
+    gs = cudf.from_pandas(ps)
+    if level == [None] or (not drop and inplace):
+        pytest.skip(
+            "For exception checks, see "
+            "test_reset_index_dup_level_name_exceptions"
+        )
+
+    expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name)
+    got = gs.reset_index(level=level, drop=drop, inplace=inplace, name=name)
+    if inplace:
+        expect = ps
+        got = gs
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("original_name", [None, "original_ser"])
+@pytest.mark.parametrize("name", [None, "ser"])
+def test_reset_index_named(drop, inplace, original_name, name):
+    ps = pd.Series(range(4), index=["x", "y", "z", "w"], name=original_name)
+    gs = cudf.from_pandas(ps)
+
+    ps.index.name = "cudf"
+    gs.index.name = "cudf"
+
+    if not drop and inplace:
+        pytest.skip(
+            "For exception checks, see "
+            "test_reset_index_dup_level_name_exceptions"
+        )
+
+    expect = ps.reset_index(drop=drop, inplace=inplace, name=name)
+    got = gs.reset_index(drop=drop, inplace=inplace, name=name)
+
+    if inplace:
+        expect = ps
+        got = gs
+
+    assert_eq(expect, got)
+
+
+def test_reset_index_dup_level_name_exceptions():
+    midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])
+    ps = pd.Series(range(4), index=midx)
+    gs = cudf.from_pandas(ps)
+
+    # Duplicate level names must be specified by level number.
+    assert_exceptions_equal(
+        lfunc=ps.reset_index,
+        rfunc=gs.reset_index,
+        lfunc_args_and_kwargs=(
+            [],
+            {"level": [None]},
+        ),
+        rfunc_args_and_kwargs=(
+            [],
+            {"level": [None]},
+        ),
+    )
+
+    # Cannot use drop=False and inplace=True to turn a Series into a DataFrame.
+    assert_exceptions_equal(
+        lfunc=ps.reset_index,
+        rfunc=gs.reset_index,
+        lfunc_args_and_kwargs=(
+            [],
+            {"drop": False, "inplace": True},
+        ),
+        rfunc_args_and_kwargs=(
+            [],
+            {"drop": False, "inplace": True},
+        ),
+    )
+
+    # Pandas raises the same exception when these two invalid inputs are combined.
+ assert_exceptions_equal( + lfunc=ps.reset_index, + rfunc=gs.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": [None], "drop": False, "inplace": True}, + ), + rfunc_args_and_kwargs=( + [], + {"level": [None], "drop": False, "inplace": True}, + ), + ) diff --git a/python/cudf/cudf/tests/series/methods/test_round.py b/python/cudf/cudf/tests/series/methods/test_round.py new file mode 100644 index 00000000000..320371a7f38 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_round.py @@ -0,0 +1,149 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import decimal + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "arr", + [ + np.random.default_rng(seed=0).normal(-100, 100, 10), + np.random.default_rng(seed=0).integers(-50, 50, 10), + np.zeros(10), + np.repeat([-0.6459412758761901], 10), + np.repeat(np.nan, 10), + np.array([1.123, 2.343, np.nan, 0.0]), + np.arange(-100.5, 101.5, 1), + ], +) +@pytest.mark.parametrize("decimals", [-3, -1, 0, 1, 12, np.int8(1)]) +def test_series_round(arr, decimals, nan_as_null): + pser = pd.Series(arr) + ser = cudf.Series(arr, nan_as_null=nan_as_null) + result = ser.round(decimals) + expected = pser.round(decimals) + + assert_eq(result, expected) + + +def test_series_round_half_up(): + s = cudf.Series([0.0, 1.0, 1.2, 1.7, 0.5, 1.5, 2.5, None]) + expect = cudf.Series([0.0, 1.0, 1.0, 2.0, 1.0, 2.0, 3.0, None]) + got = s.round(how="half_up") + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "series_data", + [ + [1.0, None, np.nan, 4.0], + [1.24430, None, np.nan, 4.423530], + [1.24430, np.nan, 4.423530], + [-1.24430, np.nan, -4.423530], + np.repeat(np.nan, 100), + ], +) +@pytest.mark.parametrize("decimal", [0, 1, 3]) +def test_round_nan_as_null_false(series_data, decimal): + series = cudf.Series(series_data, nan_as_null=False) + pser = series.to_pandas() + result = series.round(decimal) + expected = pser.round(decimal) + assert_eq(result, expected, atol=1e-10) + + +@pytest.mark.parametrize( + "data, dtype, decimals, expected_half_up, expected_half_even", + [ + ( + [1.234, 2.345, 3.456], + cudf.Decimal32Dtype(precision=5, scale=3), + 2, + [1.23, 2.35, 3.46], + [1.23, 2.34, 3.46], + ), + ( + [1.234, 2.345, 3.456], + cudf.Decimal32Dtype(precision=5, scale=3), + 0, + [1.0, 2.0, 3.0], + [1.0, 2.0, 3.0], + ), + ( + [1.234, 2.345, 3.456], + cudf.Decimal32Dtype(precision=5, scale=3), + 3, + [1.234, 2.345, 3.456], + [1.234, 2.345, 3.456], + ), + ( + [1.234567, 2.345678, 3.456789], + cudf.Decimal64Dtype(precision=10, scale=6), + 4, + [1.2346, 2.3457, 3.4568], + [1.2346, 2.3457, 3.4568], + ), + ( + [1.234567, 2.345678, 3.456789], + cudf.Decimal64Dtype(precision=10, scale=6), + 2, + [1.23, 2.35, 3.46], + [1.23, 2.35, 3.46], + ), + ( + [1.234567, 2.345678, 3.456789], + cudf.Decimal64Dtype(precision=10, scale=6), + 6, + [1.234567, 2.345678, 3.456789], + [1.234567, 2.345678, 3.456789], + ), + ], +) +def test_series_round_decimal( + data, dtype, decimals, expected_half_up, expected_half_even +): + ser = cudf.Series(data).astype(dtype) + + result_half_up = ser.round(decimals=decimals, how="half_up").astype(dtype) + expected_ser_half_up = cudf.Series(expected_half_up).astype(dtype) + assert_eq(result_half_up, expected_ser_half_up) + + result_half_even = ser.round(decimals=decimals, how="half_even").astype( + dtype + ) + expected_ser_half_even = cudf.Series(expected_half_even).astype(dtype) + assert_eq(result_half_even, expected_ser_half_even) + + +@pytest.mark.parametrize( 
+ "data", + [ + [1.2234242333234, 323432.3243423, np.nan], + pd.Series([34224, 324324, 324342], dtype="datetime64[ns]"), + pd.Series([224.242, None, 2424.234324], dtype="category"), + [ + decimal.Decimal("342.3243234234242"), + decimal.Decimal("89.32432497687622"), + None, + ], + ], +) +@pytest.mark.parametrize("digits", [0, 1, 7]) +def test_series_round_builtin(data, digits): + ps = pd.Series(data) + gs = cudf.from_pandas(ps, nan_as_null=False) + + # TODO: Remove `to_frame` workaround + # after following issue is fixed: + # https://github.com/pandas-dev/pandas/issues/55114 + expected = round(ps.to_frame(), digits)[0] + expected.name = None + actual = round(gs, digits) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/series/methods/test_sort_index.py b/python/cudf/cudf/tests/series/methods/test_sort_index.py new file mode 100644 index 00000000000..4b2fddf7425 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_sort_index.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020-2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "index", + [ + pd.RangeIndex(0, 3, 1), + [3.0, 1.0, np.nan], + ["a", "z", None], + pd.RangeIndex(4, -1, -2), + ], +) +@pytest.mark.parametrize("axis", [0, "index"]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_series_sort_index( + index, axis, ascending, inplace, ignore_index, na_position +): + ps = pd.Series([10, 3, 12], index=index) + gs = cudf.from_pandas(ps) + + expected = ps.sort_index( + axis=axis, + ascending=ascending, + ignore_index=ignore_index, + inplace=inplace, + na_position=na_position, + ) + got = gs.sort_index( + axis=axis, + ascending=ascending, + ignore_index=ignore_index, + inplace=inplace, + na_position=na_position, + ) + + if inplace is True: + assert_eq(ps, gs, check_index_type=True) + else: + assert_eq(expected, got, check_index_type=True) diff --git a/python/cudf/cudf/tests/series/methods/test_squeeze.py b/python/cudf/cudf/tests/series/methods/test_squeeze.py new file mode 100644 index 00000000000..4e39c235e83 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_squeeze.py @@ -0,0 +1,21 @@ +# Copyright (c) 2020-2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("axis", [None, 0, "index"]) +@pytest.mark.parametrize("data", [[1, 2], [1]]) +def test_squeeze(axis, data): + ser = cudf.Series(data) + result = ser.squeeze(axis=axis) + expected = ser.to_pandas().squeeze(axis=axis) + assert_eq(result, expected) + + +@pytest.mark.parametrize("axis", [1, "columns"]) +def test_squeeze_invalid_axis(axis): + with pytest.raises(ValueError): + cudf.Series([1]).squeeze(axis=axis) diff --git a/python/cudf/cudf/tests/series/methods/test_to_cupy.py b/python/cudf/cudf/tests/series/methods/test_to_cupy.py new file mode 100644 index 00000000000..54897a73b0f --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_to_cupy.py @@ -0,0 +1,43 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
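+# to_cupy() refuses to convert a column that still contains nulls unless an
+# explicit na_value is supplied; both the error path and the na_value fill
+# path are checked below.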
+ +import cupy as cp +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("has_nulls", [False, True]) +@pytest.mark.parametrize("use_na_value", [False, True]) +def test_series_to_cupy( + numeric_and_bool_types_as_str, has_nulls, use_na_value +): + size = 10 + if numeric_and_bool_types_as_str == "bool": + np_data = np.array([True, False] * (size // 2), dtype=bool) + else: + np_data = np.arange(size, dtype=numeric_and_bool_types_as_str) + + if has_nulls: + np_data = np_data.astype("object") + np_data[::2] = None + + sr = cudf.Series(np_data, dtype=numeric_and_bool_types_as_str) + + if not has_nulls: + assert_eq(sr.values, cp.asarray(sr)) + return + + if has_nulls and not use_na_value: + with pytest.raises(ValueError, match="Column must have no nulls"): + sr.to_cupy() + return + + na_value = { + "bool": False, + "float32": 0.0, + "float64": 0.0, + }.get(numeric_and_bool_types_as_str, 0) + expected = cp.asarray(sr.fillna(na_value)) if has_nulls else cp.asarray(sr) + assert_eq(sr.to_cupy(na_value=na_value), expected) diff --git a/python/cudf/cudf/tests/series/methods/test_to_dict.py b/python/cudf/cudf/tests/series/methods/test_to_dict.py new file mode 100644 index 00000000000..9bbb4b9e652 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_to_dict.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +from collections import OrderedDict, defaultdict + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) +def test_series_to_dict(into): + gs = cudf.Series(["ab", "de", "zx"], index=[10, 20, 100]) + ps = gs.to_pandas() + + actual = gs.to_dict(into=into) + expected = ps.to_dict(into=into) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/series/methods/test_to_pandas.py b/python/cudf/cudf/tests/series/methods/test_to_pandas.py new file mode 100644 index 00000000000..8e7ced2ea30 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_to_pandas.py @@ -0,0 +1,154 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
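+# to_pandas(nullable=True) maps cudf nulls onto the pandas nullable
+# extension dtypes, and cannot be combined with arrow_type=True.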
+ +import datetime +import decimal + +import pandas as pd +import pyarrow as pa +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "sr_data,expected_psr", + [ + ( + pa.array([1, 2, None, 3], type=pa.uint8()), + pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()), + ), + ( + pa.array([23, None, None, 32], type=pa.uint16()), + pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()), + ), + ( + pa.array([None, 123, None, 1], type=pa.uint32()), + pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()), + ), + ( + pa.array([234, 2323, 23432, None, None, 224], type=pa.uint64()), + pd.Series( + [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype() + ), + ), + ( + pa.array([-10, 1, None, -1, None, 3], type=pa.int8()), + pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()), + ), + ( + pa.array([111, None, 222, None, 13], type=pa.int16()), + pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()), + ), + ( + pa.array([11, None, 22, 33, None, 2, None, 3], type=pa.int32()), + pd.Series( + [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype() + ), + ), + ( + pa.array( + [32431, None, None, 32322, 0, 10, -32324, None], + type=pa.int64(), + ), + pd.Series( + [32431, None, None, 32322, 0, 10, -32324, None], + dtype=pd.Int64Dtype(), + ), + ), + ( + pa.array( + [True, None, False, None, False, True, True, False], + type=pa.bool_(), + ), + pd.Series( + [True, None, False, None, False, True, True, False], + dtype=pd.BooleanDtype(), + ), + ), + ( + pa.array( + [ + "abc", + "a", + None, + "hello world", + "foo buzz", + "", + None, + "rapids ai", + ], + type=pa.string(), + ), + pd.Series( + [ + "abc", + "a", + None, + "hello world", + "foo buzz", + "", + None, + "rapids ai", + ], + dtype=pd.StringDtype(), + ), + ), + ( + pa.array( + [1, 2, None, 10.2, None], + type=pa.float32(), + ), + pd.Series( + [1, 2, None, 10.2, None], + dtype=pd.Float32Dtype(), + ), + ), + ], +) +def test_series_to_pandas_nullable_dtypes(sr_data, expected_psr): + sr = cudf.Series(sr_data) + actual_psr = sr.to_pandas(nullable=True) + + assert_eq(actual_psr, expected_psr) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_series_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + ser = cudf.Series(pa_array) + with pytest.raises(ValueError, match=".* cannot both be set"): + ser.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_series_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + ser = cudf.Series(pa_array) + result = ser.to_pandas(arrow_type=True) + expected = pd.Series(pd.arrays.ArrowExtensionArray(pa_array)) + pd.testing.assert_series_equal(result, expected) diff --git a/python/cudf/cudf/tests/series/methods/test_tolist.py b/python/cudf/cudf/tests/series/methods/test_tolist.py new file mode 100644 index 00000000000..f1675198642 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_tolist.py @@ -0,0 +1,20 @@ +# Copyright (c) 2023-2025, NVIDIA CORPORATION. +import re + +import pytest + +import cudf + + +def test_series_tolist(): + gsr = cudf.Series([1, 2, 3]) + + with pytest.raises( + TypeError, + match=re.escape( + r"cuDF does not support conversion to host memory " + r"via the `tolist()` method. 
Consider using " + r"`.to_arrow().to_pylist()` to construct a Python list." + ), + ): + gsr.tolist() diff --git a/python/cudf/cudf/tests/series/methods/test_transpose.py b/python/cudf/cudf/tests/series/methods/test_transpose.py new file mode 100644 index 00000000000..14fa4606400 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_transpose.py @@ -0,0 +1,28 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [0, 1, 2, 3], + ["abc", "a", None, "hello world", "foo buzz", "", None, "rapids ai"], + ], +) +def test_series_transpose(data): + psr = pd.Series(data=data) + csr = cudf.Series(data=data) + + cudf_transposed = csr.transpose() + pd_transposed = psr.transpose() + cudf_property = csr.T + pd_property = psr.T + + assert_eq(pd_transposed, cudf_transposed) + assert_eq(pd_property, cudf_property) + assert_eq(cudf_transposed, csr) diff --git a/python/cudf/cudf/tests/series/methods/test_truncate.py b/python/cudf/cudf/tests/series/methods/test_truncate.py new file mode 100644 index 00000000000..8a471f0f7a9 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_truncate.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +def test_series_truncate(): + csr = cudf.Series([1, 2, 3, 4]) + psr = csr.to_pandas() + + assert_eq(csr.truncate(), psr.truncate()) + assert_eq(csr.truncate(1, 2), psr.truncate(1, 2)) + assert_eq(csr.truncate(before=1, after=2), psr.truncate(before=1, after=2)) + + +def test_series_truncate_errors(): + csr = cudf.Series([1, 2, 3, 4]) + with pytest.raises(ValueError): + csr.truncate(axis=1) + with pytest.raises(ValueError): + csr.truncate(copy=False) + + csr.index = [3, 2, 1, 6] + psr = csr.to_pandas() + assert_exceptions_equal( + lfunc=csr.truncate, + rfunc=psr.truncate, + ) + + +def test_series_truncate_datetimeindex(): + dates = cudf.date_range( + "2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s" + ) + csr = cudf.Series(range(len(dates)), index=dates) + psr = csr.to_pandas() + + assert_eq( + csr.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ), + psr.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ), + ) diff --git a/python/cudf/cudf/tests/series/methods/test_update.py b/python/cudf/cudf/tests/series/methods/test_update.py new file mode 100644 index 00000000000..51211f94e0e --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_update.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
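+# Series.update mutates the caller in place, aligning on the index; the
+# deep copies below preserve pristine inputs for the pandas comparison.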
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("index", [None, [1, 2, 3]]) +@pytest.mark.parametrize( + "other", + [ + pd.Series([4, 5, 6]), + pd.Series([4, 5, 6, 7, 8]), + pd.Series([4, np.nan, 6]), + [4, np.nan, 6], + {1: 9}, + ], +) +def test_series_update(index, other): + pd_data = pd.Series([1, 2, 3], index=index) + data = cudf.Series.from_pandas(pd_data) + gs = data.copy(deep=True) + if isinstance(other, pd.Series): + other = cudf.Series.from_pandas(other, nan_as_null=False) + g_other = other.copy(deep=True) + p_other = g_other.to_pandas() + else: + g_other = other + p_other = other + + ps = gs.to_pandas() + + ps.update(p_other) + gs.update(g_other) + assert_eq(gs, ps) diff --git a/python/cudf/cudf/tests/series/methods/test_value_counts.py b/python/cudf/cudf/tests/series/methods/test_value_counts.py new file mode 100644 index 00000000000..68246ecfbf6 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_value_counts.py @@ -0,0 +1,146 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +from string import ascii_letters, digits + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core.column.column import as_column +from cudf.testing import assert_eq + + +@pytest.fixture(params=[True, False]) +def normalize(request): + """Argument for value_counts""" + return request.param + + +@pytest.mark.parametrize( + "data", + [ + [], + pd.date_range("2010-01-01", "2010-02-01"), + [None, None], + [pd.Timestamp(2020, 1, 1), pd.NaT], + ], +) +def test_series_datetime_value_counts(data, normalize, dropna): + psr = pd.Series(data, dtype="datetime64[ns]") + gsr = cudf.from_pandas(psr) + expected = psr.value_counts(dropna=dropna, normalize=normalize) + got = gsr.value_counts(dropna=dropna, normalize=normalize) + + assert_eq(expected.sort_index(), got.sort_index(), check_dtype=False) + assert_eq( + expected.reset_index(drop=True), + got.reset_index(drop=True), + check_dtype=False, + check_index_type=True, + ) + + +def test_categorical_value_counts(dropna, normalize): + num_elements = 20 + rng = np.random.default_rng(seed=12) + pd_cat = pd.Categorical( + pd.Series( + rng.choice(list(ascii_letters + digits), num_elements), + dtype="category", + ) + ) + + # gdf + gdf_value_counts = cudf.Series(pd_cat).value_counts( + dropna=dropna, normalize=normalize + ) + + # pandas + pdf_value_counts = pd.Series(pd_cat).value_counts( + dropna=dropna, normalize=normalize + ) + + # verify + assert_eq( + pdf_value_counts.sort_index(), + gdf_value_counts.sort_index(), + check_dtype=False, + check_index_type=True, + ) + assert_eq( + pdf_value_counts.reset_index(drop=True), + gdf_value_counts.reset_index(drop=True), + check_dtype=False, + check_index_type=True, + ) + + +def test_series_value_counts(dropna, normalize): + rng = np.random.default_rng(seed=0) + size = 10 + arr = rng.integers(low=-1, high=10, size=size) + mask = arr != -1 + sr = cudf.Series._from_column( + as_column(arr).set_mask(cudf.Series(mask)._column.as_mask()) + ) + sr.name = "col" + + expect = ( + sr.to_pandas() + .value_counts(dropna=dropna, normalize=normalize) + .sort_index() + ) + got = sr.value_counts(dropna=dropna, normalize=normalize).sort_index() + + assert_eq(expect, got, check_dtype=True, check_index_type=False) + + +@pytest.mark.parametrize("bins", [1, 3]) +def test_series_value_counts_bins(bins): + psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0]) + gsr = cudf.from_pandas(psr) + + expected = psr.value_counts(bins=bins) + got 
= gsr.value_counts(bins=bins) + + assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) + + +@pytest.mark.parametrize("bins", [1, 3]) +def test_series_value_counts_bins_dropna(bins, dropna): + psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0, np.nan]) + gsr = cudf.from_pandas(psr) + + expected = psr.value_counts(bins=bins, dropna=dropna) + got = gsr.value_counts(bins=bins, dropna=dropna) + + assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) + + +def test_series_value_counts_optional_arguments(ascending, dropna, normalize): + psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0, None]) + gsr = cudf.from_pandas(psr) + + expected = psr.value_counts( + ascending=ascending, dropna=dropna, normalize=normalize + ) + got = gsr.value_counts( + ascending=ascending, dropna=dropna, normalize=normalize + ) + + assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) + assert_eq( + expected.reset_index(drop=True), + got.reset_index(drop=True), + check_dtype=True, + ) + + +def test_series_categorical_missing_value_count(): + ps = pd.Series(pd.Categorical(list("abcccb"), categories=list("cabd"))) + gs = cudf.from_pandas(ps) + + expected = ps.value_counts() + actual = gs.value_counts() + + assert_eq(expected, actual, check_dtype=False) diff --git a/python/cudf/cudf/tests/series/methods/test_where.py b/python/cudf/cudf/tests/series/methods/test_where.py new file mode 100644 index 00000000000..e0f01fd3cb8 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_where.py @@ -0,0 +1,25 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import re + +import pytest + +import cudf + + +def test_series_where_mixed_dtypes_error(): + s = cudf.Series(["a", "b", "c"]) + with pytest.raises( + TypeError, + match=re.escape( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." + ), + ): + s.where([True, False, True], [1, 2, 3]) + + +def test_series_where_mixed_bool_dtype(): + s = cudf.Series([True, False, True]) + with pytest.raises(TypeError): + s.where(~s, 10) diff --git a/python/cudf/cudf/tests/series/test_accessors.py b/python/cudf/cudf/tests/series/test_accessors.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_accessors.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_attributes.py b/python/cudf/cudf/tests/series/test_attributes.py index 06777c8e6af..7ae81a6c257 100644 --- a/python/cudf/cudf/tests/series/test_attributes.py +++ b/python/cudf/cudf/tests/series/test_attributes.py @@ -1 +1,143 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
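+# Host-side iteration over a Series is intentionally unsupported in cudf;
+# the first test below asserts that iter()/items()/iteritems() all raise a
+# TypeError directing users to to_arrow(), to_pandas() or values_host.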
+import re + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture( + params=[ + pd.Series([0, 1, 2, np.nan, 4, None, 6]), + pd.Series( + [0, 1, 2, np.nan, 4, None, 6], + index=["q", "w", "e", "r", "t", "y", "u"], + name="a", + ), + pd.Series([0, 1, 2, 3, 4]), + pd.Series(["a", "b", "u", "h", "d"]), + pd.Series([None, None, np.nan, None, np.inf, -np.inf]), + pd.Series([], dtype="float64"), + pd.Series( + [pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")] + ), + pd.Series([np.nan]), + pd.Series([None]), + pd.Series(["a", "b", "", "c", None, "e"]), + ] +) +def ps(request): + return request.param + + +def test_series_iter_error(): + gs = cudf.Series([1, 2, 3]) + + with pytest.raises( + TypeError, + match=re.escape( + f"{gs.__class__.__name__} object is not iterable. " + f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " + f"if you wish to iterate over the values." + ), + ): + iter(gs) + + with pytest.raises( + TypeError, + match=re.escape( + f"{gs.__class__.__name__} object is not iterable. " + f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " + f"if you wish to iterate over the values." + ), + ): + gs.items() + + with pytest.raises( + TypeError, + match=re.escape( + f"{gs.__class__.__name__} object is not iterable. " + f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " + f"if you wish to iterate over the values." + ), + ): + gs.iteritems() + + with pytest.raises(TypeError): + iter(gs._column) + + +@pytest.mark.parametrize("data", [[], [None, None], ["a", None]]) +def test_series_size(data): + psr = pd.Series(data) + gsr = cudf.Series(data) + + assert_eq(psr.size, gsr.size) + + +def test_set_index_unequal_length(): + s = cudf.Series(dtype="float64") + with pytest.raises(ValueError): + s.index = [1, 2, 3] + + +@pytest.mark.parametrize( + "data", + [ + [], + [1, 2, 3, 4], + ["a", "b", "c"], + [1.2, 2.2, 4.5], + [np.nan, np.nan], + [None, None, None], + ], +) +def test_axes(data): + csr = cudf.Series(data) + psr = csr.to_pandas() + + expected = psr.axes + actual = csr.axes + + for e, a in zip(expected, actual, strict=True): + assert_eq(e, a) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + pytest.param( + [np.nan, 10, 15, 16], + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/49818" + ), + ), + [np.nan, None, 10, 20], + ["ab", "zx", "pq"], + ["ab", "zx", None, "pq"], + [], + ], +) +def test_series_hasnans(data): + gs = cudf.Series(data, nan_as_null=False) + ps = gs.to_pandas(nullable=True) + + # Check type to avoid mixing Python bool and NumPy bool + assert isinstance(gs.hasnans, bool) + assert gs.hasnans == ps.hasnans + + +def test_dtype_dtypes_equal(): + ser = cudf.Series([0]) + assert ser.dtype is ser.dtypes + assert ser.dtypes is ser.to_pandas().dtypes + + +def test_roundtrip_series_plc_column(ps): + expect = cudf.Series(ps) + actual = cudf.Series.from_pylibcudf(*expect.to_pylibcudf()) + assert_eq(expect, actual) diff --git a/python/cudf/cudf/tests/series/test_binary_operations.py b/python/cudf/cudf/tests/series/test_binary_operations.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_binary_operations.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
diff --git a/python/cudf/cudf/tests/series/test_binops.py b/python/cudf/cudf/tests/series/test_binops.py new file mode 100644 index 00000000000..1c288a48e90 --- /dev/null +++ b/python/cudf/cudf/tests/series/test_binops.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import operator + +import pandas as pd +import pytest + +import cudf +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +@pytest.mark.parametrize( + "sr1", [pd.Series([10, 11, 12], index=["a", "b", "z"]), pd.Series(["a"])] +) +@pytest.mark.parametrize( + "sr2", + [pd.Series([], dtype="float64"), pd.Series(["a", "a", "c", "z", "A"])], +) +@pytest.mark.parametrize( + "op", + [ + operator.eq, + operator.ne, + operator.lt, + operator.gt, + operator.le, + operator.ge, + ], +) +def test_series_error_equality(sr1, sr2, op): + gsr1 = cudf.from_pandas(sr1) + gsr2 = cudf.from_pandas(sr2) + + assert_exceptions_equal(op, op, ([sr1, sr2],), ([gsr1, gsr2],)) diff --git a/python/cudf/cudf/tests/series/test_categorial.py b/python/cudf/cudf/tests/series/test_categorial.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_categorial.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_combining.py b/python/cudf/cudf/tests/series/test_combining.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_combining.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_computation.py b/python/cudf/cudf/tests/series/test_computation.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_computation.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_constructing.py b/python/cudf/cudf/tests/series/test_constructing.py deleted file mode 100644 index 6600e99ade3..00000000000 --- a/python/cudf/cudf/tests/series/test_constructing.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -import numpy as np - -import cudf - - -def test_construct_int_series_with_nulls_compat_mode(): - # in compatibility mode, constructing a Series - # with nulls should result in a floating Series: - with cudf.option_context("mode.pandas_compatible", True): - s = cudf.Series([1, 2, None]) - assert s.dtype == np.dtype("float64") diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py new file mode 100644 index 00000000000..b8bca586361 --- /dev/null +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -0,0 +1,637 @@ +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
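+# Constructor behavior tests. Several cases below rely on conftest fixtures
+# (e.g. default_integer_bitwidth, default_float_bitwidth), which are assumed
+# to temporarily set the corresponding cudf options.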
+import decimal + +import cupy as cp +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import cudf +from cudf.core.column.column import as_column +from cudf.errors import MixedTypeError +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +def test_construct_int_series_with_nulls_compat_mode(): + # in compatibility mode, constructing a Series + # with nulls should result in a floating Series: + with cudf.option_context("mode.pandas_compatible", True): + s = cudf.Series([1, 2, None]) + assert s.dtype == np.dtype("float64") + + +@pytest.mark.parametrize( + "data", + [ + {"a": 1, "b": 2, "c": 24, "d": 1010}, + {"a": 1}, + {1: "a", 2: "b", 24: "c", 1010: "d"}, + {1: "a"}, + {"a": [1]}, + ], +) +def test_series_init_dict(data): + pandas_series = pd.Series(data) + cudf_series = cudf.Series(data) + + assert_eq(pandas_series, cudf_series) + + +def test_series_unitness_np_datetimelike_units(): + data = np.array([np.timedelta64(1)]) + with pytest.raises(TypeError): + cudf.Series(data) + with pytest.raises(TypeError): + pd.Series(data) + + +def test_list_category_like_maintains_dtype(): + dtype = cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) + data = [1, 2, 3] + result = cudf.Series._from_column(as_column(data, dtype=dtype)) + expected = pd.Series(data, dtype=dtype.to_pandas()) + assert_eq(result, expected) + + +def test_list_interval_like_maintains_dtype(): + dtype = cudf.IntervalDtype(subtype=np.int8) + data = [pd.Interval(1, 2)] + result = cudf.Series._from_column(as_column(data, dtype=dtype)) + expected = pd.Series(data, dtype=dtype.to_pandas()) + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "klass", [cudf.Series, cudf.Index, pd.Series, pd.Index] +) +def test_series_from_named_object_name_priority(klass): + result = cudf.Series(klass([1], name="a"), name="b") + assert result.name == "b" + + +@pytest.mark.parametrize( + "data", + [ + {"a": 1, "b": 2, "c": 3}, + cudf.Series([1, 2, 3], index=list("abc")), + pd.Series([1, 2, 3], index=list("abc")), + ], +) +def test_series_from_object_with_index_index_arg_reindex(data): + result = cudf.Series(data, index=list("bca")) + expected = cudf.Series([2, 3, 1], index=list("bca")) + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + {0: 1, 1: 2, 2: 3}, + cudf.Series([1, 2, 3]), + cudf.Index([1, 2, 3]), + pd.Series([1, 2, 3]), + pd.Index([1, 2, 3]), + [1, 2, 3], + ], +) +def test_series_dtype_astypes(data): + result = cudf.Series(data, dtype="float64") + expected = cudf.Series([1.0, 2.0, 3.0]) + assert_eq(result, expected) + + +@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string]) +def test_series_from_large_string(pa_type): + pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type()) + got = cudf.Series(pa_string_array) + expected = pd.Series(pa_string_array) + + assert_eq(expected, got) + + +def test_series_init_with_nans(): + with cudf.option_context("mode.pandas_compatible", True): + gs = cudf.Series([1, 2, 3, np.nan]) + assert gs.dtype == np.dtype("float64") + ps = pd.Series([1, 2, 3, np.nan]) + assert_eq(ps, gs) + + +@pytest.mark.parametrize( + "data", + [ + [[1, 2, 3], [10, 20]], + [[1.0, 2.0, 3.0], None, [10.0, 20.0, np.nan]], + [[5, 6], None, [1]], + [None, None, None, None, None, [10, 20]], + ], +) +@pytest.mark.parametrize("klass", [cudf.Series, list, cp.array]) +def test_nested_series_from_sequence_data(data, klass): + actual = cudf.Series( + [klass(val) if val is not None else val for val in data] + ) + 
expected = cudf.Series(data) + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "data", + [ + lambda: cp.ones(5, dtype=cp.float16), + lambda: np.ones(5, dtype="float16"), + lambda: pd.Series([0.1, 1.2, 3.3], dtype="float16"), + pytest.param( + lambda: pa.array(np.ones(5, dtype="float16")), + marks=pytest.mark.xfail( + reason="https://issues.apache.org/jira/browse/ARROW-13762" + ), + ), + ], +) +def test_series_raises_float16(data): + data = data() + with pytest.raises(TypeError): + cudf.Series(data) + + +@pytest.mark.parametrize( + "data", [[True, False, None, True, False], [None, None], []] +) +@pytest.mark.parametrize("bool_dtype", ["bool", "boolean", pd.BooleanDtype()]) +def test_nullable_bool_dtype_series(data, bool_dtype): + psr = pd.Series(data, dtype=pd.BooleanDtype()) + gsr = cudf.Series(data, dtype=bool_dtype) + + assert_eq(psr, gsr.to_pandas(nullable=True)) + + +@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0]) +@pytest.mark.parametrize("klass", [pd.Timestamp, pd.Timedelta]) +def test_temporal_scalar_series_init(data, klass): + scalar = klass(data) + expected = pd.Series([scalar]) + actual = cudf.Series([scalar]) + + assert_eq(expected, actual) + + expected = pd.Series(scalar) + actual = cudf.Series(scalar) + + assert_eq(expected, actual) + + +def test_series_from_series_index_no_shallow_copy(): + ser1 = cudf.Series(range(3), index=list("abc")) + ser2 = cudf.Series(ser1) + assert ser1.index is ser2.index + + +def test_int8_int16_construction(): + s = cudf.Series([np.int8(8), np.int16(128)]) + assert s.dtype == np.dtype("i2") + + +@pytest.mark.parametrize( + "data", [[0, 1, 2, 3, 4], range(5), [np.int8(8), np.int16(128)]] +) +def test_default_integer_bitwidth_construction(default_integer_bitwidth, data): + s = cudf.Series(data) + assert s.dtype == np.dtype(f"i{default_integer_bitwidth // 8}") + + +@pytest.mark.parametrize("data", [[1.5, 2.5, 4.5], [1000, 2000, 4000, 3.14]]) +def test_default_float_bitwidth_construction(default_float_bitwidth, data): + s = cudf.Series(data) + assert s.dtype == np.dtype(f"f{default_float_bitwidth // 8}") + + +def test_series_ordered_dedup(): + # part of https://github.com/rapidsai/cudf/issues/11486 + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(0, 100, 1000)) + # pandas unique() preserves order + expect = pd.Series(sr.to_pandas().unique()) + got = cudf.Series._from_column(sr._column.unique()) + assert_eq(expect.values, got.values) + + +def test_int64_equality(): + s = cudf.Series(np.asarray([2**63 - 10, 2**63 - 100], dtype=np.int64)) + assert (s != np.int64(2**63 - 1)).all() + + +@pytest.mark.parametrize( + "data", + [ + {"a": 1, "b": 2, "c": 24, "d": 1010}, + {"a": 1}, + ], +) +@pytest.mark.parametrize( + "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] +) +def test_series_init_dict_with_index(data, index): + pandas_series = pd.Series(data, index=index) + cudf_series = cudf.Series(data, index=index) + + assert_eq(pandas_series, cudf_series) + + +def test_series_data_and_index_length_mismatch(): + assert_exceptions_equal( + lfunc=pd.Series, + rfunc=cudf.Series, + lfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), + rfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), + ) + + +@pytest.mark.parametrize( + "dtype", ["datetime64[ns]", "timedelta64[ns]", "object", "str"] +) +def test_series_mixed_dtype_error(dtype): + ps = pd.concat([pd.Series([1, 2, 3], dtype=dtype), pd.Series([10, 11])]) + with pytest.raises(TypeError): + cudf.Series(ps) + with pytest.raises(TypeError): + 
cudf.Series(ps.array) + + +@pytest.mark.parametrize("data", ["abc", None, 1, 3.7]) +@pytest.mark.parametrize( + "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] +) +def test_series_init_scalar_with_index(data, index): + pandas_series = pd.Series(data, index=index) + cudf_series = cudf.Series(data, index=index) + + assert_eq( + pandas_series, + cudf_series, + check_index_type=data is not None or index is not None, + check_dtype=data is not None, + ) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4], + [10, 20, None, None], + ], +) +@pytest.mark.parametrize("copy", [True, False]) +def test_series_copy(data, copy): + psr = pd.Series(data) + gsr = cudf.from_pandas(psr) + + new_psr = pd.Series(psr, copy=copy) + new_gsr = cudf.Series(gsr, copy=copy) + + new_psr.iloc[0] = 999 + new_gsr.iloc[0] = 999 + + assert_eq(psr, gsr) + assert_eq(new_psr, new_gsr) + + +def test_series_init_from_series_and_index(): + ser = cudf.Series([4, 7, -5, 3], index=["d", "b", "a", "c"]) + result = cudf.Series(ser, index=list("abcd")) + expected = cudf.Series([-5, 7, 3, 4], index=list("abcd")) + assert_eq(result, expected) + + +def test_series_constructor_unbounded_sequence(): + class A: + def __getitem__(self, key): + return 1 + + with pytest.raises(TypeError): + cudf.Series(A()) + + +def test_series_constructor_error_mixed_type(): + with pytest.raises(MixedTypeError): + cudf.Series(["abc", np.nan, "123"], nan_as_null=False) + + +def test_series_from_pandas_sparse(): + pser = pd.Series(range(2), dtype=pd.SparseDtype(np.int64, 0)) + with pytest.raises(NotImplementedError): + cudf.Series(pser) + + +def test_multi_dim_series_error(): + arr = cp.array([(1, 2), (3, 4)]) + with pytest.raises(ValueError): + cudf.Series(arr) + + +def test_bool_series_mixed_dtype_error(): + ps = pd.Series([True, False, None]) + all_bool_ps = pd.Series([True, False, True], dtype="object") + # ps now has `object` dtype, which + # isn't supported by `cudf`. 
+ with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(TypeError): + cudf.Series(ps) + with pytest.raises(TypeError): + cudf.from_pandas(ps) + with pytest.raises(TypeError): + cudf.Series(ps, dtype=bool) + expected = cudf.Series(all_bool_ps, dtype=bool) + assert_eq(expected, all_bool_ps.astype(bool)) + nan_bools_mix = pd.Series([True, False, True, np.nan], dtype="object") + gs = cudf.Series(nan_bools_mix, nan_as_null=True) + assert_eq(gs.to_pandas(nullable=True), nan_bools_mix.astype("boolean")) + with pytest.raises(TypeError): + cudf.Series(nan_bools_mix, nan_as_null=False) + + +@pytest.mark.parametrize("klass", [cudf.Index, cudf.Series]) +@pytest.mark.parametrize( + "data", [pa.array([float("nan")]), pa.chunked_array([[float("nan")]])] +) +def test_nan_as_null_from_arrow_objects(klass, data): + result = klass(data, nan_as_null=True) + expected = klass(pa.array([None], type=pa.float64())) + assert_eq(result, expected) + + +@pytest.mark.parametrize("reso", ["M", "ps"]) +@pytest.mark.parametrize("typ", ["M", "m"]) +def test_series_invalid_reso_dtype(reso, typ): + with pytest.raises(TypeError): + cudf.Series([], dtype=f"{typ}8[{reso}]") + + +@pytest.mark.parametrize("base_name", [None, "a"]) +def test_series_to_frame_none_name(base_name): + result = cudf.Series(range(1), name=base_name).to_frame(name=None) + expected = pd.Series(range(1), name=base_name).to_frame(name=None) + assert_eq(result, expected) + + +@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) +@pytest.mark.parametrize( + "data", + [ + pa.array([1, None], type=pa.int64()), + pa.chunked_array([[1, None]], type=pa.int64()), + ], +) +def test_from_arrow_array_dtype(klass, data): + obj = klass(data, dtype="int8") + assert obj.dtype == np.dtype("int8") + + +@pytest.mark.parametrize( + "nat, value", + [ + [np.datetime64("nat", "ns"), np.datetime64("2020-01-01", "ns")], + [np.timedelta64("nat", "ns"), np.timedelta64(1, "ns")], + ], +) +@pytest.mark.parametrize("nan_as_null", [True, False]) +def test_series_np_array_nat_nan_as_nulls(nat, value, nan_as_null): + expected = np.array([nat, value]) + ser = cudf.Series(expected, nan_as_null=nan_as_null) + assert ser[0] is pd.NaT + assert ser[1] == value + + +def test_null_like_to_nan_pandas_compat(): + with cudf.option_context("mode.pandas_compatible", True): + ser = cudf.Series([1, 2, np.nan, 10, None]) + pser = pd.Series([1, 2, np.nan, 10, None]) + + assert pser.dtype == ser.dtype + assert_eq(ser, pser) + + +def test_non_strings_dtype_object_pandas_compat_raises(): + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(TypeError): + cudf.Series([1], dtype=object) + + +@pytest.mark.parametrize("arr", [np.array, cp.array, pd.Series]) +def test_construct_nonnative_array(arr): + data = [1, 2, 3.5, 4] + dtype = np.dtype("f4") + native = arr(data, dtype=dtype) + nonnative = arr(data, dtype=dtype.newbyteorder()) + result = cudf.Series(nonnative) + expected = cudf.Series(native) + assert_eq(result, expected) + + +@pytest.mark.parametrize("nan_as_null", [True, False]) +def test_construct_all_pd_NA_with_dtype(nan_as_null): + result = cudf.Series( + [pd.NA, pd.NA], dtype=np.dtype(np.float64), nan_as_null=nan_as_null + ) + expected = cudf.Series(pa.array([None, None], type=pa.float64())) + assert_eq(result, expected) + + +def test_series_empty_dtype(): + expected = pd.Series([]) + actual = cudf.Series([]) + assert_eq(expected, actual, check_dtype=True) + + +@pytest.mark.parametrize("data", [None, {}, []]) +def 
test_series_empty_index_rangeindex(data): + expected = cudf.RangeIndex(0) + result = cudf.Series(data).index + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "pandas_type", + [ + pd.ArrowDtype(pa.int8()), + pd.ArrowDtype(pa.int16()), + pd.ArrowDtype(pa.int32()), + pd.ArrowDtype(pa.int64()), + pd.ArrowDtype(pa.uint8()), + pd.ArrowDtype(pa.uint16()), + pd.ArrowDtype(pa.uint32()), + pd.ArrowDtype(pa.uint64()), + pd.ArrowDtype(pa.float32()), + pd.ArrowDtype(pa.float64()), + pd.Int8Dtype(), + pd.Int16Dtype(), + pd.Int32Dtype(), + pd.Int64Dtype(), + pd.UInt8Dtype(), + pd.UInt16Dtype(), + pd.UInt32Dtype(), + pd.UInt64Dtype(), + pd.Float32Dtype(), + pd.Float64Dtype(), + ], +) +def test_series_arrow_numeric_types_roundtrip(pandas_type): + ps = pd.Series([1, 2, 3], dtype=pandas_type) + pi = pd.Index(ps) + pdf = ps.to_frame() + + with cudf.option_context("mode.pandas_compatible", True): + gs = cudf.from_pandas(ps) + assert_eq(ps, gs) + + with cudf.option_context("mode.pandas_compatible", True): + gi = cudf.from_pandas(pi) + assert_eq(pi, gi) + + with cudf.option_context("mode.pandas_compatible", True): + gdf = cudf.from_pandas(pdf) + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "pandas_type", [pd.ArrowDtype(pa.bool_()), pd.BooleanDtype()] +) +def test_series_arrow_bool_types_roundtrip(pandas_type): + ps = pd.Series([True, False, None], dtype=pandas_type) + pi = pd.Index(ps) + pdf = ps.to_frame() + + with cudf.option_context("mode.pandas_compatible", True): + gs = cudf.from_pandas(ps) + assert_eq(ps, gs) + + with cudf.option_context("mode.pandas_compatible", True): + gi = cudf.from_pandas(pi) + assert_eq(pi, gi) + + with cudf.option_context("mode.pandas_compatible", True): + gdf = cudf.from_pandas(pdf) + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "pandas_type", [pd.ArrowDtype(pa.string()), pd.StringDtype()] +) +def test_series_arrow_string_types_roundtrip(pandas_type): + ps = pd.Series(["abc", None, "xyz"], dtype=pandas_type) + pi = pd.Index(ps) + pdf = ps.to_frame() + + with cudf.option_context("mode.pandas_compatible", True): + gs = cudf.from_pandas(ps) + assert_eq(ps, gs) + + with cudf.option_context("mode.pandas_compatible", True): + gi = cudf.from_pandas(pi) + assert_eq(pi, gi) + + with cudf.option_context("mode.pandas_compatible", True): + gdf = cudf.from_pandas(pdf) + assert_eq(pdf, gdf) + + +def test_series_arrow_category_types_roundtrip(): + pa_array = pa.array(pd.Series([1, 2, 3], dtype="category")) + ps = pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa_array.type)) + pi = pd.Index(ps) + pdf = pi.to_frame() + + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + cudf.from_pandas(ps) + + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + cudf.from_pandas(pi) + + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + cudf.from_pandas(pdf) + + +@pytest.mark.parametrize( + "pa_type", + [pa.decimal128(10, 2), pa.decimal128(5, 2), pa.decimal128(20, 2)], +) +def test_series_arrow_decimal_types_roundtrip(pa_type): + ps = pd.Series( + [ + decimal.Decimal("1.2"), + decimal.Decimal("20.56"), + decimal.Decimal("3"), + ], + dtype=pd.ArrowDtype(pa_type), + ) + pdf = ps.to_frame() + + with cudf.option_context("mode.pandas_compatible", True): + gs = cudf.from_pandas(ps) + assert_eq(ps, gs) + + with cudf.option_context("mode.pandas_compatible", True): + gdf = cudf.from_pandas(pdf) + assert_eq(pdf, gdf) + + +def 
test_series_arrow_struct_types_roundtrip(): + ps = pd.Series( + [{"a": 1}, {"b": "abc"}], + dtype=pd.ArrowDtype(pa.struct({"a": pa.int64(), "b": pa.string()})), + ) + pdf = ps.to_frame() + + with cudf.option_context("mode.pandas_compatible", True): + gs = cudf.from_pandas(ps) + assert_eq(ps, gs) + + with cudf.option_context("mode.pandas_compatible", True): + gdf = cudf.from_pandas(pdf) + assert_eq(pdf, gdf) + + +def test_series_arrow_list_types_roundtrip(): + ps = pd.Series([[1], [2], [4]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + with cudf.option_context("mode.pandas_compatible", True): + gs = cudf.from_pandas(ps) + assert_eq(ps, gs) + pdf = ps.to_frame() + + with cudf.option_context("mode.pandas_compatible", True): + gdf = cudf.from_pandas(pdf) + assert_eq(pdf, gdf) + + +def test_series_error_nan_mixed_types(): + ps = pd.Series([np.nan, "ab", "cd"]) + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(MixedTypeError): + cudf.from_pandas(ps) + + +@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) +def test_from_pandas_object_dtype_passed_dtype(klass): + result = klass(pd.Series([True, False], dtype=object), dtype="int8") + expected = klass(pa.array([1, 0], type=pa.int8())) + assert_eq(result, expected) + + +def test_to_dense_array(): + rng = np.random.default_rng(seed=0) + data = rng.random(8) + mask = np.asarray([0b11010110]).astype(np.byte) + sr = cudf.Series._from_column( + as_column(data, dtype=np.float64).set_mask(mask) + ) + assert sr.has_nulls + assert sr.null_count != len(sr) + filled = sr.to_numpy(na_value=np.nan) + dense = sr.dropna().to_numpy() + assert dense.size < filled.size + assert filled.size == len(sr) diff --git a/python/cudf/cudf/tests/series/test_function_application.py b/python/cudf/cudf/tests/series/test_function_application.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_function_application.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_indexing.py b/python/cudf/cudf/tests/series/test_indexing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_indexing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_io_serialization.py b/python/cudf/cudf/tests/series/test_io_serialization.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_io_serialization.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_missing.py b/python/cudf/cudf/tests/series/test_missing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_missing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_reshaping.py b/python/cudf/cudf/tests/series/test_reshaping.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_reshaping.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_selecting.py b/python/cudf/cudf/tests/series/test_selecting.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_selecting.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
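
For orientation while reviewing: the tests added above pivot on two constructor behaviors, NaN handling via `nan_as_null` and the stricter `mode.pandas_compatible` option. Below is a minimal, self-contained sketch of those semantics, using only the public cudf API already exercised by the tests; it is illustrative only, not part of the patch, and assumes a working cudf/GPU environment.

import numpy as np
import cudf

# Default construction (nan_as_null=None behaves like True): a float NaN
# is normalized to a cudf null.
s = cudf.Series([1.0, np.nan, 3.0])
assert s.null_count == 1

# Opting out keeps NaN as an ordinary float value rather than a null.
s = cudf.Series([1.0, np.nan, 3.0], nan_as_null=False)
assert s.null_count == 0

# Under pandas-compatibility mode, null-likes follow pandas inference
# (see test_null_like_to_nan_pandas_compat above) ...
with cudf.option_context("mode.pandas_compatible", True):
    s = cudf.Series([1, 2, np.nan, 10, None])
assert s.dtype == np.dtype("float64")

# ... and non-string data with dtype=object is rejected instead of being
# inferred (see test_non_strings_dtype_object_pandas_compat_raises above).
with cudf.option_context("mode.pandas_compatible", True):
    try:
        cudf.Series([1], dtype=object)
    except TypeError:
        pass  # expected: object dtype is reserved for strings in this mode
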
diff --git a/python/cudf/cudf/tests/series/test_sorting.py b/python/cudf/cudf/tests/series/test_sorting.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_sorting.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/series/test_timeseries.py b/python/cudf/cudf/tests/series/test_timeseries.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/series/test_timeseries.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py deleted file mode 100644 index 45a0532ed8c..00000000000 --- a/python/cudf/cudf/tests/test_series.py +++ /dev/null @@ -1,3111 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. -import datetime -import decimal -import hashlib -import operator -import re -from collections import OrderedDict, defaultdict -from string import ascii_letters, digits - -import cupy as cp -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.api.extensions import no_default -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.core.column.column import as_column -from cudf.errors import MixedTypeError -from cudf.testing import assert_eq -from cudf.testing._utils import ( - NUMERIC_TYPES, - SERIES_OR_INDEX_NAMES, - TIMEDELTA_TYPES, - assert_exceptions_equal, - expect_warning_if, - gen_rand, -) - - -@pytest.fixture( - params=[ - pd.Series([0, 1, 2, np.nan, 4, None, 6]), - pd.Series( - [0, 1, 2, np.nan, 4, None, 6], - index=["q", "w", "e", "r", "t", "y", "u"], - name="a", - ), - pd.Series([0, 1, 2, 3, 4]), - pd.Series(["a", "b", "u", "h", "d"]), - pd.Series([None, None, np.nan, None, np.inf, -np.inf]), - pd.Series([], dtype="float64"), - pd.Series( - [pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")] - ), - pd.Series([np.nan]), - pd.Series([None]), - pd.Series(["a", "b", "", "c", None, "e"]), - ] -) -def ps(request): - return request.param - - -@pytest.mark.parametrize( - "data", - [ - {"a": 1, "b": 2, "c": 24, "d": 1010}, - {"a": 1}, - {1: "a", 2: "b", 24: "c", 1010: "d"}, - {1: "a"}, - ], -) -def test_series_init_dict(data): - pandas_series = pd.Series(data) - cudf_series = cudf.Series(data) - - assert_eq(pandas_series, cudf_series) - - -@pytest.mark.parametrize( - "data", - [ - { - "a": [1, 2, 3], - "b": [2, 3, 5], - "c": [24, 12212, 22233], - "d": [1010, 101010, 1111], - }, - {"a": [1]}, - ], -) -def test_series_init_dict_lists(data): - assert_eq(pd.Series(data), cudf.Series(data)) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - [1.0, 12.221, 12.34, 13.324, 324.3242], - [-10, -1111, 100, 11, 133], - ], -) -@pytest.mark.parametrize( - "others", - [ - [10, 11, 12, 13], - [0.1, 0.002, 324.2332, 0.2342], - [-10, -1111, 100, 11, 133], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_concat_basic(data, others, ignore_index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - other_ps = pd.Series(others) - other_gs = cudf.Series(others) - - expected = pd.concat([psr, other_ps], ignore_index=ignore_index) - actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [ - "abc", - "def", - "this is a string", - "this is another string", - "a", - "b", - "c", - ], - ["a"], - ], -) -@pytest.mark.parametrize( - "others", - [ - [ - 
"abc", - "def", - "this is a string", - "this is another string", - "a", - "b", - "c", - ], - ["a"], - ["1", "2", "3", "4", "5"], - ["+", "-", "!", "_", "="], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_concat_basic_str(data, others, ignore_index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - other_ps = pd.Series(others) - other_gs = cudf.Series(others) - - expected = pd.concat([psr, other_ps], ignore_index=ignore_index) - actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series( - [ - "abc", - "def", - "this is a string", - "this is another string", - "a", - "b", - "c", - ], - index=[10, 20, 30, 40, 50, 60, 70], - ), - pd.Series(["a"], index=[2]), - ], -) -@pytest.mark.parametrize( - "others", - [ - pd.Series( - [ - "abc", - "def", - "this is a string", - "this is another string", - "a", - "b", - "c", - ], - index=[10, 20, 30, 40, 50, 60, 70], - ), - pd.Series(["a"], index=[133]), - pd.Series(["1", "2", "3", "4", "5"], index=[-10, 22, 33, 44, 49]), - pd.Series(["+", "-", "!", "_", "="], index=[11, 22, 33, 44, 2]), - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_concat_series_with_index(data, others, ignore_index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - other_ps = others - other_gs = cudf.from_pandas(others) - - expected = pd.concat([psr, other_ps], ignore_index=ignore_index) - actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) - - assert_eq(expected, actual) - - -def test_series_concat_error_mixed_types(): - gsr = cudf.Series([1, 2, 3, 4]) - other = cudf.Series(["a", "b", "c", "d"]) - - with pytest.raises( - TypeError, - match="cudf does not support mixed types, please type-cast " - "both series to same dtypes.", - ): - cudf.concat([gsr, other]) - - with pytest.raises( - TypeError, - match="cudf does not support mixed types, please type-cast " - "both series to same dtypes.", - ): - cudf.concat([gsr, gsr, other, gsr, other]) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]), - pd.Series( - [1.0, 12.221, 12.34, 13.324, 324.3242], - index=[ - "float one", - "float two", - "float three", - "float four", - "float five", - ], - ), - pd.Series( - [-10, -1111, 100, 11, 133], - index=["one", "two", "three", "four", "five"], - ), - ], -) -@pytest.mark.parametrize( - "others", - [ - [ - pd.Series([10, 11, 12, 13], index=["a", "b", "c", "d"]), - pd.Series([12, 14, 15, 27], index=["d", "e", "z", "x"]), - ], - [ - pd.Series([10, 11, 12, 13], index=["a", "b", "c", "d"]), - pd.Series([12, 14, 15, 27], index=["d", "e", "z", "x"]), - ] - * 25, - [ - pd.Series( - [0.1, 0.002, 324.2332, 0.2342], index=["-", "+", "%", "#"] - ), - pd.Series([12, 14, 15, 27], index=["d", "e", "z", "x"]), - ] - * 46, - [ - pd.Series( - [-10, -1111, 100, 11, 133], - index=["aa", "vv", "bb", "dd", "ll"], - ) - ], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_concat_list_series_with_index(data, others, ignore_index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - other_ps = others - other_gs = [cudf.from_pandas(obj) for obj in others] - - expected = pd.concat([psr, *other_ps], ignore_index=ignore_index) - actual = cudf.concat([gsr, *other_gs], ignore_index=ignore_index) - - assert_eq(expected, actual) - - -def test_series_concat_existing_buffers(): - a1 = np.arange(10, dtype=np.float64) - gs = cudf.Series(a1) - - # Add new buffer - a2 = 
cudf.Series(np.arange(5)) - gs = cudf.concat([gs, a2]) - assert len(gs) == 15 - np.testing.assert_equal(gs.to_numpy(), np.hstack([a1, a2.to_numpy()])) - - # Ensure appending to previous buffer - a3 = cudf.Series(np.arange(3)) - gs = cudf.concat([gs, a3]) - assert len(gs) == 18 - a4 = np.hstack([a1, a2.to_numpy(), a3.to_numpy()]) - np.testing.assert_equal(gs.to_numpy(), a4) - - # Appending different dtype - a5 = cudf.Series(np.array([1, 2, 3], dtype=np.int32)) - a6 = cudf.Series(np.array([4.5, 5.5, 6.5], dtype=np.float64)) - gs = cudf.concat([a5, a6]) - np.testing.assert_equal( - gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()]) - ) - gs = cudf.concat([cudf.Series(a6), a5]) - np.testing.assert_equal( - gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) - ) - - -def test_series_column_iter_error(): - gs = cudf.Series([1, 2, 3]) - - with pytest.raises( - TypeError, - match=re.escape( - f"{gs.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." - ), - ): - iter(gs) - - with pytest.raises( - TypeError, - match=re.escape( - f"{gs.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." - ), - ): - gs.items() - - with pytest.raises( - TypeError, - match=re.escape( - f"{gs.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." - ), - ): - gs.iteritems() - - with pytest.raises(TypeError): - iter(gs._column) - - -@pytest.mark.parametrize( - "data", - [ - [1.0, 2.0, None, 4.0, 5.0], - ["a", "b", "c", "d", "e"], - ["a", "b", None, "d", "e"], - [None, None, None, None, None], - np.array(["1991-11-20", "2004-12-04"], dtype=np.datetime64), - np.array(["1991-11-20", None], dtype=np.datetime64), - np.array( - ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], dtype=np.datetime64 - ), - np.array(["1991-11-20 05:15:00", None], dtype=np.datetime64), - ], -) -def test_series_tolist(data): - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - - with pytest.raises( - TypeError, - match=re.escape( - r"cuDF does not support conversion to host memory " - r"via the `tolist()` method. Consider using " - r"`.to_arrow().to_pylist()` to construct a Python list." - ), - ): - gsr.tolist() - - -@pytest.mark.parametrize( - "data", - [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], -) -def test_series_size(data): - psr = pd.Series(data) - gsr = cudf.Series(data) - - assert_eq(psr.size, gsr.size) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -def test_series_describe_numeric(dtype): - ps = pd.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype) - gs = cudf.from_pandas(ps) - actual = gs.describe() - expected = ps.describe() - - assert_eq(expected, actual, check_dtype=True) - - -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_series_describe_datetime(dtype): - # Note that other datetime units are not tested because pandas does not - # support them. When specified coarser units, cuDF datetime columns cannot - # represent fractional time for quantiles of the column, which may require - # interpolation, this differs from pandas which always stay in [ns] unit. - gs = cudf.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype) - ps = gs.to_pandas() - - # Treating datetimes as categoricals is deprecated in pandas and will - # be removed in future. 
Future behavior is treating datetime as numeric. - expected = ps.describe() - actual = gs.describe() - - assert_eq(expected.astype("str"), actual) - - -@pytest.mark.parametrize("dtype", TIMEDELTA_TYPES) -def test_series_describe_timedelta(dtype): - ps = pd.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype) - gs = cudf.from_pandas(ps) - - expected = ps.describe() - actual = gs.describe() - - assert_eq(actual, expected.astype("str")) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series(["a", "b", "c", "d", "e", "a"]), - pd.Series([True, False, True, True, False]), - pd.Series([], dtype="str"), - pd.Series(["a", "b", "c", "a"], dtype="category"), - pd.Series(["d", "e", "f"], dtype="category"), - pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"])), - pd.Series( - pd.Categorical( - ["d", "e", "f"], categories=["f", "e", "d"], ordered=True - ) - ), - ], -) -def test_series_describe_other_types(ps): - gs = cudf.from_pandas(ps) - - expected = ps.describe() - actual = gs.describe() - - if len(ps) == 0: - assert_eq(expected.fillna("a").astype("str"), actual.fillna("a")) - else: - assert_eq(expected.astype("str"), actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 2, 1], - [1, 2, None, 3, 1, 1], - [], - ["a", "b", "c", None, "z", "a"], - ], -) -@pytest.mark.parametrize("use_na_sentinel", [True, False]) -def test_series_factorize_use_na_sentinel(data, use_na_sentinel): - gsr = cudf.Series(data) - psr = gsr.to_pandas(nullable=True) - - expected_labels, expected_cats = psr.factorize( - use_na_sentinel=use_na_sentinel, sort=True - ) - actual_labels, actual_cats = gsr.factorize( - use_na_sentinel=use_na_sentinel, sort=True - ) - assert_eq(expected_labels, actual_labels.get()) - assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 2, 1], - [1, 2, None, 3, 1, 1], - [], - ["a", "b", "c", None, "z", "a"], - ], -) -@pytest.mark.parametrize("sort", [True, False]) -def test_series_factorize_sort(data, sort): - gsr = cudf.Series(data) - psr = gsr.to_pandas(nullable=True) - - expected_labels, expected_cats = psr.factorize(sort=sort) - actual_labels, actual_cats = gsr.factorize(sort=sort) - assert_eq(expected_labels, actual_labels.get()) - assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([], dtype="datetime64[ns]"), - pd.Series(pd.date_range("2010-01-01", "2010-02-01")), - pd.Series([None, None], dtype="datetime64[ns]"), - ], -) -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_series_datetime_value_counts(data, nulls, normalize, dropna): - psr = data.copy() - rng = np.random.default_rng(seed=0) - if len(data) > 0: - if nulls == "one": - p = rng.integers(0, len(data)) - psr[p] = None - elif nulls == "some": - p = rng.integers(0, len(data), 2) - psr[p] = None - - gsr = cudf.from_pandas(psr) - expected = psr.value_counts(dropna=dropna, normalize=normalize) - got = gsr.value_counts(dropna=dropna, normalize=normalize) - - assert_eq(expected.sort_index(), got.sort_index(), check_dtype=False) - assert_eq( - expected.reset_index(drop=True), - got.reset_index(drop=True), - check_dtype=False, - check_index_type=True, - ) - - -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize("num_elements", [10, 100, 1000]) -def test_categorical_value_counts(dropna, normalize, 
num_elements): - # create categorical series - rng = np.random.default_rng(seed=12) - pd_cat = pd.Categorical( - pd.Series( - rng.choice(list(ascii_letters + digits), num_elements), - dtype="category", - ) - ) - - # gdf - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series.from_pandas(pd_cat) - gdf_value_counts = gdf["a"].value_counts( - dropna=dropna, normalize=normalize - ) - - # pandas - pdf = pd.DataFrame() - pdf["a"] = pd_cat - pdf_value_counts = pdf["a"].value_counts( - dropna=dropna, normalize=normalize - ) - - # verify - assert_eq( - pdf_value_counts.sort_index(), - gdf_value_counts.sort_index(), - check_dtype=False, - check_index_type=True, - ) - assert_eq( - pdf_value_counts.reset_index(drop=True), - gdf_value_counts.reset_index(drop=True), - check_dtype=False, - check_index_type=True, - ) - - -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -def test_series_value_counts(dropna, normalize): - rng = np.random.default_rng(seed=0) - for size in [10**x for x in range(5)]: - arr = rng.integers(low=-1, high=10, size=size) - mask = arr != -1 - sr = cudf.Series._from_column( - as_column(arr).set_mask(cudf.Series(mask)._column.as_mask()) - ) - sr.name = "col" - - expect = ( - sr.to_pandas() - .value_counts(dropna=dropna, normalize=normalize) - .sort_index() - ) - got = sr.value_counts(dropna=dropna, normalize=normalize).sort_index() - - assert_eq(expect, got, check_dtype=True, check_index_type=False) - - -@pytest.mark.parametrize("bins", [1, 2, 3]) -def test_series_value_counts_bins(bins): - psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0]) - gsr = cudf.from_pandas(psr) - - expected = psr.value_counts(bins=bins) - got = gsr.value_counts(bins=bins) - - assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) - - -@pytest.mark.parametrize("bins", [1, 2, 3]) -@pytest.mark.parametrize("dropna", [True, False]) -def test_series_value_counts_bins_dropna(bins, dropna): - psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0, np.nan]) - gsr = cudf.from_pandas(psr) - - expected = psr.value_counts(bins=bins, dropna=dropna) - got = gsr.value_counts(bins=bins, dropna=dropna) - - assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -def test_series_value_counts_optional_arguments(ascending, dropna, normalize): - psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0, None]) - gsr = cudf.from_pandas(psr) - - expected = psr.value_counts( - ascending=ascending, dropna=dropna, normalize=normalize - ) - got = gsr.value_counts( - ascending=ascending, dropna=dropna, normalize=normalize - ) - - assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) - assert_eq( - expected.reset_index(drop=True), - got.reset_index(drop=True), - check_dtype=True, - ) - - -@pytest.mark.parametrize( - "gs", - [ - cudf.Series([1, 2, 3]), - cudf.Series([None]), - cudf.Series([4]), - cudf.Series([2, 3, -1, 0, 1], name="test name"), - cudf.Series( - [1, 2, 3, None, 2, 1], index=["a", "v", "d", "e", "f", "g"] - ), - cudf.Series([1, 2, 3, None, 2, 1, None], name="abc"), - cudf.Series(["ab", "bc", "ab", None, "bc", None, None]), - cudf.Series([None, None, None, None, None], dtype="str"), - cudf.Series([None, None, None, None, None]), - cudf.Series( - [ - 123213, - 23123, - 123123, - 12213123, - 12213123, - 12213123, - 23123, - 2312323123, - None, - None, - ], - dtype="timedelta64[ns]", - ), - 
cudf.Series( - [ - None, - 1, - 2, - 3242434, - 3233243, - 1, - 2, - 1023, - None, - 12213123, - None, - 2312323123, - None, - None, - ], - dtype="datetime64[ns]", - ), - cudf.Series(name="empty series", dtype="float64"), - cudf.Series(["a", "b", "c", " ", "a", "b", "z"], dtype="category"), - ], -) -@pytest.mark.parametrize("dropna", [True, False]) -def test_series_mode(gs, dropna): - ps = gs.to_pandas() - - expected = ps.mode(dropna=dropna) - actual = gs.mode(dropna=dropna) - - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "arr", - [ - np.random.default_rng(seed=0).normal(-100, 100, 1000), - np.random.default_rng(seed=0).integers(-50, 50, 1000), - np.zeros(100), - np.repeat([-0.6459412758761901], 100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - np.arange(-100.5, 101.5, 1), - ], -) -@pytest.mark.parametrize("decimals", [-5, -3, -1, 0, 1, 4, 12, np.int8(1)]) -def test_series_round(arr, decimals): - pser = pd.Series(arr) - ser = cudf.Series(arr) - result = ser.round(decimals) - expected = pser.round(decimals) - - assert_eq(result, expected) - rng = np.random.default_rng(seed=0) - # with nulls, maintaining existing null mask - arr = arr.astype("float64") # for pandas nulls - arr.ravel()[rng.choice(arr.shape[0], arr.shape[0] // 2, replace=False)] = ( - np.nan - ) - - pser = pd.Series(arr) - ser = cudf.Series(arr) - result = ser.round(decimals) - expected = pser.round(decimals) - - assert_eq(result, expected) - - -def test_series_round_half_up(): - s = cudf.Series([0.0, 1.0, 1.2, 1.7, 0.5, 1.5, 2.5, None]) - expect = cudf.Series([0.0, 1.0, 1.0, 2.0, 1.0, 2.0, 3.0, None]) - got = s.round(how="half_up") - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "series_data", - [ - [1.0, None, np.nan, 4.0], - [1.24430, None, np.nan, 4.423530], - [1.24430, np.nan, 4.423530], - [-1.24430, np.nan, -4.423530], - np.repeat(np.nan, 100), - ], -) -@pytest.mark.parametrize("decimal", [0, 1, 2, 3]) -def test_round_nan_as_null_false(series_data, decimal): - series = cudf.Series(series_data, nan_as_null=False) - pser = series.to_pandas() - result = series.round(decimal) - expected = pser.round(decimal) - assert_eq(result, expected, atol=1e-10) - - -@pytest.mark.parametrize( - "data, dtype, decimals, expected_half_up, expected_half_even", - [ - ( - [1.234, 2.345, 3.456], - cudf.Decimal32Dtype(precision=5, scale=3), - 2, - [1.23, 2.35, 3.46], - [1.23, 2.34, 3.46], - ), - ( - [1.234, 2.345, 3.456], - cudf.Decimal32Dtype(precision=5, scale=3), - 0, - [1.0, 2.0, 3.0], - [1.0, 2.0, 3.0], - ), - ( - [1.234, 2.345, 3.456], - cudf.Decimal32Dtype(precision=5, scale=3), - 3, - [1.234, 2.345, 3.456], - [1.234, 2.345, 3.456], - ), - ( - [1.234567, 2.345678, 3.456789], - cudf.Decimal64Dtype(precision=10, scale=6), - 4, - [1.2346, 2.3457, 3.4568], - [1.2346, 2.3457, 3.4568], - ), - ( - [1.234567, 2.345678, 3.456789], - cudf.Decimal64Dtype(precision=10, scale=6), - 2, - [1.23, 2.35, 3.46], - [1.23, 2.35, 3.46], - ), - ( - [1.234567, 2.345678, 3.456789], - cudf.Decimal64Dtype(precision=10, scale=6), - 6, - [1.234567, 2.345678, 3.456789], - [1.234567, 2.345678, 3.456789], - ), - ], -) -def test_series_round_decimal( - data, dtype, decimals, expected_half_up, expected_half_even -): - ser = cudf.Series(data).astype(dtype) - - result_half_up = ser.round(decimals=decimals, how="half_up").astype(dtype) - expected_ser_half_up = cudf.Series(expected_half_up).astype(dtype) - assert_eq(result_half_up, expected_ser_half_up) - - result_half_even = ser.round(decimals=decimals, 
how="half_even").astype( - dtype - ) - expected_ser_half_even = cudf.Series(expected_half_even).astype(dtype) - assert_eq(result_half_even, expected_ser_half_even) - - -@pytest.mark.parametrize("nan_as_null", [True, False, None]) -def test_series_isnull_isna(ps, nan_as_null): - nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x)) - if nan_as_null is False and ( - nan_contains.any() and not nan_contains.all() and ps.dtype == object - ): - with pytest.raises(MixedTypeError): - cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) - else: - gs = cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) - - assert_eq(ps.isnull(), gs.isnull()) - assert_eq(ps.isna(), gs.isna()) - - -@pytest.mark.parametrize("nan_as_null", [True, False, None]) -def test_series_notnull_notna(ps, nan_as_null): - nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x)) - if nan_as_null is False and ( - nan_contains.any() and not nan_contains.all() and ps.dtype == object - ): - with pytest.raises(MixedTypeError): - cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) - else: - gs = cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) - - assert_eq(ps.notnull(), gs.notnull()) - assert_eq(ps.notna(), gs.notna()) - - -@pytest.mark.parametrize( - "sr1", [pd.Series([10, 11, 12], index=["a", "b", "z"]), pd.Series(["a"])] -) -@pytest.mark.parametrize( - "sr2", - [pd.Series([], dtype="float64"), pd.Series(["a", "a", "c", "z", "A"])], -) -@pytest.mark.parametrize( - "op", - [ - operator.eq, - operator.ne, - operator.lt, - operator.gt, - operator.le, - operator.ge, - ], -) -def test_series_error_equality(sr1, sr2, op): - gsr1 = cudf.from_pandas(sr1) - gsr2 = cudf.from_pandas(sr2) - - assert_exceptions_equal(op, op, ([sr1, sr2],), ([gsr1, gsr2],)) - - -def test_series_memory_usage(): - sr = cudf.Series([1, 2, 3, 4], dtype="int64") - assert sr.memory_usage() == 32 - - sliced_sr = sr[2:] - assert sliced_sr.memory_usage() == 16 - - sliced_sr[3] = None - assert sliced_sr.memory_usage() == 80 - - sr = cudf.Series(["hello world", "rapids ai", "abc", "z"]) - assert sr.memory_usage() == 44 - - assert sr[3:].memory_usage() == 9 # z - assert sr[:1].memory_usage() == 19 # hello world - - -@pytest.mark.parametrize( - "sr_data,expected_psr", - [ - ( - pa.array([1, 2, None, 3], type=pa.uint8()), - pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()), - ), - ( - pa.array([23, None, None, 32], type=pa.uint16()), - pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()), - ), - ( - pa.array([None, 123, None, 1], type=pa.uint32()), - pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()), - ), - ( - pa.array([234, 2323, 23432, None, None, 224], type=pa.uint64()), - pd.Series( - [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype() - ), - ), - ( - pa.array([-10, 1, None, -1, None, 3], type=pa.int8()), - pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()), - ), - ( - pa.array([111, None, 222, None, 13], type=pa.int16()), - pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()), - ), - ( - pa.array([11, None, 22, 33, None, 2, None, 3], type=pa.int32()), - pd.Series( - [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype() - ), - ), - ( - pa.array( - [32431, None, None, 32322, 0, 10, -32324, None], - type=pa.int64(), - ), - pd.Series( - [32431, None, None, 32322, 0, 10, -32324, None], - dtype=pd.Int64Dtype(), - ), - ), - ( - pa.array( - [True, None, False, None, False, True, True, False], - type=pa.bool_(), - ), - pd.Series( - [True, None, False, None, False, True, True, False], - 
dtype=pd.BooleanDtype(), - ), - ), - ( - pa.array( - [ - "abc", - "a", - None, - "hello world", - "foo buzz", - "", - None, - "rapids ai", - ], - type=pa.string(), - ), - pd.Series( - [ - "abc", - "a", - None, - "hello world", - "foo buzz", - "", - None, - "rapids ai", - ], - dtype=pd.StringDtype(), - ), - ), - ( - pa.array( - [1, 2, None, 10.2, None], - type=pa.float32(), - ), - pd.Series( - [1, 2, None, 10.2, None], - dtype=pd.Float32Dtype(), - ), - ), - ], -) -def test_series_to_pandas_nullable_dtypes(sr_data, expected_psr): - sr = cudf.Series(sr_data) - actual_psr = sr.to_pandas(nullable=True) - - assert_eq(actual_psr, expected_psr) - - -def test_series_pipe(): - psr = pd.Series([10, 20, 30, 40]) - gsr = cudf.Series([10, 20, 30, 40]) - - def custom_add_func(sr, val): - new_sr = sr + val - return new_sr - - def custom_to_str_func(sr, val): - new_sr = sr.astype("str") + val - return new_sr - - expected = ( - psr.pipe(custom_add_func, 11) - .pipe(custom_add_func, val=12) - .pipe(custom_to_str_func, "rapids") - ) - actual = ( - gsr.pipe(custom_add_func, 11) - .pipe(custom_add_func, val=12) - .pipe(custom_to_str_func, "rapids") - ) - - assert_eq(expected, actual) - - expected = ( - psr.pipe((custom_add_func, "sr"), val=11) - .pipe(custom_add_func, val=1) - .pipe(custom_to_str_func, "rapids-ai") - ) - actual = ( - gsr.pipe((custom_add_func, "sr"), val=11) - .pipe(custom_add_func, val=1) - .pipe(custom_to_str_func, "rapids-ai") - ) - - assert_eq(expected, actual) - - -def test_series_pipe_error(): - psr = pd.Series([10, 20, 30, 40]) - gsr = cudf.Series([10, 20, 30, 40]) - - def custom_add_func(sr, val): - new_sr = sr + val - return new_sr - - assert_exceptions_equal( - lfunc=psr.pipe, - rfunc=gsr.pipe, - lfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), - rfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), - ) - - -@pytest.mark.parametrize( - "pd_data", - [pd.Series([1, 2, 3]), pd.Series([10, 11, 12], index=[1, 2, 3])], -) -@pytest.mark.parametrize( - "other", - [ - pd.Series([4, 5, 6]), - pd.Series([4, 5, 6, 7, 8]), - pd.Series([4, np.nan, 6]), - [4, np.nan, 6], - {1: 9}, - ], -) -def test_series_update(pd_data, other): - data = cudf.Series.from_pandas(pd_data) - gs = data.copy(deep=True) - if isinstance(other, pd.Series): - other = cudf.Series.from_pandas(other, nan_as_null=False) - g_other = other.copy(deep=True) - p_other = g_other.to_pandas() - else: - g_other = other - p_other = other - - ps = gs.to_pandas() - - ps.update(p_other) - gs.update(g_other) - assert_eq(gs, ps) - - -@pytest.mark.parametrize( - "data", - [ - [1, None, 11, 2.0, np.nan], - [np.nan], - [None, None, None], - [np.nan, 1, 10, 393.32, np.nan], - ], -) -@pytest.mark.parametrize("nan_as_null", [True, False]) -@pytest.mark.parametrize("fill_value", [1.2, 332, np.nan]) -def test_fillna_with_nan(data, nan_as_null, fill_value): - gs = cudf.Series(data, dtype="float64", nan_as_null=nan_as_null) - ps = gs.to_pandas() - - expected = ps.fillna(fill_value) - actual = gs.fillna(fill_value) - - assert_eq(expected, actual) - - -def test_fillna_categorical_with_non_categorical_raises(): - ser = cudf.Series([1, None], dtype="category") - with pytest.raises(TypeError): - ser.fillna(cudf.Series([1, 2])) - - -def test_fillna_categorical_with_different_categories_raises(): - ser = cudf.Series([1, None], dtype="category") - with pytest.raises(TypeError): - ser.fillna(cudf.Series([1, 2]), dtype="category") - - -def test_series_mask_mixed_dtypes_error(): - s = cudf.Series(["a", "b", "c"]) - with pytest.raises( - 
TypeError, - match=re.escape( - "cudf does not support mixed types, please type-cast " - "the column of dataframe/series and other " - "to same dtypes." - ), - ): - s.where([True, False, True], [1, 2, 3]) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series(["a"] * 20, index=range(0, 20)), - pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), - pd.Series( - ["b", None] * 5, - index=pd.Index(list(range(10)), dtype="uint64"), - name="BSeries", - ), - ], -) -@pytest.mark.parametrize( - "labels", - [ - [1], - [0], - 1, - 5, - [5, 9], - pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - pd.Index([0, 1, 2, 3, 4], dtype="float32"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_drop_labels(ps, labels, inplace): - ps = ps.copy() - gs = cudf.from_pandas(ps) - - expected = ps.drop(labels=labels, axis=0, inplace=inplace) - actual = gs.drop(labels=labels, axis=0, inplace=inplace) - - if inplace: - expected = ps - actual = gs - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series(["a"] * 20, index=range(0, 20)), - pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), - ], -) -@pytest.mark.parametrize( - "index", - [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_drop_index(ps, index, inplace): - ps = ps.copy() - gs = cudf.from_pandas(ps) - - expected = ps.drop(index=index, inplace=inplace) - actual = gs.drop(index=index, inplace=inplace) - - if inplace: - expected = ps - actual = gs - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series( - ["a" if i % 2 == 0 else "b" for i in range(0, 10)], - index=pd.MultiIndex( - levels=[ - ["lama", "cow", "falcon"], - ["speed", "weight", "length"], - ], - codes=[ - [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], - [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], - ], - ), - name="abc", - ) - ], -) -@pytest.mark.parametrize( - "index,level", - [ - ("cow", 0), - ("lama", 0), - ("falcon", 0), - ("speed", 1), - ("weight", 1), - ("length", 1), - ( - "cow", - None, - ), - ( - "lama", - None, - ), - ( - "falcon", - None, - ), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_drop_multiindex(ps, index, level, inplace): - ps = ps.copy() - gs = cudf.from_pandas(ps) - - expected = ps.drop(index=index, inplace=inplace, level=level) - actual = gs.drop(index=index, inplace=inplace, level=level) - - if inplace: - expected = ps - actual = gs - - assert_eq(expected, actual) - - -def test_series_drop_edge_inputs(): - gs = cudf.Series([42], name="a") - ps = gs.to_pandas() - - assert_eq(ps.drop(columns=["b"]), gs.drop(columns=["b"])) - - assert_eq(ps.drop(columns="b"), gs.drop(columns="b")) - - assert_exceptions_equal( - lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - ) - - assert_exceptions_equal( - lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=([], {}), - rfunc_args_and_kwargs=([], {}), - ) - - assert_exceptions_equal( - lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=(["b"], {"axis": 1}), - rfunc_args_and_kwargs=(["b"], {"axis": 1}), - ) - - -def test_series_drop_raises(): - gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c") - ps = gs.to_pandas() - - assert_exceptions_equal( - lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=(["p"],), - rfunc_args_and_kwargs=(["p"],), - ) - - # dtype specified mismatch - assert_exceptions_equal( - 
lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=([3],), - rfunc_args_and_kwargs=([3],), - ) - - expect = ps.drop("p", errors="ignore") - actual = gs.drop("p", errors="ignore") - - assert_eq(actual, expect) - - -@pytest.mark.parametrize( - "data", - [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize( - "p_index", - [ - None, - ["ia", "ib", "ic", "id", "ie"], - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] - ), - ], -) -def test_explode(data, ignore_index, p_index): - pdf = pd.Series(data, index=p_index, name="someseries") - gdf = cudf.from_pandas(pdf) - - expect = pdf.explode(ignore_index) - got = gdf.explode(ignore_index) - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - [[1, 2, 3], [10, 20]], - [[1.0, 2.0, 3.0], None, [10.0, 20.0, np.nan]], - [[5, 6], None, [1]], - [None, None, None, None, None, [10, 20]], - ], -) -@pytest.mark.parametrize("klass", [cudf.Series, list, cp.array]) -def test_nested_series_from_sequence_data(data, klass): - actual = cudf.Series( - [klass(val) if val is not None else val for val in data] - ) - expected = cudf.Series(data) - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data", - [ - cp.ones(5, dtype=cp.float16), - np.ones(5, dtype="float16"), - pd.Series([0.1, 1.2, 3.3], dtype="float16"), - pytest.param( - pa.array(np.ones(5, dtype="float16")), - marks=pytest.mark.xfail( - reason="https://issues.apache.org/jira/browse/ARROW-13762" - ), - ), - ], -) -def test_series_raises_float16(data): - with pytest.raises(TypeError): - cudf.Series(data) - - -@pytest.mark.parametrize( - "index", - [ - pd.RangeIndex(0, 3, 1), - [3.0, 1.0, np.nan], - ["a", "z", None], - pd.RangeIndex(4, -1, -2), - ], -) -@pytest.mark.parametrize("axis", [0, "index"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_series_sort_index( - index, axis, ascending, inplace, ignore_index, na_position -): - ps = pd.Series([10, 3, 12], index=index) - gs = cudf.from_pandas(ps) - - expected = ps.sort_index( - axis=axis, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - got = gs.sort_index( - axis=axis, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - - if inplace is True: - assert_eq(ps, gs, check_index_type=True) - else: - assert_eq(expected, got, check_index_type=True) - - -@pytest.mark.parametrize( - "method", ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] -) -def test_series_hash_values(method): - inputs = cudf.Series( - [ - "", - "0", - "A 56 character string to test message padding algorithm.", - "A 63 character string to test message padding algorithm, again.", - "A 64 character string to test message padding algorithm, again!!", - ( - "A very long (greater than 128 bytes/char string) to execute " - "a multi hash-step data point in the hash function being " - "tested. This string needed to be longer." 
- ), - "All work and no play makes Jack a dull boy", - "!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", - "\x00\x00\x00\x10\x00\x00\x00\x00", - "\x00\x00\x00\x00", - ] - ) - - def hashlib_compute_digest(data): - hasher = getattr(hashlib, method)() - hasher.update(data.encode("utf-8")) - return hasher.hexdigest() - - hashlib_validation = inputs.to_pandas().apply(hashlib_compute_digest) - validation_results = cudf.Series(hashlib_validation) - hash_values = inputs.hash_values(method=method) - assert_eq(hash_values, validation_results) - - -def test_series_hash_values_invalid_method(): - inputs = cudf.Series(["", "0"]) - with pytest.raises(ValueError): - inputs.hash_values(method="invalid_method") - - -def test_set_index_unequal_length(): - s = cudf.Series(dtype="float64") - with pytest.raises(ValueError): - s.index = [1, 2, 3] - - -@pytest.mark.parametrize( - "lhs, rhs", [("a", "a"), ("a", "b"), (1, 1.0), (None, None), (None, "a")] -) -def test_equals_names(lhs, rhs): - lhs = cudf.Series([1, 2], name=lhs) - rhs = cudf.Series([1, 2], name=rhs) - - got = lhs.equals(rhs) - expect = lhs.to_pandas().equals(rhs.to_pandas()) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", [[True, False, None, True, False], [None, None], []] -) -@pytest.mark.parametrize("bool_dtype", ["bool", "boolean", pd.BooleanDtype()]) -def test_nullable_bool_dtype_series(data, bool_dtype): - psr = pd.Series(data, dtype=pd.BooleanDtype()) - gsr = cudf.Series(data, dtype=bool_dtype) - - assert_eq(psr, gsr.to_pandas(nullable=True)) - - -@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("original_name", [None, "original_ser"]) -@pytest.mark.parametrize("name", [None, "ser", no_default]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_reset_index(level, drop, inplace, original_name, name): - midx = pd.MultiIndex.from_tuples( - [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] - ) - ps = pd.Series(range(4), index=midx, name=original_name) - gs = cudf.from_pandas(ps) - - if not drop and inplace: - pytest.skip( - "For exception checks, see " - "test_reset_index_dup_level_name_exceptions" - ) - - expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) - - got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace) - if inplace: - expect = ps - got = gs - - assert_eq(expect, got) - - -@pytest.mark.parametrize("level", [None, 0, 1, [None]]) -@pytest.mark.parametrize("drop", [False, True]) -@pytest.mark.parametrize("inplace", [False, True]) -@pytest.mark.parametrize("original_name", [None, "original_ser"]) -@pytest.mark.parametrize("name", [None, "ser"]) -def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): - # midx levels are named [None, None] - midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) - ps = pd.Series(range(4), index=midx, name=original_name) - gs = cudf.from_pandas(ps) - if level == [None] or not drop and inplace: - pytest.skip( - "For exception checks, see " - "test_reset_index_dup_level_name_exceptions" - ) - - expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) - got = gs.reset_index(level=level, drop=drop, inplace=inplace, name=name) - if inplace: - expect = ps - got = gs - - assert_eq(expect, got) - - -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("original_name", [None, "original_ser"]) 
-@pytest.mark.parametrize("name", [None, "ser"]) -def test_reset_index_named(drop, inplace, original_name, name): - ps = pd.Series(range(4), index=["x", "y", "z", "w"], name=original_name) - gs = cudf.from_pandas(ps) - - ps.index.name = "cudf" - gs.index.name = "cudf" - - if not drop and inplace: - pytest.skip( - "For exception checks, see " - "test_reset_index_dup_level_name_exceptions" - ) - - expect = ps.reset_index(drop=drop, inplace=inplace, name=name) - got = gs.reset_index(drop=drop, inplace=inplace, name=name) - - if inplace: - expect = ps - got = gs - - assert_eq(expect, got) - - -def test_reset_index_dup_level_name_exceptions(): - midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) - ps = pd.Series(range(4), index=midx) - gs = cudf.from_pandas(ps) - - # Should specify duplicate level names with level number. - assert_exceptions_equal( - lfunc=ps.reset_index, - rfunc=gs.reset_index, - lfunc_args_and_kwargs=( - [], - {"level": [None]}, - ), - rfunc_args_and_kwargs=( - [], - {"level": [None]}, - ), - ) - - # Cannot use drop=False and inplace=True to turn a series into dataframe. - assert_exceptions_equal( - lfunc=ps.reset_index, - rfunc=gs.reset_index, - lfunc_args_and_kwargs=( - [], - {"drop": False, "inplace": True}, - ), - rfunc_args_and_kwargs=( - [], - {"drop": False, "inplace": True}, - ), - ) - - # Pandas raises the above exception should these two inputs crosses. - assert_exceptions_equal( - lfunc=ps.reset_index, - rfunc=gs.reset_index, - lfunc_args_and_kwargs=( - [], - {"level": [None], "drop": False, "inplace": True}, - ), - rfunc_args_and_kwargs=( - [], - {"level": [None], "drop": False, "inplace": True}, - ), - ) - - -def test_series_add_prefix(): - cd_s = cudf.Series([1, 2, 3, 4]) - pd_s = cd_s.to_pandas() - - got = cd_s.add_prefix("item_") - expected = pd_s.add_prefix("item_") - - assert_eq(got, expected) - - -def test_series_add_suffix(): - cd_s = cudf.Series([1, 2, 3, 4]) - pd_s = cd_s.to_pandas() - - got = cd_s.add_suffix("_item") - expected = pd_s.add_suffix("_item") - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [0.25, 0.5, 0.2, -0.05], - [0, 1, 2, np.nan, 4, cudf.NA, 6], - ], -) -@pytest.mark.parametrize("lag", [1, 2, 3, 4]) -def test_autocorr(data, lag): - cudf_series = cudf.Series(data) - psr = cudf_series.to_pandas() - - cudf_corr = cudf_series.autocorr(lag=lag) - - # autocorrelation is undefined (nan) for less than two entries, but pandas - # short-circuits when there are 0 entries and bypasses the numpy function - # call that generates an error. 
- num_both_valid = (psr.notna() & psr.shift(lag).notna()).sum() - with expect_warning_if(num_both_valid == 1, RuntimeWarning): - pd_corr = psr.autocorr(lag=lag) - - assert_eq(pd_corr, cudf_corr) - - -@pytest.mark.parametrize( - "data", - [ - [0, 1, 2, 3], - ["abc", "a", None, "hello world", "foo buzz", "", None, "rapids ai"], - ], -) -def test_series_transpose(data): - psr = pd.Series(data=data) - csr = cudf.Series(data=data) - - cudf_transposed = csr.transpose() - pd_transposed = psr.transpose() - cudf_property = csr.T - pd_property = psr.T - - assert_eq(pd_transposed, cudf_transposed) - assert_eq(pd_property, cudf_property) - assert_eq(cudf_transposed, csr) - - -@pytest.mark.parametrize( - "data", - [1, 3, 5, 7, 7], -) -def test_series_nunique(data): - cd_s = cudf.Series(data) - pd_s = cd_s.to_pandas() - - actual = cd_s.nunique() - expected = pd_s.nunique() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [1, 3, 5, 7, 7], -) -def test_series_nunique_index(data): - cd_s = cudf.Series(data) - pd_s = cd_s.to_pandas() - - actual = cd_s.index.nunique() - expected = pd_s.index.nunique() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [], - [1, 2, 3, 4], - ["a", "b", "c"], - [1.2, 2.2, 4.5], - [np.nan, np.nan], - [None, None, None], - ], -) -def test_axes(data): - csr = cudf.Series(data) - psr = csr.to_pandas() - - expected = psr.axes - actual = csr.axes - - for e, a in zip(expected, actual): - assert_eq(e, a) - - -def test_series_truncate(): - csr = cudf.Series([1, 2, 3, 4]) - psr = csr.to_pandas() - - assert_eq(csr.truncate(), psr.truncate()) - assert_eq(csr.truncate(1, 2), psr.truncate(1, 2)) - assert_eq(csr.truncate(before=1, after=2), psr.truncate(before=1, after=2)) - - -def test_series_truncate_errors(): - csr = cudf.Series([1, 2, 3, 4]) - with pytest.raises(ValueError): - csr.truncate(axis=1) - with pytest.raises(ValueError): - csr.truncate(copy=False) - - csr.index = [3, 2, 1, 6] - psr = csr.to_pandas() - assert_exceptions_equal( - lfunc=csr.truncate, - rfunc=psr.truncate, - ) - - -def test_series_truncate_datetimeindex(): - dates = cudf.date_range( - "2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s" - ) - csr = cudf.Series(range(len(dates)), index=dates) - psr = csr.to_pandas() - - assert_eq( - csr.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ), - psr.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ), - ) - - -@pytest.mark.parametrize( - "data", - [ - [], - [0, 12, 14], - [0, 14, 12, 12, 3, 10, 12, 14], - np.random.default_rng(seed=0).integers(-100, 100, 200), - pd.Series([0.0, 1.0, None, 10.0]), - [None, None, None, None], - [np.nan, None, -1, 2, 3], - [1, 2], - ], -) -@pytest.mark.parametrize( - "values", - [ - np.random.default_rng(seed=0).integers(-100, 100, 10), - [], - [np.nan, None, -1, 2, 3], - [1.0, 12.0, None, None, 120], - [0.1, 12.1, 14.1], - [0, 14, 12, 12, 3, 10, 12, 14, None], - [None, None, None], - ["0", "12", "14"], - ["0", "12", "14", "a"], - [1.0, 2.5], - ], -) -def test_isin_numeric(data, values): - rng = np.random.default_rng(seed=0) - index = rng.integers(0, 100, len(data)) - psr = pd.Series(data, index=index) - gsr = cudf.Series.from_pandas(psr, nan_as_null=False) - - expected = psr.isin(values) - got = gsr.isin(values) - - assert_eq(got, expected) - - -def test_fill_new_category(): - gs = cudf.Series(pd.Categorical(["a", "b", "c"])) - with pytest.raises(TypeError): - gs[0:1] = "d" - - -@pytest.mark.skipif( - PANDAS_VERSION < 
PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warning newly introduced in pandas-2.2.0", -) -@pytest.mark.parametrize( - "data", - [ - [], - pd.Series( - ["2018-01-01", "2019-04-03", None, "2019-12-30"], - dtype="datetime64[ns]", - ), - pd.Series( - [ - "2018-01-01", - "2019-04-03", - None, - "2019-12-30", - "2018-01-01", - "2018-01-01", - ], - dtype="datetime64[ns]", - ), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - [1514764800000000000, 1577664000000000000], - [ - 1514764800000000000, - 1577664000000000000, - 1577664000000000000, - 1577664000000000000, - 1514764800000000000, - ], - ["2019-04-03", "2019-12-30", "2012-01-01"], - [ - "2012-01-01", - "2012-01-01", - "2012-01-01", - "2019-04-03", - "2019-12-30", - "2012-01-01", - ], - ], -) -def test_isin_datetime(data, values): - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) - - is_len_str = isinstance(next(iter(values), None), str) and len(data) - with expect_warning_if(is_len_str): - got = gsr.isin(values) - with expect_warning_if(is_len_str): - expected = psr.isin(values) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [], - ["this", "is", None, "a", "test"], - ["test", "this", "test", "is", None, "test", "a", "test"], - ["0", "12", "14"], - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - ["this", "is"], - [None, None, None], - ["12", "14", "19"], - [12, 14, 19], - ["is", "this", "is", "this", "is"], - ], -) -def test_isin_string(data, values): - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) - - got = gsr.isin(values) - expected = psr.isin(values) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [], - pd.Series(["a", "b", "c", "c", "c", "d", "e"], dtype="category"), - pd.Series(["a", "b", None, "c", "d", "e"], dtype="category"), - pd.Series([0, 3, 10, 12], dtype="category"), - pd.Series([0, 3, 10, 12, 0, 10, 3, 0, 0, 3, 3], dtype="category"), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - ["a", "b", None, "f", "words"], - ["0", "12", None, "14"], - [0, 10, 12, None, 39, 40, 1000], - [0, 0, 0, 0, 3, 3, 3, None, 1, 2, 3], - ], -) -def test_isin_categorical(data, values): - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) - - got = gsr.isin(values) - expected = psr.isin(values) - assert_eq(got, expected) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) -@pytest.mark.parametrize("data_empty", [False, True]) -def test_diff(dtype, period, data_empty): - if data_empty: - data = None - else: - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, 100000, low=-2, high=2) - else: - data = gen_rand(dtype, 100000) - - gs = cudf.Series(data, dtype=dtype) - ps = pd.Series(data, dtype=dtype) - - expected_outcome = ps.diff(period) - diffed_outcome = gs.diff(period).astype(expected_outcome.dtype) - - if data_empty: - assert_eq(diffed_outcome, expected_outcome, check_index_type=False) - else: - assert_eq(diffed_outcome, expected_outcome) - - -def test_diff_unsupported_dtypes(): - gs = cudf.Series(["a", "b", "c", "d", "e"]) - with pytest.raises( - TypeError, - match=r"unsupported operand type\(s\)", - ): - gs.diff() - - -@pytest.mark.parametrize( - "data", - [ - pd.date_range("2020-01-01", "2020-01-06", freq="D"), - [True, True, True, False, True, True], - [1.0, 2.0, 3.5, 4.0, 5.0, -1.7], - [1, 2, 3, 3, 4, 5], - [np.nan, None, None, np.nan, np.nan, None], - ], -) -def test_diff_many_dtypes(data): - ps = pd.Series(data) - gs = 
cudf.from_pandas(ps) - assert_eq(ps.diff(), gs.diff()) - assert_eq(ps.diff(periods=2), gs.diff(periods=2)) - - -@pytest.mark.parametrize("num_rows", [1, 100]) -@pytest.mark.parametrize("num_bins", [1, 10]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "bool"]) -@pytest.mark.parametrize("series_bins", [True, False]) -def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - rng = np.random.default_rng(seed=0) - data = rng.integers(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(rng.integers(2, 95, num_bins).astype(dtype))) - s = cudf.Series(data) - if series_bins: - s_bins = cudf.Series(bins) - indices = s.digitize(s_bins, right) - else: - indices = s.digitize(bins, right) - np.testing.assert_array_equal( - np.digitize(data, bins, right), indices.to_numpy() - ) - - -def test_series_digitize_invalid_bins(): - rng = np.random.default_rng(seed=0) - s = cudf.Series(rng.integers(0, 30, 80), dtype="int32") - bins = cudf.Series([2, None, None, 50, 90], dtype="int32") - - with pytest.raises( - ValueError, match="`bins` cannot contain null entries." - ): - _ = s.digitize(bins) - - -@pytest.mark.parametrize( - "data,left,right", - [ - ([0, 1, 2, 3, 4, 5, 10], 0, 5), - ([0, 1, 2, 3, 4, 5, 10], 10, 1), - ([0, 1, 2, 3, 4, 5], [0, 10, 11] * 2, [1, 2, 5] * 2), - (["a", "few", "set", "of", "strings", "xyz", "abc"], "banana", "few"), - (["a", "few", "set", "of", "strings", "xyz", "abc"], "phone", "hello"), - ( - ["a", "few", "set", "of", "strings", "xyz", "abc"], - ["a", "hello", "rapids", "ai", "world", "chars", "strs"], - ["yes", "no", "hi", "bye", "test", "pass", "fail"], - ), - ([0, 1, 2, np.nan, 4, np.nan, 10], 10, 1), - ], -) -@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"]) -def test_series_between(data, left, right, inclusive): - ps = pd.Series(data) - gs = cudf.from_pandas(ps, nan_as_null=False) - - expected = ps.between(left, right, inclusive=inclusive) - actual = gs.between(left, right, inclusive=inclusive) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,left,right", - [ - ([0, 1, 2, None, 4, 5, 10], 0, 5), - ([0, 1, 2, 3, None, 5, 10], 10, 1), - ([None, 1, 2, 3, 4, None], [0, 10, 11] * 2, [1, 2, 5] * 2), - ( - ["a", "few", "set", None, "strings", "xyz", "abc"], - ["a", "hello", "rapids", "ai", "world", "chars", "strs"], - ["yes", "no", "hi", "bye", "test", "pass", "fail"], - ), - ], -) -@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"]) -def test_series_between_with_null(data, left, right, inclusive): - gs = cudf.Series(data) - ps = gs.to_pandas(nullable=True) - - expected = ps.between(left, right, inclusive=inclusive) - actual = gs.between(left, right, inclusive=inclusive) - - assert_eq(expected, actual.to_pandas(nullable=True)) - - -def test_default_construction(): - s = cudf.Series([np.int8(8), np.int16(128)]) - assert s.dtype == np.dtype("i2") - - -@pytest.mark.parametrize( - "data", [[0, 1, 2, 3, 4], range(5), [np.int8(8), np.int16(128)]] -) -def test_default_integer_bitwidth_construction(default_integer_bitwidth, data): - s = cudf.Series(data) - assert s.dtype == np.dtype(f"i{default_integer_bitwidth // 8}") - - -@pytest.mark.parametrize("data", [[1.5, 2.5, 4.5], [1000, 2000, 4000, 3.14]]) -def test_default_float_bitwidth_construction(default_float_bitwidth, data): - s = cudf.Series(data) - assert s.dtype == np.dtype(f"f{default_float_bitwidth // 8}") - - -def test_series_ordered_dedup(): - # part of 
https://github.com/rapidsai/cudf/issues/11486 - rng = np.random.default_rng(seed=0) - sr = cudf.Series(rng.integers(0, 100, 1000)) - # pandas unique() preserves order - expect = pd.Series(sr.to_pandas().unique()) - got = cudf.Series._from_column(sr._column.unique()) - assert_eq(expect.values, got.values) - - -@pytest.mark.parametrize("dtype", ["int64", "float64"]) -@pytest.mark.parametrize("bool_scalar", [True, False]) -def test_set_bool_error(dtype, bool_scalar): - sr = cudf.Series([1, 2, 3], dtype=dtype) - psr = sr.to_pandas(nullable=True) - - assert_exceptions_equal( - lfunc=sr.__setitem__, - rfunc=psr.__setitem__, - lfunc_args_and_kwargs=([bool_scalar],), - rfunc_args_and_kwargs=([bool_scalar],), - ) - - -def test_int64_equality(): - s = cudf.Series(np.asarray([2**63 - 10, 2**63 - 100], dtype=np.int64)) - assert (s != np.int64(2**63 - 1)).all() - - -@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) -def test_series_to_dict(into): - gs = cudf.Series(["ab", "de", "zx"], index=[10, 20, 100]) - ps = gs.to_pandas() - - actual = gs.to_dict(into=into) - expected = ps.to_dict(into=into) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3], - pytest.param( - [np.nan, 10, 15, 16], - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/49818" - ), - ), - [np.nan, None, 10, 20], - ["ab", "zx", "pq"], - ["ab", "zx", None, "pq"], - [], - ], -) -def test_series_hasnans(data): - gs = cudf.Series(data, nan_as_null=False) - ps = gs.to_pandas(nullable=True) - - # Check type to avoid mixing Python bool and NumPy bool - assert isinstance(gs.hasnans, bool) - assert gs.hasnans == ps.hasnans - - -@pytest.mark.parametrize( - "data,index", - [ - ([1, 2, 3], [10, 11, 12]), - ([1, 2, 3, 1, 1, 2, 3, 2], [10, 20, 23, 24, 25, 26, 27, 28]), - ([1, None, 2, None, 3, None, 3, 1], [5, 6, 7, 8, 9, 10, 11, 12]), - ([np.nan, 1.0, np.nan, 5.4, 5.4, 1.0], ["a", "b", "c", "d", "e", "f"]), - ( - ["lama", "cow", "lama", None, "beetle", "lama", None, None], - [1, 4, 10, 11, 2, 100, 200, 400], - ), - ], -) -@pytest.mark.parametrize("keep", ["first", "last", False]) -@pytest.mark.parametrize("name", [None, "a"]) -def test_series_duplicated(data, index, keep, name): - gs = cudf.Series(data, index=index, name=name) - ps = gs.to_pandas() - - assert_eq(gs.duplicated(keep=keep), ps.duplicated(keep=keep)) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - [10, 20, None, None], - ], -) -@pytest.mark.parametrize("copy", [True, False]) -def test_series_copy(data, copy): - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - - new_psr = pd.Series(psr, copy=copy) - new_gsr = cudf.Series(gsr, copy=copy) - - new_psr.iloc[0] = 999 - new_gsr.iloc[0] = 999 - - assert_eq(psr, gsr) - assert_eq(new_psr, new_gsr) - - -@pytest.mark.parametrize( - "data", - [ - {"a": 1, "b": 2, "c": 24, "d": 1010}, - {"a": 1}, - ], -) -@pytest.mark.parametrize( - "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] -) -def test_series_init_dict_with_index(data, index): - pandas_series = pd.Series(data, index=index) - cudf_series = cudf.Series(data, index=index) - - assert_eq(pandas_series, cudf_series) - - -@pytest.mark.parametrize("data", ["abc", None, 1, 3.7]) -@pytest.mark.parametrize( - "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] -) -def test_series_init_scalar_with_index(data, index): - pandas_series = pd.Series(data, index=index) - cudf_series = cudf.Series(data, index=index) - - assert_eq( - pandas_series, - cudf_series, - check_index_type=data is 
not None or index is not None, - check_dtype=data is not None, - ) - - -def test_series_init_error(): - assert_exceptions_equal( - lfunc=pd.Series, - rfunc=cudf.Series, - lfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), - rfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), - ) - - -def test_series_init_from_series_and_index(): - ser = cudf.Series([4, 7, -5, 3], index=["d", "b", "a", "c"]) - result = cudf.Series(ser, index=list("abcd")) - expected = cudf.Series([-5, 7, 3, 4], index=list("abcd")) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "dtype", ["datetime64[ns]", "timedelta64[ns]", "object", "str"] -) -def test_series_mixed_dtype_error(dtype): - ps = pd.concat([pd.Series([1, 2, 3], dtype=dtype), pd.Series([10, 11])]) - with pytest.raises(TypeError): - cudf.Series(ps) - with pytest.raises(TypeError): - cudf.Series(ps.array) - - -@pytest.mark.parametrize("data", [[True, False, None], [10, 200, 300]]) -@pytest.mark.parametrize("index", [None, [10, 20, 30]]) -def test_series_contains(data, index): - ps = pd.Series(data, index=index) - gs = cudf.Series(data, index=index) - - assert_eq(1 in ps, 1 in gs) - assert_eq(10 in ps, 10 in gs) - assert_eq(True in ps, True in gs) - assert_eq(False in ps, False in gs) - - -def test_series_from_pandas_sparse(): - pser = pd.Series(range(2), dtype=pd.SparseDtype(np.int64, 0)) - with pytest.raises(NotImplementedError): - cudf.Series(pser) - - -def test_series_constructor_unbounded_sequence(): - class A: - def __getitem__(self, key): - return 1 - - with pytest.raises(TypeError): - cudf.Series(A()) - - -def test_series_constructor_error_mixed_type(): - with pytest.raises(MixedTypeError): - cudf.Series(["abc", np.nan, "123"], nan_as_null=False) - - -def test_series_typecast_to_object_error(): - actual = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(TypeError): - actual.astype(object) - with pytest.raises(TypeError): - actual.astype(np.dtype("object")) - new_series = actual.astype("str") - assert new_series[0] == "1970-01-01 00:00:00.000000001" - - -def test_series_typecast_to_object(): - actual = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - with cudf.option_context("mode.pandas_compatible", False): - new_series = actual.astype(object) - assert new_series[0] == "1970-01-01 00:00:00.000000001" - new_series = actual.astype(np.dtype("object")) - assert new_series[0] == "1970-01-01 00:00:00.000000001" - - -@pytest.mark.parametrize("attr", ["nlargest", "nsmallest"]) -def test_series_nlargest_nsmallest_str_error(attr): - gs = cudf.Series(["a", "b", "c", "d", "e"]) - ps = gs.to_pandas() - - assert_exceptions_equal( - getattr(gs, attr), getattr(ps, attr), ([], {"n": 1}), ([], {"n": 1}) - ) - - -def test_series_unique_pandas_compatibility(): - gs = cudf.Series([10, 11, 12, 11, 10]) - ps = gs.to_pandas() - with cudf.option_context("mode.pandas_compatible", True): - actual = gs.unique() - expected = ps.unique() - assert_eq(actual, expected) - - -@pytest.mark.parametrize("initial_name", SERIES_OR_INDEX_NAMES) -@pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES) -def test_series_rename(initial_name, name): - gsr = cudf.Series([1, 2, 3], name=initial_name) - psr = pd.Series([1, 2, 3], name=initial_name) - - assert_eq(gsr, psr) - - actual = gsr.rename(name) - expected = psr.rename(name) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize("index", [lambda x: x * 2, {1: 2}]) -def test_rename_index_not_supported(index): - ser = 
cudf.Series(range(2)) - with pytest.raises(NotImplementedError): - ser.rename(index=index) - - -@pytest.mark.parametrize( - "data", - [ - [1.2234242333234, 323432.3243423, np.nan], - pd.Series([34224, 324324, 324342], dtype="datetime64[ns]"), - pd.Series([224.242, None, 2424.234324], dtype="category"), - [ - decimal.Decimal("342.3243234234242"), - decimal.Decimal("89.32432497687622"), - None, - ], - ], -) -@pytest.mark.parametrize("digits", [0, 1, 3, 4, 10]) -def test_series_round_builtin(data, digits): - ps = pd.Series(data) - gs = cudf.from_pandas(ps, nan_as_null=False) - - # TODO: Remove `to_frame` workaround - # after following issue is fixed: - # https://github.com/pandas-dev/pandas/issues/55114 - expected = round(ps.to_frame(), digits)[0] - expected.name = None - actual = round(gs, digits) - - assert_eq(expected, actual) - - -def test_series_empty_dtype(): - expected = pd.Series([]) - actual = cudf.Series([]) - assert_eq(expected, actual, check_dtype=True) - - -@pytest.mark.parametrize("data", [None, {}, []]) -def test_series_empty_index_rangeindex(data): - expected = cudf.RangeIndex(0) - result = cudf.Series(data).index - assert_eq(result, expected) - - -def test_series_count_invalid_param(): - s = cudf.Series([], dtype="float64") - with pytest.raises(TypeError): - s.count(skipna=True) - - -@pytest.mark.parametrize( - "data", [[0, 1, 2], ["a", "b", "c"], [0.324, 32.32, 3243.23]] -) -def test_series_setitem_nat_with_non_datetimes(data): - s = cudf.Series(data) - with pytest.raises(TypeError): - s[0] = cudf.NaT - - -def test_series_string_setitem(): - gs = cudf.Series(["abc", "def", "ghi", "xyz", "pqr"]) - ps = gs.to_pandas() - - gs[0] = "NaT" - gs[1] = "NA" - gs[2] = "" - gs[3] = "NaN" - - ps[0] = "NaT" - ps[1] = "NA" - ps[2] = "" - ps[3] = "NaN" - - assert_eq(gs, ps) - - -def test_multi_dim_series_error(): - arr = cp.array([(1, 2), (3, 4)]) - with pytest.raises(ValueError): - cudf.Series(arr) - - -def test_bool_series_mixed_dtype_error(): - ps = pd.Series([True, False, None]) - all_bool_ps = pd.Series([True, False, True], dtype="object") - # ps now has `object` dtype, which - # isn't supported by `cudf`. 
- with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(TypeError): - cudf.Series(ps) - with pytest.raises(TypeError): - cudf.from_pandas(ps) - with pytest.raises(TypeError): - cudf.Series(ps, dtype=bool) - expected = cudf.Series(all_bool_ps, dtype=bool) - assert_eq(expected, all_bool_ps.astype(bool)) - nan_bools_mix = pd.Series([True, False, True, np.nan], dtype="object") - gs = cudf.Series(nan_bools_mix, nan_as_null=True) - assert_eq(gs.to_pandas(nullable=True), nan_bools_mix.astype("boolean")) - with pytest.raises(TypeError): - cudf.Series(nan_bools_mix, nan_as_null=False) - - -@pytest.mark.parametrize( - "pandas_type", - [ - pd.ArrowDtype(pa.int8()), - pd.ArrowDtype(pa.int16()), - pd.ArrowDtype(pa.int32()), - pd.ArrowDtype(pa.int64()), - pd.ArrowDtype(pa.uint8()), - pd.ArrowDtype(pa.uint16()), - pd.ArrowDtype(pa.uint32()), - pd.ArrowDtype(pa.uint64()), - pd.ArrowDtype(pa.float32()), - pd.ArrowDtype(pa.float64()), - pd.Int8Dtype(), - pd.Int16Dtype(), - pd.Int32Dtype(), - pd.Int64Dtype(), - pd.UInt8Dtype(), - pd.UInt16Dtype(), - pd.UInt32Dtype(), - pd.UInt64Dtype(), - pd.Float32Dtype(), - pd.Float64Dtype(), - ], -) -def test_series_arrow_numeric_types_roundtrip(pandas_type): - ps = pd.Series([1, 2, 3], dtype=pandas_type) - pi = pd.Index(ps) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - gs = cudf.from_pandas(ps) - assert_eq(ps, gs) - - with cudf.option_context("mode.pandas_compatible", True): - gi = cudf.from_pandas(pi) - assert_eq(pi, gi) - - with cudf.option_context("mode.pandas_compatible", True): - gdf = cudf.from_pandas(pdf) - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "pandas_type", [pd.ArrowDtype(pa.bool_()), pd.BooleanDtype()] -) -def test_series_arrow_bool_types_roundtrip(pandas_type): - ps = pd.Series([True, False, None], dtype=pandas_type) - pi = pd.Index(ps) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - gs = cudf.from_pandas(ps) - assert_eq(ps, gs) - - with cudf.option_context("mode.pandas_compatible", True): - gi = cudf.from_pandas(pi) - assert_eq(pi, gi) - - with cudf.option_context("mode.pandas_compatible", True): - gdf = cudf.from_pandas(pdf) - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "pandas_type", [pd.ArrowDtype(pa.string()), pd.StringDtype()] -) -def test_series_arrow_string_types_roundtrip(pandas_type): - ps = pd.Series(["abc", None, "xyz"], dtype=pandas_type) - pi = pd.Index(ps) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - gs = cudf.from_pandas(ps) - assert_eq(ps, gs) - - with cudf.option_context("mode.pandas_compatible", True): - gi = cudf.from_pandas(pi) - assert_eq(pi, gi) - - with cudf.option_context("mode.pandas_compatible", True): - gdf = cudf.from_pandas(pdf) - assert_eq(pdf, gdf) - - -def test_series_arrow_category_types_roundtrip(): - pa_array = pa.array(pd.Series([1, 2, 3], dtype="category")) - ps = pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa_array.type)) - pi = pd.Index(ps) - pdf = pi.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(ps) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pi) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "pa_type", - [pa.decimal128(10, 2), pa.decimal128(5, 2), pa.decimal128(20, 2)], -) -def 
test_series_arrow_decimal_types_roundtrip(pa_type): - ps = pd.Series( - [ - decimal.Decimal("1.2"), - decimal.Decimal("20.56"), - decimal.Decimal("3"), - ], - dtype=pd.ArrowDtype(pa_type), - ) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - gs = cudf.from_pandas(ps) - assert_eq(ps, gs) - - with cudf.option_context("mode.pandas_compatible", True): - gdf = cudf.from_pandas(pdf) - assert_eq(pdf, gdf) - - -def test_series_arrow_struct_types_roundtrip(): - ps = pd.Series( - [{"a": 1}, {"b": "abc"}], - dtype=pd.ArrowDtype(pa.struct({"a": pa.int64(), "b": pa.string()})), - ) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - gs = cudf.from_pandas(ps) - assert_eq(ps, gs) - - with cudf.option_context("mode.pandas_compatible", True): - gdf = cudf.from_pandas(pdf) - assert_eq(pdf, gdf) - - -def test_series_arrow_list_types_roundtrip(): - ps = pd.Series([[1], [2], [4]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - with cudf.option_context("mode.pandas_compatible", True): - gs = cudf.from_pandas(ps) - assert_eq(ps, gs) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - gdf = cudf.from_pandas(pdf) - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("base_name", [None, "a"]) -def test_series_to_frame_none_name(base_name): - result = cudf.Series(range(1), name=base_name).to_frame(name=None) - expected = pd.Series(range(1), name=base_name).to_frame(name=None) - assert_eq(result, expected) - - -@pytest.mark.parametrize("klass", [cudf.Index, cudf.Series]) -@pytest.mark.parametrize( - "data", [pa.array([float("nan")]), pa.chunked_array([[float("nan")]])] -) -def test_nan_as_null_from_arrow_objects(klass, data): - result = klass(data, nan_as_null=True) - expected = klass(pa.array([None], type=pa.float64())) - assert_eq(result, expected) - - -@pytest.mark.parametrize("reso", ["M", "ps"]) -@pytest.mark.parametrize("typ", ["M", "m"]) -def test_series_invalid_reso_dtype(reso, typ): - with pytest.raises(TypeError): - cudf.Series([], dtype=f"{typ}8[{reso}]") - - -def test_series_categorical_missing_value_count(): - ps = pd.Series(pd.Categorical(list("abcccb"), categories=list("cabd"))) - gs = cudf.from_pandas(ps) - - expected = ps.value_counts() - actual = gs.value_counts() - - assert_eq(expected, actual, check_dtype=False) - - -def test_series_error_nan_mixed_types(): - ps = pd.Series([np.nan, "ab", "cd"]) - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(MixedTypeError): - cudf.from_pandas(ps) - - -def test_series_error_nan_non_float_dtypes(): - s = cudf.Series(["a", "b", "c"]) - with pytest.raises(TypeError): - s[0] = np.nan - - s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - with pytest.raises(TypeError): - s[0] = np.nan - - -@pytest.mark.parametrize( - "dtype", - [ - pd.ArrowDtype(pa.int8()), - pd.ArrowDtype(pa.int16()), - pd.ArrowDtype(pa.int32()), - pd.ArrowDtype(pa.int64()), - pd.ArrowDtype(pa.uint8()), - pd.ArrowDtype(pa.uint16()), - pd.ArrowDtype(pa.uint32()), - pd.ArrowDtype(pa.uint64()), - pd.ArrowDtype(pa.float32()), - pd.ArrowDtype(pa.float64()), - pd.Int8Dtype(), - pd.Int16Dtype(), - pd.Int32Dtype(), - pd.Int64Dtype(), - pd.UInt8Dtype(), - pd.UInt16Dtype(), - pd.UInt32Dtype(), - pd.UInt64Dtype(), - pd.Float32Dtype(), - pd.Float64Dtype(), - ], -) -@pytest.mark.parametrize("klass", [cudf.Series, cudf.DataFrame, cudf.Index]) -@pytest.mark.parametrize("kind", [lambda x: x, str], ids=["obj", "string"]) -def test_astype_pandas_nullable_pandas_compat(dtype, klass, 
kind): - ser = klass([1, 2, 3]) - with cudf.option_context("mode.pandas_compatible", True): - actual = ser.astype(kind(dtype)) - expected = klass([1, 2, 3], dtype=kind(dtype)) - assert_eq(actual, expected) - - -@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) -@pytest.mark.parametrize( - "data", - [ - pa.array([1, None], type=pa.int64()), - pa.chunked_array([[1, None]], type=pa.int64()), - ], -) -def test_from_arrow_array_dtype(klass, data): - obj = klass(data, dtype="int8") - assert obj.dtype == np.dtype("int8") - - -@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) -def test_from_pandas_object_dtype_passed_dtype(klass): - result = klass(pd.Series([True, False], dtype=object), dtype="int8") - expected = klass(pa.array([1, 0], type=pa.int8())) - assert_eq(result, expected) - - -def test_series_where_mixed_bool_dtype(): - s = cudf.Series([True, False, True]) - with pytest.raises(TypeError): - s.where(~s, 10) - - -def test_series_setitem_mixed_bool_dtype(): - s = cudf.Series([True, False, True]) - with pytest.raises(TypeError): - s[0] = 10 - - -@pytest.mark.parametrize( - "nat, value", - [ - [np.datetime64("nat", "ns"), np.datetime64("2020-01-01", "ns")], - [np.timedelta64("nat", "ns"), np.timedelta64(1, "ns")], - ], -) -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_series_np_array_nat_nan_as_nulls(nat, value, nan_as_null): - expected = np.array([nat, value]) - ser = cudf.Series(expected, nan_as_null=nan_as_null) - assert ser[0] is pd.NaT - assert ser[1] == value - - -def test_series_unitness_np_datetimelike_units(): - data = np.array([np.timedelta64(1)]) - with pytest.raises(TypeError): - cudf.Series(data) - with pytest.raises(TypeError): - pd.Series(data) - - -def test_series_duplicate_index_reindex(): - gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1]) - ps = gs.to_pandas() - - assert_exceptions_equal( - gs.reindex, - ps.reindex, - lfunc_args_and_kwargs=([10, 11, 12, 13], {}), - rfunc_args_and_kwargs=([10, 11, 12, 13], {}), - ) - - -def test_list_category_like_maintains_dtype(): - dtype = cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) - data = [1, 2, 3] - result = cudf.Series._from_column(as_column(data, dtype=dtype)) - expected = pd.Series(data, dtype=dtype.to_pandas()) - assert_eq(result, expected) - - -def test_list_interval_like_maintains_dtype(): - dtype = cudf.IntervalDtype(subtype=np.int8) - data = [pd.Interval(1, 2)] - result = cudf.Series._from_column(as_column(data, dtype=dtype)) - expected = pd.Series(data, dtype=dtype.to_pandas()) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "klass", [cudf.Series, cudf.Index, pd.Series, pd.Index] -) -def test_series_from_named_object_name_priority(klass): - result = cudf.Series(klass([1], name="a"), name="b") - assert result.name == "b" - - -@pytest.mark.parametrize( - "data", - [ - {"a": 1, "b": 2, "c": 3}, - cudf.Series([1, 2, 3], index=list("abc")), - pd.Series([1, 2, 3], index=list("abc")), - ], -) -def test_series_from_object_with_index_index_arg_reindex(data): - result = cudf.Series(data, index=list("bca")) - expected = cudf.Series([2, 3, 1], index=list("bca")) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "data", - [ - {0: 1, 1: 2, 2: 3}, - cudf.Series([1, 2, 3]), - cudf.Index([1, 2, 3]), - pd.Series([1, 2, 3]), - pd.Index([1, 2, 3]), - [1, 2, 3], - ], -) -def test_series_dtype_astypes(data): - result = cudf.Series(data, dtype="float64") - expected = cudf.Series([1.0, 2.0, 3.0]) - assert_eq(result, expected) - - 
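# The removed test_series_dtype_astypes above leans on a general contract:
# passing dtype= to the Series constructor behaves like constructing first
# and casting afterwards. A minimal sketch of that equivalence, assuming
# only the public cudf API (this block is illustrative, not part of the
# patch itself):

import cudf
from cudf.testing import assert_eq

data = [1, 2, 3]
via_ctor = cudf.Series(data, dtype="float64")     # coerce at construction
via_astype = cudf.Series(data).astype("float64")  # coerce after the fact
assert_eq(via_ctor, via_astype)                   # identical float64 series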
-@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string]) -def test_series_from_large_string(pa_type): - pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type()) - got = cudf.Series(pa_string_array) - expected = pd.Series(pa_string_array) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - [1], - decimal.Decimal("1.0"), - ], -) -def test_series_to_pandas_arrow_type_nullable_raises(scalar): - pa_array = pa.array([scalar, None]) - ser = cudf.Series(pa_array) - with pytest.raises(ValueError, match=".* cannot both be set"): - ser.to_pandas(nullable=True, arrow_type=True) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - [1], - decimal.Decimal("1.0"), - ], -) -def test_series_to_pandas_arrow_type(scalar): - pa_array = pa.array([scalar, None]) - ser = cudf.Series(pa_array) - result = ser.to_pandas(arrow_type=True) - expected = pd.Series(pd.arrays.ArrowExtensionArray(pa_array)) - pd.testing.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("axis", [None, 0, "index"]) -@pytest.mark.parametrize("data", [[1, 2], [1]]) -def test_squeeze(axis, data): - ser = cudf.Series(data) - result = ser.squeeze(axis=axis) - expected = ser.to_pandas().squeeze(axis=axis) - assert_eq(result, expected) - - -@pytest.mark.parametrize("axis", [1, "columns"]) -def test_squeeze_invalid_axis(axis): - with pytest.raises(ValueError): - cudf.Series([1]).squeeze(axis=axis) - - -def test_series_init_with_nans(): - with cudf.option_context("mode.pandas_compatible", True): - gs = cudf.Series([1, 2, 3, np.nan]) - assert gs.dtype == np.dtype("float64") - ps = pd.Series([1, 2, 3, np.nan]) - assert_eq(ps, gs) - - -@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0]) -def test_timestamp_series_init(data): - scalar = pd.Timestamp(data) - expected = pd.Series([scalar]) - actual = cudf.Series([scalar]) - - assert_eq(expected, actual) - - expected = pd.Series(scalar) - actual = cudf.Series(scalar) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0]) -def test_timedelta_series_init(data): - scalar = pd.Timedelta(data) - expected = pd.Series([scalar]) - actual = cudf.Series([scalar]) - - assert_eq(expected, actual) - - expected = pd.Series(scalar) - actual = cudf.Series(scalar) - - assert_eq(expected, actual) - - -def test_series_from_series_index_no_shallow_copy(): - ser1 = cudf.Series(range(3), index=list("abc")) - ser2 = cudf.Series(ser1) - assert ser1.index is ser2.index - - -@pytest.mark.parametrize("value", [1, 1.1]) -def test_nans_to_nulls_noop_copies_column(value): - ser1 = cudf.Series([value]) - ser2 = ser1.nans_to_nulls() - assert ser1._column is not ser2._column - - -@pytest.mark.parametrize("dropna", [False, True]) -def test_nunique_all_null(dropna): - data = [None, None] - pd_ser = pd.Series(data) - cudf_ser = cudf.Series(data) - result = pd_ser.nunique(dropna=dropna) - expected = cudf_ser.nunique(dropna=dropna) - assert result == expected - - -@pytest.mark.parametrize( - "type1", - [ - "category", - "interval[int64, right]", - "int64", - "float64", - "str", - "datetime64[ns]", - "timedelta64[ns]", - ], -) -@pytest.mark.parametrize( - "type2", - [ - "category", - "interval[int64, right]", - "int64", - "float64", - "str", - "datetime64[ns]", - "timedelta64[ns]", - ], -) -@pytest.mark.parametrize( - "as_dtype", [lambda x: x, cudf.dtype], 
ids=["string", "object"] -) -@pytest.mark.parametrize("copy", [True, False]) -def test_empty_astype_always_castable(type1, type2, as_dtype, copy): - ser = cudf.Series([], dtype=as_dtype(type1)) - result = ser.astype(as_dtype(type2), copy=copy) - expected = cudf.Series([], dtype=as_dtype(type2)) - assert_eq(result, expected) - if not copy and cudf.dtype(type1) == cudf.dtype(type2): - assert ser._column is result._column - else: - assert ser._column is not result._column - - -def test_dtype_dtypes_equal(): - ser = cudf.Series([0]) - assert ser.dtype is ser.dtypes - assert ser.dtypes is ser.to_pandas().dtypes - - -def test_null_like_to_nan_pandas_compat(): - with cudf.option_context("mode.pandas_compatible", True): - ser = cudf.Series([1, 2, np.nan, 10, None]) - pser = pd.Series([1, 2, np.nan, 10, None]) - - assert pser.dtype == ser.dtype - assert_eq(ser, pser) - - -def test_roundtrip_series_plc_column(ps): - expect = cudf.Series(ps) - actual = cudf.Series.from_pylibcudf(*expect.to_pylibcudf()) - assert_eq(expect, actual) - - -def test_non_strings_dtype_object_pandas_compat_raises(): - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(TypeError): - cudf.Series([1], dtype=object) - - -def test_series_dataframe_count_float(): - gs = cudf.Series([1, 2, 3, None, np.nan, 10], nan_as_null=False) - ps = cudf.Series([1, 2, 3, None, np.nan, 10]) - - with cudf.option_context("mode.pandas_compatible", True): - assert_eq(ps.count(), gs.count()) - assert_eq(ps.to_frame().count(), gs.to_frame().count()) - with cudf.option_context("mode.pandas_compatible", False): - assert_eq(gs.count(), gs.to_pandas(nullable=True).count()) - assert_eq( - gs.to_frame().count(), - gs.to_frame().to_pandas(nullable=True).count(), - ) - - -@pytest.mark.parametrize("arr", [np.array, cp.array, pd.Series]) -def test_construct_nonnative_array(arr): - data = [1, 2, 3.5, 4] - dtype = np.dtype("f4") - native = arr(data, dtype=dtype) - nonnative = arr(data, dtype=dtype.newbyteorder()) - result = cudf.Series(nonnative) - expected = cudf.Series(native) - assert_eq(result, expected) - - -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_construct_all_pd_NA_with_dtype(nan_as_null): - result = cudf.Series( - [pd.NA, pd.NA], dtype=np.dtype(np.float64), nan_as_null=nan_as_null - ) - expected = cudf.Series(pa.array([None, None], type=pa.float64())) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "dtype", - [ - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float32", - "float64", - "bool", - ], -) -@pytest.mark.parametrize("has_nulls", [False, True]) -@pytest.mark.parametrize("use_na_value", [False, True]) -def test_series_to_cupy(dtype, has_nulls, use_na_value): - size = 10 - if dtype == "bool": - np_data = np.array([True, False] * (size // 2), dtype=bool) - else: - np_data = np.arange(size, dtype=dtype) - - if has_nulls: - np_data = np_data.astype("object") - np_data[::2] = None - - sr = cudf.Series(np_data, dtype=dtype) - - if not has_nulls: - assert_eq(sr.values, cp.asarray(sr)) - return - - if has_nulls and not use_na_value: - with pytest.raises(ValueError, match="Column must have no nulls"): - sr.to_cupy() - return - - na_value = { - "bool": False, - "float32": 0.0, - "float64": 0.0, - }.get(dtype, 0) - expected = cp.asarray(sr.fillna(na_value)) if has_nulls else cp.asarray(sr) - assert_eq(sr.to_cupy(na_value=na_value), expected) - - -def test_to_dense_array(): - rng = np.random.default_rng(seed=0) - data = rng.random(8) - mask = 
np.asarray([0b11010110]).astype(np.byte) - sr = cudf.Series._from_column( - as_column(data, dtype=np.float64).set_mask(mask) - ) - assert sr.has_nulls - assert sr.null_count != len(sr) - filled = sr.to_numpy(na_value=np.nan) - dense = sr.dropna().to_numpy() - assert dense.size < filled.size - assert filled.size == len(sr) From 47c5732cdba8fe3cbae3d44648618001b89f0fc2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 30 Jul 2025 17:16:50 -0700 Subject: [PATCH 031/366] Get rid of CG logic in the mixed semi-join kernel (#19536) This PR simplifies the mixed semi-join kernel by removing CG-related logic, as the underlying code path is only ever used with a CG size of 1. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/19536 --- cpp/src/join/mixed_join_kernels_semi.cu | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 4c063b6202e..3cf081e5ded 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -23,13 +23,9 @@ #include #include -#include - namespace cudf { namespace detail { -namespace cg = cooperative_groups; - template CUDF_KERNEL void __launch_bounds__(block_size) mixed_join_semi(table_device_view left_table, @@ -41,10 +37,6 @@ CUDF_KERNEL void __launch_bounds__(block_size) cudf::device_span left_table_keep_mask, cudf::ast::detail::expression_device_view device_expression_data) { - auto constexpr cg_size = hash_set_ref_type::cg_size; - - auto const tile = cg::tiled_partition(cg::this_thread_block()); - // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is // used to circumvent conflicts between arrays of different types between @@ -53,7 +45,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = - intermediate_storage + (tile.meta_group_rank() * device_expression_data.num_intermediates); + intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates); // Equality evaluator to use auto const evaluator = cudf::ast::detail::expression_evaluator( @@ -69,15 +61,13 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Total number of rows to query the set auto const outer_num_rows = left_table.num_rows(); - // Grid stride for the tile - auto const cg_grid_stride = cudf::detail::grid_1d::grid_stride() / cg_size; + auto const grid_stride = cudf::detail::grid_1d::grid_stride(); // Find all the rows in the left table that are in the hash table - for (auto outer_row_index = cudf::detail::grid_1d::global_thread_id() / cg_size; + for (auto outer_row_index = cudf::detail::grid_1d::global_thread_id(); outer_row_index < outer_num_rows; - outer_row_index += cg_grid_stride) { - auto const result = set_ref_equality.contains(tile, outer_row_index); - if (tile.thread_rank() == 0) { left_table_keep_mask[outer_row_index] = result; } + outer_row_index += grid_stride) { + left_table_keep_mask[outer_row_index] = set_ref_equality.contains(outer_row_index); } } From 05b83e5dade9c0515e7c1c1821edcc6959e24618 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 30 Jul 2025 18:37:36 -0700 Subject: [PATCH 032/366] Move timeout in 
cudf.pandas pandas unit tests script to ci script (#19542) When running `run-pandas-tests.sh` to debug pandas unit tests, it seems like setting `breakpoint()`s does not drop into pdb (seems to hang) due to the `timeout` command. Additionally, I think we're more concerned about timeouts when running in CI than locally, so moving this `timeout` command to the CI script instead. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/19542 --- ci/cudf_pandas_scripts/pandas-tests/run.sh | 2 +- python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 74d1fc4bdaf..25ce70da01f 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -33,7 +33,7 @@ RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" -bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ +timeout 90m bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ --numprocesses 5 \ --tb=line \ -vv \ diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index ccf33ea6ee8..7ceff137df7 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -155,7 +155,7 @@ PYTEST_IGNORES=("--ignore=tests/io/parser/common/test_read_errors.py" ) -PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \ +PANDAS_CI="1" python -m pytest -p cudf.pandas \ --import-mode=importlib \ -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and $TEST_THAT_NEED_REASON_TO_SKIP and $TEST_THAT_USE_STRING_DTYPE_GROUPBY and $TEST_THAT_USE_WEAKREFS" \ "${PYTEST_IGNORES[@]}" \ From 39a09ca7b9c4da3118e81aea0fe213dda195ab1b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 30 Jul 2025 18:38:46 -0700 Subject: [PATCH 033/366] Move test_avro/test_api_types.py and some DataFrame tests to new cudf classic test directory structure (#19490) Towards https://github.com/rapidsai/cudf/issues/9999 Also starts a `test/dataframe` structure similar to https://github.com/rapidsai/cudf/pull/19485 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19490 --- python/cudf/cudf/tests/conftest.py | 28 + .../cudf/tests/dataframe/methods/__init__.py | 0 .../{ => dataframe/methods}/test_applymap.py | 0 .../test_convert_dtypes.py} | 0 .../methods/test_nlargest_nsmallest.py | 57 ++ .../dataframe/methods/test_scatter_by_map.py | 94 +++ .../dataframe/methods/test_sort_values.py | 194 ++++++ .../tests/dataframe/methods/test_to_arrow.py | 15 + .../cudf/tests/dataframe/test_attributes.py | 1 - .../tests/dataframe/test_binary_operations.py | 1 - .../cudf/tests/dataframe/test_combining.py | 1 - .../cudf/tests/dataframe/test_computation.py | 1 - .../cudf/tests/dataframe/test_constructing.py | 1 - .../dataframe/test_function_application.py | 1 - .../cudf/tests/dataframe/test_indexing.py | 1 - .../tests/dataframe/test_io_serialization.py | 50 -- .../cudf/cudf/tests/dataframe/test_missing.py | 1 - .../cudf/tests/dataframe/test_reindexing.py | 1 - 
.../cudf/tests/dataframe/test_reshaping.py | 1 - .../cudf/tests/dataframe/test_selecting.py | 1 - .../cudf/cudf/tests/dataframe/test_sorting.py | 1 - .../cudf/tests/dataframe/test_timeseries.py | 1 - .../{ => general_functions}/test_api_types.py | 0 .../general_functions/test_conversion.py | 1 - .../test_data_manipulation.py | 1 - .../general_functions/test_datetimelike.py | 1 - .../cudf/cudf/tests/input_output/test_avro.py | 658 +++++++++++++++++- .../cudf/tests/input_output/test_parquet.py | 35 + .../test_avro_reader_fastavro_integration.py | 658 ------------------ python/cudf/cudf/tests/test_sorting.py | 325 +-------- 30 files changed, 1081 insertions(+), 1049 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/__init__.py rename python/cudf/cudf/tests/{ => dataframe/methods}/test_applymap.py (100%) rename python/cudf/cudf/tests/dataframe/{test_conversion.py => methods/test_convert_dtypes.py} (100%) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_nlargest_nsmallest.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_scatter_by_map.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_sort_values.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_attributes.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_binary_operations.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_combining.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_computation.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_constructing.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_function_application.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_indexing.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_io_serialization.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_missing.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_reindexing.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_reshaping.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_selecting.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_sorting.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_timeseries.py rename python/cudf/cudf/tests/{ => general_functions}/test_api_types.py (100%) delete mode 100644 python/cudf/cudf/tests/general_functions/test_conversion.py delete mode 100644 python/cudf/cudf/tests/general_functions/test_data_manipulation.py delete mode 100644 python/cudf/cudf/tests/general_functions/test_datetimelike.py delete mode 100644 python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 0a57efe2fbd..9d43aeff1fd 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -234,6 +234,14 @@ def integer_types_as_str(request): return request.param +@pytest.fixture(params=float_types) +def float_types_as_str(request): + """ + - "float32", "float64" + """ + return request.param + + @pytest.fixture( params=signed_integer_types + unsigned_integer_types + float_types ) @@ -287,6 +295,26 @@ def temporal_types_as_str(request): return request.param +@pytest.fixture( + params=signed_integer_types + + unsigned_integer_types + + float_types + + bool_types + + datetime_types + + timedelta_types +) +def numeric_and_temporal_types_as_str(request): + """ + - "int8", "int16", "int32", "int64" + - "uint8", "uint16", "uint32", "uint64" + - 
"float32", "float64" + - "bool" + - "datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]" + - "timedelta64[ns]", "timedelta64[us]", "timedelta64[ms]", "timedelta64[s]" + """ + return request.param + + @pytest.fixture( params=signed_integer_types + unsigned_integer_types diff --git a/python/cudf/cudf/tests/dataframe/methods/__init__.py b/python/cudf/cudf/tests/dataframe/methods/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/dataframe/methods/test_applymap.py similarity index 100% rename from python/cudf/cudf/tests/test_applymap.py rename to python/cudf/cudf/tests/dataframe/methods/test_applymap.py diff --git a/python/cudf/cudf/tests/dataframe/test_conversion.py b/python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py similarity index 100% rename from python/cudf/cudf/tests/dataframe/test_conversion.py rename to python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py diff --git a/python/cudf/cudf/tests/dataframe/methods/test_nlargest_nsmallest.py b/python/cudf/cudf/tests/dataframe/methods/test_nlargest_nsmallest.py new file mode 100644 index 00000000000..6c148cef4a6 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_nlargest_nsmallest.py @@ -0,0 +1,57 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pytest + +from cudf import DataFrame +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("n", [10, 5]) +@pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) +@pytest.mark.parametrize("columns", ["a", ["b", "a"]]) +def test_dataframe_nlargest_nsmallest(n, op, columns): + nelem = 10 + rng = np.random.default_rng(seed=0) + aa = rng.random(nelem) + bb = rng.random(nelem) + + df = DataFrame({"a": aa, "b": bb}) + pdf = df.to_pandas() + assert_eq(getattr(df, op)(n, columns), getattr(pdf, op)(n, columns)) + + +@pytest.mark.parametrize( + "sliceobj", [slice(1, None), slice(None, -1), slice(1, -1)] +) +def test_dataframe_nlargest_sliced(sliceobj): + nelem = 20 + n = 10 + rng = np.random.default_rng(seed=0) + df = pd.DataFrame() + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) + + expect = df[sliceobj].nlargest(n, "a") + gdf = DataFrame.from_pandas(df) + got = gdf[sliceobj].nlargest(n, "a") + assert (got.to_pandas() == expect).all().all() + + +@pytest.mark.parametrize( + "sliceobj", [slice(1, None), slice(None, -1), slice(1, -1)] +) +def test_dataframe_nsmallest_sliced(sliceobj): + nelem = 20 + n = 10 + rng = np.random.default_rng(seed=0) + df = pd.DataFrame() + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) + + expect = df[sliceobj].nsmallest(n, "a") + gdf = DataFrame.from_pandas(df) + got = gdf[sliceobj].nsmallest(n, "a") + assert (got.to_pandas() == expect).all().all() diff --git a/python/cudf/cudf/tests/dataframe/methods/test_scatter_by_map.py b/python/cudf/cudf/tests/dataframe/methods/test_scatter_by_map.py new file mode 100644 index 00000000000..b6897b3c052 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_scatter_by_map.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
+ + +import numpy as np +import pytest + +from cudf import DataFrame +from cudf.core.column import NumericalColumn + + +@pytest.mark.parametrize("map_size", [1, 8]) +@pytest.mark.parametrize("nelem", [1, 10]) +@pytest.mark.parametrize("keep", [True, False]) +def test_dataframe_scatter_by_map(map_size, nelem, keep): + strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"] + rng = np.random.default_rng(seed=0) + df = DataFrame( + { + "a": rng.choice(strlist[:map_size], nelem), + "b": rng.uniform(low=0, high=map_size, size=nelem), + "c": rng.integers(map_size, size=nelem), + } + ) + df["d"] = df["a"].astype("category") + + def _check_scatter_by_map(dfs, col): + assert len(dfs) == map_size + nrows = 0 + name = col.name + for i, df in enumerate(dfs): + nrows += len(df) + if len(df) > 0: + # Make sure the column types were preserved + assert isinstance(df[name]._column, type(col._column)) + try: + sr = df[name].astype(np.int32) + except ValueError: + sr = df[name] + assert sr.nunique() <= 1 + if sr.nunique() == 1: + if isinstance(df[name]._column, NumericalColumn): + assert sr.iloc[0] == i + assert nrows == nelem + + with pytest.warns(UserWarning): + _check_scatter_by_map( + df.scatter_by_map("a", map_size, keep_index=keep), df["a"] + ) + _check_scatter_by_map( + df.scatter_by_map("b", map_size, keep_index=keep), df["b"] + ) + _check_scatter_by_map( + df.scatter_by_map("c", map_size, keep_index=keep), df["c"] + ) + with pytest.warns(UserWarning): + _check_scatter_by_map( + df.scatter_by_map("d", map_size, keep_index=keep), df["d"] + ) + + if map_size == 2 and nelem == 100: + with pytest.warns(UserWarning): + df.scatter_by_map("a") # Auto-detect map_size + with pytest.raises(ValueError): + with pytest.warns(UserWarning): + df.scatter_by_map("a", map_size=1, debug=True) # Bad map_size + + # Test Index + df2 = df.set_index("c") + generic_result = df2.scatter_by_map("b", map_size, keep_index=keep) + _check_scatter_by_map(generic_result, df2["b"]) + if keep: + for frame in generic_result: + assert isinstance(frame.index, type(df2.index)) + + # Test MultiIndex + df2 = df.set_index(["a", "c"]) + multiindex_result = df2.scatter_by_map("b", map_size, keep_index=keep) + _check_scatter_by_map(multiindex_result, df2["b"]) + if keep: + for frame in multiindex_result: + assert isinstance(frame.index, type(df2.index)) + + +@pytest.mark.parametrize("ids", [[-1, 0, 1, 0], [0, 2, 3, 0]]) +def test_dataframe_scatter_by_map_7513(ids): + df = DataFrame({"id": ids, "val": [0, 1, 2, 3]}) + with pytest.raises(ValueError): + df.scatter_by_map(df["id"]) + + +def test_dataframe_scatter_by_map_empty(): + df = DataFrame({"a": [], "b": []}, dtype="float64") + scattered = df.scatter_by_map(df["a"]) + assert len(scattered) == 0 diff --git a/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py b/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py new file mode 100644 index 00000000000..1c322ff67af --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py @@ -0,0 +1,194 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
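# The tests below pin DataFrame.sort_values to the pandas contract for null
# ordering: na_position decides whether nulls sort before ("first") or after
# ("last") all valid values, independently of ascending. A minimal sketch
# (illustrative only; not used by the tests in this file):
if __name__ == "__main__":
    import cudf

    demo = cudf.DataFrame({"a": [3.0, None, 1.0]})
    print(demo.sort_values("a", na_position="first"))  # null row sorts first
    print(demo.sort_values("a", na_position="last"))   # null row sorts last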
+ +import string + +import numpy as np +import pandas as pd +import pytest + +from cudf import DataFrame, option_context +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, + expect_warning_if, +) + + +def test_dataframe_sort_values(numeric_types_as_str): + nelem = 25 + rng = np.random.default_rng(seed=0) + df = DataFrame() + df["a"] = aa = (100 * rng.random(nelem)).astype(numeric_types_as_str) + df["b"] = bb = (100 * rng.random(nelem)).astype(numeric_types_as_str) + sorted_df = df.sort_values(by="a") + # Check + sorted_index = np.argsort(aa, kind="mergesort") + assert_eq(sorted_df.index.values, sorted_index) + assert_eq(sorted_df["a"].values, aa[sorted_index]) + assert_eq(sorted_df["b"].values, bb[sorted_index]) + + +def test_sort_values_nans_pandas_compat(): + data = {"a": [0, 0, 2, -1], "b": [1, 3, 2, None]} + with option_context("mode.pandas_compatible", True): + result = DataFrame(data).sort_values("b", na_position="first") + expected = pd.DataFrame(data).sort_values("b", na_position="first") + assert_eq(result, expected) + + +@pytest.mark.parametrize("index", ["a", "b", ["a", "b"]]) +def test_dataframe_sort_values_ignore_index(index, ignore_index): + if ( + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION + and isinstance(index, list) + and not ignore_index + ): + pytest.skip( + reason="Unstable sorting by pandas(numpy): https://github.com/pandas-dev/pandas/issues/57531" + ) + + gdf = DataFrame( + {"a": [1, 3, 5, 2, 4], "b": [1, 1, 2, 2, 3], "c": [9, 7, 7, 7, 1]} + ) + gdf = gdf.set_index(index) + + pdf = gdf.to_pandas() + + expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index) + got = gdf.sort_values((gdf.columns), ignore_index=ignore_index) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "sliceobj", [slice(1, None), slice(None, -1), slice(1, -1)] +) +def test_dataframe_sort_values_sliced(sliceobj): + rng = np.random.default_rng(seed=0) + df = pd.DataFrame({"a": rng.random(20)}) + + expect = df[sliceobj]["a"].sort_values() + gdf = DataFrame.from_pandas(df) + got = gdf[sliceobj]["a"].sort_values() + assert (got.to_pandas() == expect).all() + + +@pytest.mark.parametrize("num_rows", [0, 5]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_dataframe_multi_column( + num_rows, numeric_and_temporal_types_as_str, ascending, na_position +): + num_cols = 5 + rng = np.random.default_rng(seed=0) + by = list(string.ascii_lowercase[:num_cols]) + pdf = pd.DataFrame() + + for i in range(5): + colname = string.ascii_lowercase[i] + data = rng.integers(0, 26, num_rows).astype( + numeric_and_temporal_types_as_str + ) + pdf[colname] = data + + gdf = DataFrame.from_pandas(pdf) + + got = gdf.sort_values(by, ascending=ascending, na_position=na_position) + expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) + + assert_eq( + got[by].reset_index(drop=True), expect[by].reset_index(drop=True) + ) + + +@pytest.mark.parametrize("num_rows", [0, 3]) +@pytest.mark.parametrize("nulls", ["some", "all"]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_dataframe_multi_column_nulls( + num_rows, float_types_as_str, nulls, ascending, na_position +): + num_cols = 2 + rng = np.random.default_rng(seed=0) + by = list(string.ascii_lowercase[:num_cols]) + pdf = pd.DataFrame() + + for colname in string.ascii_lowercase[:3]: + data = rng.integers(0, 26, num_rows).astype(float_types_as_str) + if nulls == "some": + 
idx = np.array([], dtype="int64") + if num_rows > 0: + idx = rng.choice( + num_rows, size=int(num_rows / 4), replace=False + ) + data[idx] = np.nan + elif nulls == "all": + data[:] = np.nan + pdf[colname] = data + + gdf = DataFrame.from_pandas(pdf) + + got = gdf.sort_values(by, ascending=ascending, na_position=na_position) + expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) + + assert_eq( + got[by].reset_index(drop=True), expect[by].reset_index(drop=True) + ) + + +@pytest.mark.parametrize("ascending1", [True, False]) +@pytest.mark.parametrize("ascending2", [True, False]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_dataframe_multi_column_nulls_multiple_ascending( + ascending1, ascending2, na_position +): + ascending = (ascending1, ascending2) + pdf = pd.DataFrame( + {"a": [3, 1, None, 2, 2, None, 1], "b": [1, 2, 3, 4, 5, 6, 7]} + ) + gdf = DataFrame.from_pandas(pdf) + expect = pdf.sort_values( + by=["a", "b"], ascending=ascending, na_position=na_position + ) + actual = gdf.sort_values( + by=["a", "b"], ascending=ascending, na_position=na_position + ) + + assert_eq(actual, expect) + + +@pytest.mark.parametrize( + "kind", ["quicksort", "mergesort", "heapsort", "stable"] +) +def test_dataframe_sort_values_kind(numeric_types_as_str, kind): + nelem = 20 + rng = np.random.default_rng(seed=0) + df = DataFrame() + df["a"] = aa = (100 * rng.random(nelem)).astype(numeric_types_as_str) + df["b"] = bb = (100 * rng.random(nelem)).astype(numeric_types_as_str) + with expect_warning_if(kind != "quicksort", UserWarning): + sorted_df = df.sort_values(by="a", kind=kind) + # Check + sorted_index = np.argsort(aa, kind="mergesort") + assert_eq(sorted_df.index.values, sorted_index) + assert_eq(sorted_df["a"].values, aa[sorted_index]) + assert_eq(sorted_df["b"].values, bb[sorted_index]) + + +def test_sort_values_by_index_level(): + df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="b")) + cudf_df = DataFrame.from_pandas(df) + result = cudf_df.sort_values("b") + expected = df.sort_values("b") + assert_eq(result, expected) + + +def test_sort_values_by_ambiguous(): + df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="a")) + cudf_df = DataFrame.from_pandas(df) + + assert_exceptions_equal( + lfunc=df.sort_values, + rfunc=cudf_df.sort_values, + lfunc_args_and_kwargs=(["a"], {}), + rfunc_args_and_kwargs=(["a"], {}), + ) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py new file mode 100644 index 00000000000..4bbed8fab9e --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023-2025, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest + +import cudf + + +@pytest.mark.parametrize("preserve_index", [False, True, None]) +def test_dataframe_to_arrow_preserve_index(preserve_index): + df = cudf.DataFrame({"x": ["cat", "dog"] * 5}) + pf = df.to_pandas() + expect = pa.Table.from_pandas(pf, preserve_index=preserve_index).schema + got = df.to_arrow(preserve_index=preserve_index).schema + assert expect == got diff --git a/python/cudf/cudf/tests/dataframe/test_attributes.py b/python/cudf/cudf/tests/dataframe/test_attributes.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_attributes.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
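The new `test_to_arrow.py` above pins `DataFrame.to_arrow(preserve_index=...)` to pyarrow's own pandas conversion. Roughly, the three settings behave as follows; a minimal sketch, assuming pyarrow's usual `__index_level_0__` naming for serialized default indexes (illustrative, not part of the patch):

    import cudf

    df = cudf.DataFrame({"x": ["cat", "dog"]})
    # True: materialize the index as a column; False: drop it entirely;
    # None: let a RangeIndex live only in the schema's pandas metadata.
    for preserve_index in (True, False, None):
        print(preserve_index, df.to_arrow(preserve_index=preserve_index).column_names)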
diff --git a/python/cudf/cudf/tests/dataframe/test_binary_operations.py b/python/cudf/cudf/tests/dataframe/test_binary_operations.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_binary_operations.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_combining.py b/python/cudf/cudf/tests/dataframe/test_combining.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_combining.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_computation.py b/python/cudf/cudf/tests/dataframe/test_computation.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_computation.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_constructing.py b/python/cudf/cudf/tests/dataframe/test_constructing.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_constructing.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_function_application.py b/python/cudf/cudf/tests/dataframe/test_function_application.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_function_application.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_indexing.py b/python/cudf/cudf/tests/dataframe/test_indexing.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_indexing.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_io_serialization.py b/python/cudf/cudf/tests/dataframe/test_io_serialization.py
deleted file mode 100644
index 04508dbf2ec..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_io_serialization.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2023-2025, NVIDIA CORPORATION.
-from io import BytesIO
-
-import pandas as pd
-import pyarrow as pa
-import pyarrow.parquet as pq
-import pytest
-
-import cudf
-from cudf.testing import assert_eq
-
-
-@pytest.mark.parametrize(
-    "index",
-    [range(1, 11), list(range(1, 11)), range(1, 11)[::2]],
-    ids=["RangeIndex", "IntIndex", "StridedRange"],
-)
-@pytest.mark.parametrize("write_index", [False, True, None])
-@pytest.mark.parametrize("empty", [False, True], ids=["nonempty", "empty"])
-def test_dataframe_parquet_roundtrip(index, write_index, empty):
-    if empty:
-        data = {}
-    else:
-        data = {"a": [i * 2 for i in index]}
-    df = cudf.DataFrame(data=data, index=index)
-    pf = pd.DataFrame(data=data, index=index)
-    gpu_buf = BytesIO()
-    cpu_buf = BytesIO()
-
-    df.to_parquet(gpu_buf, index=write_index)
-    pf.to_parquet(cpu_buf, index=write_index)
-    gpu_table = pq.read_table(gpu_buf)
-    cpu_table = pq.read_table(cpu_buf)
-    metadata_equal = (
-        gpu_table.schema.pandas_metadata == cpu_table.schema.pandas_metadata
-    )
-    assert metadata_equal
-
-    gpu_read = cudf.read_parquet(gpu_buf)
-    cpu_read = cudf.read_parquet(cpu_buf)
-    assert_eq(gpu_read, cpu_read)
-
-
-@pytest.mark.parametrize("preserve_index", [False, True, None])
-def test_dataframe_to_arrow_preserve_index(preserve_index):
-    df = cudf.DataFrame({"x": ["cat", "dog"] * 5})
-    pf = df.to_pandas()
-    expect = pa.Table.from_pandas(pf, preserve_index=preserve_index).schema
-    got = df.to_arrow(preserve_index=preserve_index).schema
-    assert expect == got
diff --git a/python/cudf/cudf/tests/dataframe/test_missing.py b/python/cudf/cudf/tests/dataframe/test_missing.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_missing.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_reindexing.py b/python/cudf/cudf/tests/dataframe/test_reindexing.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_reindexing.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_reshaping.py b/python/cudf/cudf/tests/dataframe/test_reshaping.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_reshaping.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_selecting.py b/python/cudf/cudf/tests/dataframe/test_selecting.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_selecting.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_sorting.py b/python/cudf/cudf/tests/dataframe/test_sorting.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_sorting.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/tests/dataframe/test_timeseries.py b/python/cudf/cudf/tests/dataframe/test_timeseries.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/dataframe/test_timeseries.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
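The `test_dataframe_parquet_roundtrip` case removed above (and relocated under `input_output/test_parquet.py` per the diffstat) hinges on one detail worth calling out: whether the index survives a parquet round trip is recorded in the pandas metadata of the file's schema. A minimal standalone sketch of that mechanism, assuming only the public `cudf` and `pyarrow.parquet` APIs (the buffer and frame here are illustrative, not taken from the patch):

    from io import BytesIO

    import pyarrow.parquet as pq

    import cudf

    buf = BytesIO()
    cudf.DataFrame({"a": [2, 4]}, index=[1, 2]).to_parquet(buf, index=True)
    buf.seek(0)
    # index=True forces the index into the file; the schema's pandas metadata
    # records how readers should reconstruct it on the way back out.
    print(pq.read_table(buf).schema.pandas_metadata["index_columns"])

With `index=False` the metadata instead marks the index as absent, which is exactly the equality the test asserts against the pandas-written file.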
diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/general_functions/test_api_types.py similarity index 100% rename from python/cudf/cudf/tests/test_api_types.py rename to python/cudf/cudf/tests/general_functions/test_api_types.py diff --git a/python/cudf/cudf/tests/general_functions/test_conversion.py b/python/cudf/cudf/tests/general_functions/test_conversion.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/general_functions/test_conversion.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/general_functions/test_data_manipulation.py b/python/cudf/cudf/tests/general_functions/test_data_manipulation.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/general_functions/test_data_manipulation.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/general_functions/test_datetimelike.py b/python/cudf/cudf/tests/general_functions/test_datetimelike.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/general_functions/test_datetimelike.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/input_output/test_avro.py b/python/cudf/cudf/tests/input_output/test_avro.py index 06777c8e6af..43cf83dc050 100644 --- a/python/cudf/cudf/tests/input_output/test_avro.py +++ b/python/cudf/cudf/tests/input_output/test_avro.py @@ -1 +1,657 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import datetime +import io +import pathlib + +import fastavro +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.testing import assert_eq +from cudf.testing.dataset_generator import rand_dataframe + + +def cudf_from_avro_util(schema: dict, records: list) -> cudf.DataFrame: + schema = [] if schema is None else fastavro.parse_schema(schema) + buffer = io.BytesIO() + fastavro.writer(buffer, schema, records) + buffer.seek(0) + return cudf.read_avro(buffer) + + +@pytest.fixture( + params=[ + ("boolean", "bool"), + ("int", "int32"), + ("long", "int64"), + ("float", "float32"), + ("double", "float64"), + ("bytes", "str"), + ("string", "str"), + ] +) +def avro_type_params(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def nullable(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def prepend_null(request): + return request.param + + +@pytest.mark.parametrize("namespace", [None, "root_ns"]) +def test_can_detect_dtype_from_avro_type( + avro_type_params, namespace, nullable +): + avro_type, expected_dtype = avro_type_params + avro_type = avro_type if not nullable else ["null", avro_type] + + schema = fastavro.parse_schema( + { + "type": "record", + "name": "test", + "namespace": namespace, + "fields": [{"name": "prop", "type": avro_type}], + } + ) + + actual = cudf_from_avro_util(schema, []) + + expected = cudf.DataFrame( + {"prop": cudf.Series(None, None, expected_dtype)} + ) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("namespace", [None, "root_ns"]) +def test_can_detect_dtype_from_avro_type_nested( + avro_type_params, namespace, nullable +): + avro_type, expected_dtype = avro_type_params + avro_type = avro_type if not nullable else ["null", avro_type] + + schema_leaf = { + "name": "leaf", + "type": "record", + "fields": [{"name": "prop3", "type": avro_type}], + } + + schema_child = { + "name": "child", + "type": "record", + "fields": [{"name": "prop2", "type": schema_leaf}], + } + + schema_root = { + "name": "root", + "type": "record", + "namespace": namespace, + "fields": [{"name": "prop1", "type": schema_child}], + } + + actual = cudf_from_avro_util(schema_root, []) + + col_name = "{ns}child.{ns}leaf.prop3".format( + ns="" if namespace is None else namespace + "." 
+ ) + + expected = cudf.DataFrame( + {col_name: cudf.Series(None, None, expected_dtype)} + ) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "avro_type, cudf_type, avro_val, cudf_val", + [ + ("boolean", "bool", True, True), + ("boolean", "bool", False, False), + ("int", "int32", 1234, 1234), + ("long", "int64", 1234, 1234), + ("float", "float32", 12.34, 12.34), + ("double", "float64", 12.34, 12.34), + ("string", "str", "heyϴ", "heyϴ"), + ], +) +def test_can_parse_single_value(avro_type, cudf_type, avro_val, cudf_val): + schema_root = { + "name": "root", + "type": "record", + "fields": [{"name": "prop", "type": ["null", avro_type]}], + } + + records = [ + {"prop": avro_val}, + ] + + actual = cudf_from_avro_util(schema_root, records) + + expected = cudf.DataFrame( + {"prop": cudf.Series(data=[cudf_val], dtype=cudf_type)} + ) + + assert_eq(expected, actual) + + +def test_can_parse_single_null(avro_type_params): + avro_type, expected_dtype = avro_type_params + schema_root = { + "name": "root", + "type": "record", + "fields": [{"name": "prop", "type": ["null", avro_type]}], + } + + records = [{"prop": None}] + + actual = cudf_from_avro_util(schema_root, records) + + expected = cudf.DataFrame( + {"prop": cudf.Series(data=[None], dtype=expected_dtype)} + ) + + assert_eq(expected, actual) + + +def test_can_parse_no_data(avro_type_params): + avro_type, expected_dtype = avro_type_params + schema_root = { + "name": "root", + "type": "record", + "fields": [{"name": "prop", "type": ["null", avro_type]}], + } + + records = [] + + actual = cudf_from_avro_util(schema_root, records) + + expected = cudf.DataFrame( + {"prop": cudf.Series(data=[], dtype=expected_dtype)} + ) + + assert_eq(expected, actual) + + +@pytest.mark.xfail( + reason="cudf avro reader is unable to parse zero-field metadata." +) +def test_can_parse_no_fields(avro_type_params): + avro_type, expected_dtype = avro_type_params + schema_root = { + "name": "root", + "type": "record", + "fields": [], + } + + records = [] + + actual = cudf_from_avro_util(schema_root, records) + + expected = cudf.DataFrame() + + assert_eq(expected, actual) + + +def test_can_parse_no_schema(): + schema_root = None + records = [] + actual = cudf_from_avro_util(schema_root, records) + expected = cudf.DataFrame() + assert_eq(expected, actual) + + +@pytest.mark.parametrize("rows", [0, 20]) +@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) +def test_avro_decompression(set_decomp_env_vars, rows, codec): + schema = { + "name": "root", + "type": "record", + "fields": [ + {"name": "0", "type": "int"}, + {"name": "1", "type": "string"}, + ], + } + + # N.B. rand_dataframe() is brutally slow for some reason. Switching to + # np.random() speeds things up by a factor of 10. 
+ # See also: https://github.com/rapidsai/cudf/issues/13128 + df = rand_dataframe( + [ + {"dtype": "int32", "null_frequency": 0, "cardinality": 1000}, + { + "dtype": "str", + "null_frequency": 0, + "cardinality": 100, + "max_string_length": 10, + }, + ], + rows, + seed=0, + use_threads=False, + ) + expected_df = cudf.DataFrame.from_arrow(df) + + records = df.to_pandas().to_dict(orient="records") + + buffer = io.BytesIO() + fastavro.writer(buffer, schema, records, codec=codec) + buffer.seek(0) + got_df = cudf.read_avro(buffer) + + assert_eq(expected_df, got_df) + + +@pytest.mark.parametrize("namespace", [None, "root_ns"]) +def test_can_detect_dtypes_from_avro_logical_type( + namespace, + nullable, + prepend_null, +): + logical_type = "date" + primitive_type = "int" + expected_dtype = "datetime64[s]" + avro_type = [{"logicalType": logical_type, "type": primitive_type}] + if nullable: + if prepend_null: + avro_type.insert(0, "null") + else: + avro_type.append("null") + + schema = fastavro.parse_schema( + { + "type": "record", + "name": "test", + "namespace": namespace, + "fields": [{"name": "prop", "type": avro_type}], + } + ) + + actual = cudf_from_avro_util(schema, []) + + expected = cudf.DataFrame( + {"prop": cudf.Series(None, None, expected_dtype)} + ) + + assert_eq(expected, actual) + + +def get_days_from_epoch(date: datetime.date | None) -> int | None: + if date is None: + return None + return (date - datetime.date(1970, 1, 1)).days + + +@pytest.mark.parametrize("namespace", [None, "root_ns"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas (datetime(9999, ...) too large)", +) +def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): + avro_type = {"logicalType": "date", "type": "int"} + if nullable: + if prepend_null: + avro_type = ["null", avro_type] + else: + avro_type = [avro_type, "null"] + + schema_dict = { + "type": "record", + "name": "test", + "fields": [ + {"name": "o_date", "type": avro_type}, + ], + } + + if namespace: + schema_dict["namespace"] = namespace + + schema = fastavro.parse_schema(schema_dict) + + # Insert some None values in no particular order. These will get converted + # into avro "nulls" by the fastavro writer (or filtered out if we're not + # nullable). The first and last dates are epoch min/max values, the rest + # are arbitrarily chosen. + dates = [ + None, + datetime.date(1970, 1, 1), + datetime.date(1970, 1, 2), + datetime.date(1981, 10, 25), + None, + None, + datetime.date(2012, 5, 18), + None, + datetime.date(2019, 9, 3), + None, + datetime.date(9999, 12, 31), + ] + + if not nullable: + dates = [date for date in dates if date is not None] + + days_from_epoch = [get_days_from_epoch(date) for date in dates] + + records = [{"o_date": day} for day in days_from_epoch] + + actual = cudf_from_avro_util(schema, records) + + expected = cudf.DataFrame( + {"o_date": cudf.Series(dates, dtype="datetime64[s]")} + ) + + assert_eq(expected, actual) + + +def test_alltypes_plain_avro(): + # During development of the logical type support, the Java avro tests were + # triggering CUDA kernel crashes (null pointer dereferences). We were able + # to replicate the behavior in a C++ test case, and then subsequently came + # up with this Python unit test to also trigger the problematic code path. + # + # So, unlike the other tests, this test is inherently reactive in nature, + # added simply to verify we fixed the problematic code path that was + # causing CUDA kernel crashes. 
+ # + # See https://github.com/rapidsai/cudf/pull/12788#issuecomment-1468822875 + # for more information. + relpath = "../../../../java/src/test/resources/alltypes_plain.avro" + path = pathlib.Path(__file__).parent.parent.joinpath(relpath).resolve() + assert path.is_file(), path + path = str(path) + + with open(path, "rb") as f: + reader = fastavro.reader(f) + records = [record for record in reader] + + # For reference: + # + # >>> from pprint import pprint + # >>> pprint(reader.writer_schema) + # {'fields': [{'name': 'id', 'type': ['int', 'null']}, + # {'name': 'bool_col', 'type': ['boolean', 'null']}, + # {'name': 'tinyint_col', 'type': ['int', 'null']}, + # {'name': 'smallint_col', 'type': ['int', 'null']}, + # {'name': 'int_col', 'type': ['int', 'null']}, + # {'name': 'bigint_col', 'type': ['long', 'null']}, + # {'name': 'float_col', 'type': ['float', 'null']}, + # {'name': 'double_col', 'type': ['double', 'null']}, + # {'name': 'date_string_col', 'type': ['bytes', 'null']}, + # {'name': 'string_col', 'type': ['bytes', 'null']}, + # {'name': 'timestamp_col', + # 'type': [{'logicalType': 'timestamp-micros', + # 'type': 'long'}, + # 'null']}], + # 'name': 'topLevelRecord', + # 'type': 'record'} + # + # >>> pprint(records[0]) + # {'bigint_col': 0, + # 'bool_col': True, + # 'date_string_col': b'03/01/09', + # 'double_col': 0.0, + # 'float_col': 0.0, + # 'id': 4, + # 'int_col': 0, + # 'smallint_col': 0, + # 'string_col': b'0', + # 'timestamp_col': datetime.datetime(2009, 3, 1, 0, 0, + # tzinfo=datetime.timezone.utc), + # 'tinyint_col': 0} + + # Nothing particularly special about these columns, other than them being + # the ones that @davidwendt used to coerce the crash. + columns = ["bool_col", "int_col", "timestamp_col"] + + # This next line would trigger the fatal CUDA kernel crash. + actual = cudf.read_avro(path, columns=columns) + + # If we get here, we haven't crashed, obviously. Verify the returned data + # frame meets our expectations. We need to fiddle with the dtypes of the + # expected data frame in order to correctly match the schema definition and + # our corresponding read_avro()-returned data frame. + + data = [{column: row[column] for column in columns} for row in records] + + # discard timezone information as we don't support it: + expected = pd.DataFrame(data) + expected["timestamp_col"].dt.tz_localize(None) + + # The fastavro.reader supports the `'logicalType': 'timestamp-micros'` used + # by the 'timestamp_col' column, which is converted into Python + # datetime.datetime() objects (see output of pprint(records[0]) above). + # As we don't support that logical type yet in cudf, we need to convert to + # int64, then divide by 1000 to convert from nanoseconds to microseconds. + timestamps = expected["timestamp_col"].astype("int64") + timestamps //= 1000 + expected["timestamp_col"] = timestamps + + # Furthermore, we need to force the 'int_col' into an int32, per the schema + # definition. (It ends up as an int64 due to cudf.DataFrame() defaulting + # all Python int values to int64 sans a dtype= override.) + expected["int_col"] = expected["int_col"].astype("int32") + + assert_eq(actual, expected) + + +def multiblock_testname_ids(param): + (total_rows, num_rows, skip_rows, sync_interval) = param + return f"{total_rows=}-{num_rows=}-{skip_rows=}-{sync_interval=}" + + +# The following values are used to test various boundary conditions associated +# with multiblock avro files. 
Each tuple consists of four values: total number +# of rows to generate, number of rows to limit the result set to, number of +# rows to skip, and number of rows per block. If the total number of rows and +# number of rows (i.e. first and second tuple elements) are equal, it means +# that all rows will be returned. If the rows per block also equals the first +# two numbers, it means that a single block will be used. +@pytest.fixture( + ids=multiblock_testname_ids, + params=[ + (10, 10, 9, 9), + (10, 10, 9, 5), + (10, 10, 9, 3), + (10, 10, 9, 2), + (10, 10, 9, 10), + (10, 10, 8, 2), + (10, 10, 5, 5), + (10, 10, 2, 9), + (10, 10, 2, 2), + (10, 10, 1, 9), + (10, 10, 1, 5), + (10, 10, 1, 2), + (10, 10, 1, 10), + (10, 10, 10, 9), + (10, 10, 10, 5), + (10, 10, 10, 2), + (10, 10, 10, 10), + (10, 10, 0, 9), + (10, 10, 0, 5), + (10, 10, 0, 2), + (10, 10, 0, 10), + (100, 100, 99, 10), + (100, 100, 90, 90), + (100, 100, 90, 89), + (100, 100, 90, 88), + (100, 100, 90, 87), + (100, 100, 90, 5), + (100, 100, 89, 90), + (100, 100, 87, 90), + (100, 100, 50, 7), + (100, 100, 50, 31), + (10, 1, 8, 9), + (100, 1, 99, 10), + (100, 1, 98, 10), + (100, 1, 97, 10), + (100, 3, 90, 87), + (100, 4, 90, 5), + (100, 2, 89, 90), + (100, 9, 87, 90), + (100, 20, 50, 7), + (100, 10, 50, 31), + (100, 20, 50, 31), + (100, 30, 50, 31), + (256, 256, 0, 256), + (256, 256, 0, 32), + (256, 256, 0, 31), + (256, 256, 0, 33), + (256, 256, 31, 32), + (256, 256, 32, 31), + (256, 256, 31, 33), + (512, 512, 0, 32), + (512, 512, 0, 31), + (512, 512, 0, 33), + (512, 512, 31, 32), + (512, 512, 32, 31), + (512, 512, 31, 33), + (1024, 1024, 0, 1), + (1024, 1024, 0, 3), + (1024, 1024, 0, 7), + (1024, 1024, 0, 8), + (1024, 1024, 0, 9), + (1024, 1024, 0, 15), + (1024, 1024, 0, 16), + (1024, 1024, 0, 17), + (1024, 1024, 0, 32), + (1024, 1024, 0, 31), + (1024, 1024, 0, 33), + (1024, 1024, 31, 32), + (1024, 1024, 32, 31), + (1024, 1024, 31, 33), + (2048, 2048, 0, 31), + (2048, 2048, 0, 32), + (2048, 2048, 0, 33), + (2048, 2048, 0, 2048), + ], +) +def total_rows_and_num_rows_and_skip_rows_and_rows_per_block(request): + return request.param + + +# N.B. The float32 and float64 types are chosen specifically to exercise +# the only path in the avro reader GPU code that can process multiple +# rows in parallel (via warp-level parallelism). See the logic around +# the line `if (cur + min_row_size * rows_remaining == end)` in +# gpuDecodeAvroColumnData(). +@pytest.mark.parametrize("dtype", ["str", "float32", "float64"]) +@pytest.mark.parametrize( + "use_sync_interval", + [True, False], + ids=["use_sync_interval", "ignore_sync_interval"], +) +@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) +def test_avro_reader_multiblock( + dtype, + codec, + use_sync_interval, + total_rows_and_num_rows_and_skip_rows_and_rows_per_block, +): + ( + total_rows, + num_rows, + skip_rows, + rows_per_block, + ) = total_rows_and_num_rows_and_skip_rows_and_rows_per_block + + assert total_rows >= num_rows + assert rows_per_block <= total_rows + + limit_rows = num_rows != total_rows + if limit_rows: + assert total_rows >= num_rows + skip_rows + + if dtype == "str": + avro_type = "string" + + # Generate a list of strings, each of which is a 6-digit number, padded + # with leading zeros. This data set was very useful during development + # of the multiblock avro reader logic, as you get implicit feedback as + # to what may have gone wrong when the test fails, based on the + # expected vs actual values. 
+ values = [f"{i:0>6}" for i in range(0, total_rows)] + + # Strings are encoded in avro with a zigzag-encoded length prefix, and + # then the string data. As all of our strings are fixed at length 6, + # we only need one byte to encode the length prefix (0xc). Thus, our + # bytes per row is 6 + 1 = 7. + bytes_per_row = len(values[0]) + 1 + assert bytes_per_row == 7, bytes_per_row + else: + assert dtype in ("float32", "float64") + avro_type = "float" if dtype == "float32" else "double" + rng = np.random.default_rng(seed=0) + # We don't use rand_dataframe() here, because it increases the + # execution time of each test by a factor of 10 or more (it appears + # to use a very costly approach to generating random data). + # See also: https://github.com/rapidsai/cudf/issues/13128 + values = rng.random(total_rows).astype(dtype) + bytes_per_row = values.dtype.itemsize + + # The sync_interval is the number of bytes between sync blocks. We know + # how many bytes we need per row, so we can calculate the number of bytes + # per block by multiplying the number of rows per block by the bytes per + # row. This is the sync interval. + total_bytes_per_block = rows_per_block * bytes_per_row + sync_interval = total_bytes_per_block + + source_df = cudf.DataFrame({"0": pd.Series(values)}) + + if limit_rows: + expected_df = source_df[skip_rows : skip_rows + num_rows].reset_index( + drop=True + ) + else: + expected_df = source_df[skip_rows:].reset_index(drop=True) + + records = source_df.to_pandas().to_dict(orient="records") + + schema = { + "name": "root", + "type": "record", + "fields": [ + {"name": "0", "type": avro_type}, + ], + } + + if use_sync_interval: + kwds = {"sync_interval": sync_interval} + else: + kwds = {} + + kwds["codec"] = codec + + buffer = io.BytesIO() + fastavro.writer(buffer, schema, records, **kwds) + buffer.seek(0) + + if not limit_rows: + # Explicitly set num_rows to None if we want to read all rows. This + # ensures we exercise the logic behind a read_avro() call where the + # caller doesn't specify the number of rows desired (which will be the + # most common use case). + num_rows = None + actual_df = cudf.read_avro(buffer, skiprows=skip_rows, num_rows=num_rows) + + assert_eq(expected_df, actual_df) diff --git a/python/cudf/cudf/tests/input_output/test_parquet.py b/python/cudf/cudf/tests/input_output/test_parquet.py index cde1bccf2f7..a377bcac285 100644 --- a/python/cudf/cudf/tests/input_output/test_parquet.py +++ b/python/cudf/cudf/tests/input_output/test_parquet.py @@ -1,10 +1,14 @@ # Copyright (c) 2023-2025, NVIDIA CORPORATION. 
+from io import BytesIO + import pandas as pd import pyarrow as pa import pyarrow.parquet as pq +import pytest import cudf +from cudf.testing import assert_eq def test_parquet_long_list(tmpdir): @@ -55,3 +59,34 @@ def test_parquet_long_list(tmpdir): actual = cudf.read_parquet(file_name) expected = pd.read_parquet(file_name) assert actual.to_arrow().equals(pa.Table.from_pandas(expected)) + + +@pytest.mark.parametrize( + "index", + [range(1, 11), list(range(1, 11)), range(1, 11)[::2]], + ids=["RangeIndex", "IntIndex", "StridedRange"], +) +@pytest.mark.parametrize("write_index", [False, True, None]) +@pytest.mark.parametrize("empty", [False, True], ids=["nonempty", "empty"]) +def test_dataframe_parquet_roundtrip(index, write_index, empty): + if empty: + data = {} + else: + data = {"a": [i * 2 for i in index]} + df = cudf.DataFrame(data=data, index=index) + pf = pd.DataFrame(data=data, index=index) + gpu_buf = BytesIO() + cpu_buf = BytesIO() + + df.to_parquet(gpu_buf, index=write_index) + pf.to_parquet(cpu_buf, index=write_index) + gpu_table = pq.read_table(gpu_buf) + cpu_table = pq.read_table(cpu_buf) + metadata_equal = ( + gpu_table.schema.pandas_metadata == cpu_table.schema.pandas_metadata + ) + assert metadata_equal + + gpu_read = cudf.read_parquet(gpu_buf) + cpu_read = cudf.read_parquet(cpu_buf) + assert_eq(gpu_read, cpu_read) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py deleted file mode 100644 index 6f66ba79098..00000000000 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ /dev/null @@ -1,658 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import annotations - -import datetime -import io -import pathlib - -import fastavro -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.testing import assert_eq -from cudf.testing.dataset_generator import rand_dataframe - - -def cudf_from_avro_util(schema: dict, records: list) -> cudf.DataFrame: - schema = [] if schema is None else fastavro.parse_schema(schema) - buffer = io.BytesIO() - fastavro.writer(buffer, schema, records) - buffer.seek(0) - return cudf.read_avro(buffer) - - -@pytest.fixture( - params=[ - ("boolean", "bool"), - ("int", "int32"), - ("long", "int64"), - ("float", "float32"), - ("double", "float64"), - ("bytes", "str"), - ("string", "str"), - ] -) -def avro_type_params(request): - return request.param - - -@pytest.fixture(params=[True, False]) -def nullable(request): - return request.param - - -@pytest.fixture(params=[True, False]) -def prepend_null(request): - return request.param - - -@pytest.mark.parametrize("namespace", [None, "root_ns"]) -def test_can_detect_dtype_from_avro_type( - avro_type_params, namespace, nullable -): - avro_type, expected_dtype = avro_type_params - avro_type = avro_type if not nullable else ["null", avro_type] - - schema = fastavro.parse_schema( - { - "type": "record", - "name": "test", - "namespace": namespace, - "fields": [{"name": "prop", "type": avro_type}], - } - ) - - actual = cudf_from_avro_util(schema, []) - - expected = cudf.DataFrame( - {"prop": cudf.Series(None, None, expected_dtype)} - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("namespace", [None, "root_ns"]) -def test_can_detect_dtype_from_avro_type_nested( - avro_type_params, namespace, nullable -): - avro_type, expected_dtype = avro_type_params - avro_type = avro_type if not nullable else ["null", avro_type] - - schema_leaf = { - "name": "leaf", - "type": "record", - "fields": [{"name": "prop3", "type": avro_type}], - } - - schema_child = { - "name": "child", - "type": "record", - "fields": [{"name": "prop2", "type": schema_leaf}], - } - - schema_root = { - "name": "root", - "type": "record", - "namespace": namespace, - "fields": [{"name": "prop1", "type": schema_child}], - } - - actual = cudf_from_avro_util(schema_root, []) - - col_name = "{ns}child.{ns}leaf.prop3".format( - ns="" if namespace is None else namespace + "." 
- ) - - expected = cudf.DataFrame( - {col_name: cudf.Series(None, None, expected_dtype)} - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "avro_type, cudf_type, avro_val, cudf_val", - [ - ("boolean", "bool", True, True), - ("boolean", "bool", False, False), - ("int", "int32", 1234, 1234), - ("long", "int64", 1234, 1234), - ("float", "float32", 12.34, 12.34), - ("double", "float64", 12.34, 12.34), - ("string", "str", "heyϴ", "heyϴ"), - # ("bytes", "str", "heyϴ", "heyϴ"), - ], -) -def test_can_parse_single_value(avro_type, cudf_type, avro_val, cudf_val): - schema_root = { - "name": "root", - "type": "record", - "fields": [{"name": "prop", "type": ["null", avro_type]}], - } - - records = [ - {"prop": avro_val}, - ] - - actual = cudf_from_avro_util(schema_root, records) - - expected = cudf.DataFrame( - {"prop": cudf.Series(data=[cudf_val], dtype=cudf_type)} - ) - - assert_eq(expected, actual) - - -def test_can_parse_single_null(avro_type_params): - avro_type, expected_dtype = avro_type_params - schema_root = { - "name": "root", - "type": "record", - "fields": [{"name": "prop", "type": ["null", avro_type]}], - } - - records = [{"prop": None}] - - actual = cudf_from_avro_util(schema_root, records) - - expected = cudf.DataFrame( - {"prop": cudf.Series(data=[None], dtype=expected_dtype)} - ) - - assert_eq(expected, actual) - - -def test_can_parse_no_data(avro_type_params): - avro_type, expected_dtype = avro_type_params - schema_root = { - "name": "root", - "type": "record", - "fields": [{"name": "prop", "type": ["null", avro_type]}], - } - - records = [] - - actual = cudf_from_avro_util(schema_root, records) - - expected = cudf.DataFrame( - {"prop": cudf.Series(data=[], dtype=expected_dtype)} - ) - - assert_eq(expected, actual) - - -@pytest.mark.xfail( - reason="cudf avro reader is unable to parse zero-field metadata." -) -def test_can_parse_no_fields(avro_type_params): - avro_type, expected_dtype = avro_type_params - schema_root = { - "name": "root", - "type": "record", - "fields": [], - } - - records = [] - - actual = cudf_from_avro_util(schema_root, records) - - expected = cudf.DataFrame() - - assert_eq(expected, actual) - - -def test_can_parse_no_schema(): - schema_root = None - records = [] - actual = cudf_from_avro_util(schema_root, records) - expected = cudf.DataFrame() - assert_eq(expected, actual) - - -@pytest.mark.parametrize("rows", [0, 1, 10, 1000]) -@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) -def test_avro_decompression(set_decomp_env_vars, rows, codec): - schema = { - "name": "root", - "type": "record", - "fields": [ - {"name": "0", "type": "int"}, - {"name": "1", "type": "string"}, - ], - } - - # N.B. rand_dataframe() is brutally slow for some reason. Switching to - # np.random() speeds things up by a factor of 10. 
- # See also: https://github.com/rapidsai/cudf/issues/13128 - df = rand_dataframe( - [ - {"dtype": "int32", "null_frequency": 0, "cardinality": 1000}, - { - "dtype": "str", - "null_frequency": 0, - "cardinality": 100, - "max_string_length": 10, - }, - ], - rows, - seed=0, - use_threads=False, - ) - expected_df = cudf.DataFrame.from_arrow(df) - - records = df.to_pandas().to_dict(orient="records") - - buffer = io.BytesIO() - fastavro.writer(buffer, schema, records, codec=codec) - buffer.seek(0) - got_df = cudf.read_avro(buffer) - - assert_eq(expected_df, got_df) - - -@pytest.mark.parametrize("namespace", [None, "root_ns"]) -def test_can_detect_dtypes_from_avro_logical_type( - namespace, - nullable, - prepend_null, -): - logical_type = "date" - primitive_type = "int" - expected_dtype = "datetime64[s]" - avro_type = [{"logicalType": logical_type, "type": primitive_type}] - if nullable: - if prepend_null: - avro_type.insert(0, "null") - else: - avro_type.append("null") - - schema = fastavro.parse_schema( - { - "type": "record", - "name": "test", - "namespace": namespace, - "fields": [{"name": "prop", "type": avro_type}], - } - ) - - actual = cudf_from_avro_util(schema, []) - - expected = cudf.DataFrame( - {"prop": cudf.Series(None, None, expected_dtype)} - ) - - assert_eq(expected, actual) - - -def get_days_from_epoch(date: datetime.date | None) -> int | None: - if date is None: - return None - return (date - datetime.date(1970, 1, 1)).days - - -@pytest.mark.parametrize("namespace", [None, "root_ns"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas (datetime(9999, ...) too large)", -) -def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): - avro_type = {"logicalType": "date", "type": "int"} - if nullable: - if prepend_null: - avro_type = ["null", avro_type] - else: - avro_type = [avro_type, "null"] - - schema_dict = { - "type": "record", - "name": "test", - "fields": [ - {"name": "o_date", "type": avro_type}, - ], - } - - if namespace: - schema_dict["namespace"] = namespace - - schema = fastavro.parse_schema(schema_dict) - - # Insert some None values in no particular order. These will get converted - # into avro "nulls" by the fastavro writer (or filtered out if we're not - # nullable). The first and last dates are epoch min/max values, the rest - # are arbitrarily chosen. - dates = [ - None, - datetime.date(1970, 1, 1), - datetime.date(1970, 1, 2), - datetime.date(1981, 10, 25), - None, - None, - datetime.date(2012, 5, 18), - None, - datetime.date(2019, 9, 3), - None, - datetime.date(9999, 12, 31), - ] - - if not nullable: - dates = [date for date in dates if date is not None] - - days_from_epoch = [get_days_from_epoch(date) for date in dates] - - records = [{"o_date": day} for day in days_from_epoch] - - actual = cudf_from_avro_util(schema, records) - - expected = cudf.DataFrame( - {"o_date": cudf.Series(dates, dtype="datetime64[s]")} - ) - - assert_eq(expected, actual) - - -def test_alltypes_plain_avro(): - # During development of the logical type support, the Java avro tests were - # triggering CUDA kernel crashes (null pointer dereferences). We were able - # to replicate the behavior in a C++ test case, and then subsequently came - # up with this Python unit test to also trigger the problematic code path. - # - # So, unlike the other tests, this test is inherently reactive in nature, - # added simply to verify we fixed the problematic code path that was - # causing CUDA kernel crashes. 
- # - # See https://github.com/rapidsai/cudf/pull/12788#issuecomment-1468822875 - # for more information. - relpath = "../../../../java/src/test/resources/alltypes_plain.avro" - path = pathlib.Path(__file__).parent.joinpath(relpath).resolve() - assert path.is_file(), path - path = str(path) - - with open(path, "rb") as f: - reader = fastavro.reader(f) - records = [record for record in reader] - - # For reference: - # - # >>> from pprint import pprint - # >>> pprint(reader.writer_schema) - # {'fields': [{'name': 'id', 'type': ['int', 'null']}, - # {'name': 'bool_col', 'type': ['boolean', 'null']}, - # {'name': 'tinyint_col', 'type': ['int', 'null']}, - # {'name': 'smallint_col', 'type': ['int', 'null']}, - # {'name': 'int_col', 'type': ['int', 'null']}, - # {'name': 'bigint_col', 'type': ['long', 'null']}, - # {'name': 'float_col', 'type': ['float', 'null']}, - # {'name': 'double_col', 'type': ['double', 'null']}, - # {'name': 'date_string_col', 'type': ['bytes', 'null']}, - # {'name': 'string_col', 'type': ['bytes', 'null']}, - # {'name': 'timestamp_col', - # 'type': [{'logicalType': 'timestamp-micros', - # 'type': 'long'}, - # 'null']}], - # 'name': 'topLevelRecord', - # 'type': 'record'} - # - # >>> pprint(records[0]) - # {'bigint_col': 0, - # 'bool_col': True, - # 'date_string_col': b'03/01/09', - # 'double_col': 0.0, - # 'float_col': 0.0, - # 'id': 4, - # 'int_col': 0, - # 'smallint_col': 0, - # 'string_col': b'0', - # 'timestamp_col': datetime.datetime(2009, 3, 1, 0, 0, - # tzinfo=datetime.timezone.utc), - # 'tinyint_col': 0} - - # Nothing particularly special about these columns, other than them being - # the ones that @davidwendt used to coerce the crash. - columns = ["bool_col", "int_col", "timestamp_col"] - - # This next line would trigger the fatal CUDA kernel crash. - actual = cudf.read_avro(path, columns=columns) - - # If we get here, we haven't crashed, obviously. Verify the returned data - # frame meets our expectations. We need to fiddle with the dtypes of the - # expected data frame in order to correctly match the schema definition and - # our corresponding read_avro()-returned data frame. - - data = [{column: row[column] for column in columns} for row in records] - - # discard timezone information as we don't support it: - expected = pd.DataFrame(data) - expected["timestamp_col"].dt.tz_localize(None) - - # The fastavro.reader supports the `'logicalType': 'timestamp-micros'` used - # by the 'timestamp_col' column, which is converted into Python - # datetime.datetime() objects (see output of pprint(records[0]) above). - # As we don't support that logical type yet in cudf, we need to convert to - # int64, then divide by 1000 to convert from nanoseconds to microseconds. - timestamps = expected["timestamp_col"].astype("int64") - timestamps //= 1000 - expected["timestamp_col"] = timestamps - - # Furthermore, we need to force the 'int_col' into an int32, per the schema - # definition. (It ends up as an int64 due to cudf.DataFrame() defaulting - # all Python int values to int64 sans a dtype= override.) - expected["int_col"] = expected["int_col"].astype("int32") - - assert_eq(actual, expected) - - -def multiblock_testname_ids(param): - (total_rows, num_rows, skip_rows, sync_interval) = param - return f"{total_rows=}-{num_rows=}-{skip_rows=}-{sync_interval=}" - - -# The following values are used to test various boundary conditions associated -# with multiblock avro files. 
Each tuple consists of four values: total number -# of rows to generate, number of rows to limit the result set to, number of -# rows to skip, and number of rows per block. If the total number of rows and -# number of rows (i.e. first and second tuple elements) are equal, it means -# that all rows will be returned. If the rows per block also equals the first -# two numbers, it means that a single block will be used. -@pytest.fixture( - ids=multiblock_testname_ids, - params=[ - (10, 10, 9, 9), - (10, 10, 9, 5), - (10, 10, 9, 3), - (10, 10, 9, 2), - (10, 10, 9, 10), - (10, 10, 8, 2), - (10, 10, 5, 5), - (10, 10, 2, 9), - (10, 10, 2, 2), - (10, 10, 1, 9), - (10, 10, 1, 5), - (10, 10, 1, 2), - (10, 10, 1, 10), - (10, 10, 10, 9), - (10, 10, 10, 5), - (10, 10, 10, 2), - (10, 10, 10, 10), - (10, 10, 0, 9), - (10, 10, 0, 5), - (10, 10, 0, 2), - (10, 10, 0, 10), - (100, 100, 99, 10), - (100, 100, 90, 90), - (100, 100, 90, 89), - (100, 100, 90, 88), - (100, 100, 90, 87), - (100, 100, 90, 5), - (100, 100, 89, 90), - (100, 100, 87, 90), - (100, 100, 50, 7), - (100, 100, 50, 31), - (10, 1, 8, 9), - (100, 1, 99, 10), - (100, 1, 98, 10), - (100, 1, 97, 10), - (100, 3, 90, 87), - (100, 4, 90, 5), - (100, 2, 89, 90), - (100, 9, 87, 90), - (100, 20, 50, 7), - (100, 10, 50, 31), - (100, 20, 50, 31), - (100, 30, 50, 31), - (256, 256, 0, 256), - (256, 256, 0, 32), - (256, 256, 0, 31), - (256, 256, 0, 33), - (256, 256, 31, 32), - (256, 256, 32, 31), - (256, 256, 31, 33), - (512, 512, 0, 32), - (512, 512, 0, 31), - (512, 512, 0, 33), - (512, 512, 31, 32), - (512, 512, 32, 31), - (512, 512, 31, 33), - (1024, 1024, 0, 1), - (1024, 1024, 0, 3), - (1024, 1024, 0, 7), - (1024, 1024, 0, 8), - (1024, 1024, 0, 9), - (1024, 1024, 0, 15), - (1024, 1024, 0, 16), - (1024, 1024, 0, 17), - (1024, 1024, 0, 32), - (1024, 1024, 0, 31), - (1024, 1024, 0, 33), - (1024, 1024, 31, 32), - (1024, 1024, 32, 31), - (1024, 1024, 31, 33), - (2048, 2048, 0, 31), - (2048, 2048, 0, 32), - (2048, 2048, 0, 33), - (2048, 2048, 0, 2048), - ], -) -def total_rows_and_num_rows_and_skip_rows_and_rows_per_block(request): - return request.param - - -# N.B. The float32 and float64 types are chosen specifically to exercise -# the only path in the avro reader GPU code that can process multiple -# rows in parallel (via warp-level parallelism). See the logic around -# the line `if (cur + min_row_size * rows_remaining == end)` in -# gpuDecodeAvroColumnData(). -@pytest.mark.parametrize("dtype", ["str", "float32", "float64"]) -@pytest.mark.parametrize( - "use_sync_interval", - [True, False], - ids=["use_sync_interval", "ignore_sync_interval"], -) -@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) -def test_avro_reader_multiblock( - dtype, - codec, - use_sync_interval, - total_rows_and_num_rows_and_skip_rows_and_rows_per_block, -): - ( - total_rows, - num_rows, - skip_rows, - rows_per_block, - ) = total_rows_and_num_rows_and_skip_rows_and_rows_per_block - - assert total_rows >= num_rows - assert rows_per_block <= total_rows - - limit_rows = num_rows != total_rows - if limit_rows: - assert total_rows >= num_rows + skip_rows - - if dtype == "str": - avro_type = "string" - - # Generate a list of strings, each of which is a 6-digit number, padded - # with leading zeros. This data set was very useful during development - # of the multiblock avro reader logic, as you get implicit feedback as - # to what may have gone wrong when the test fails, based on the - # expected vs actual values. 
- values = [f"{i:0>6}" for i in range(0, total_rows)] - - # Strings are encoded in avro with a zigzag-encoded length prefix, and - # then the string data. As all of our strings are fixed at length 6, - # we only need one byte to encode the length prefix (0xc). Thus, our - # bytes per row is 6 + 1 = 7. - bytes_per_row = len(values[0]) + 1 - assert bytes_per_row == 7, bytes_per_row - else: - assert dtype in ("float32", "float64") - avro_type = "float" if dtype == "float32" else "double" - rng = np.random.default_rng(seed=0) - # We don't use rand_dataframe() here, because it increases the - # execution time of each test by a factor of 10 or more (it appears - # to use a very costly approach to generating random data). - # See also: https://github.com/rapidsai/cudf/issues/13128 - values = rng.random(total_rows).astype(dtype) - bytes_per_row = values.dtype.itemsize - - # The sync_interval is the number of bytes between sync blocks. We know - # how many bytes we need per row, so we can calculate the number of bytes - # per block by multiplying the number of rows per block by the bytes per - # row. This is the sync interval. - total_bytes_per_block = rows_per_block * bytes_per_row - sync_interval = total_bytes_per_block - - source_df = cudf.DataFrame({"0": pd.Series(values)}) - - if limit_rows: - expected_df = source_df[skip_rows : skip_rows + num_rows].reset_index( - drop=True - ) - else: - expected_df = source_df[skip_rows:].reset_index(drop=True) - - records = source_df.to_pandas().to_dict(orient="records") - - schema = { - "name": "root", - "type": "record", - "fields": [ - {"name": "0", "type": avro_type}, - ], - } - - if use_sync_interval: - kwds = {"sync_interval": sync_interval} - else: - kwds = {} - - kwds["codec"] = codec - - buffer = io.BytesIO() - fastavro.writer(buffer, schema, records, **kwds) - buffer.seek(0) - - if not limit_rows: - # Explicitly set num_rows to None if we want to read all rows. This - # ensures we exercise the logic behind a read_avro() call where the - # caller doesn't specify the number of rows desired (which will be the - # most common use case). - num_rows = None - actual_df = cudf.read_avro(buffer, skiprows=skip_rows, num_rows=num_rows) - - assert_eq(expected_df, actual_df) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index be1e278eea6..2ee6904ef4d 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -1,20 +1,14 @@ # Copyright (c) 2018-2025, NVIDIA CORPORATION. 
-import string import numpy as np import pandas as pd import pytest -from cudf import DataFrame, Series, option_context -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.core.column import NumericalColumn +from cudf import Series from cudf.testing import assert_eq from cudf.testing._utils import ( - DATETIME_TYPES, - NUMERIC_TYPES, assert_exceptions_equal, - expect_warning_if, ) @@ -42,52 +36,6 @@ def sliceobj(request): return request.param -def test_dataframe_sort_values(nelem, dtype): - rng = np.random.default_rng(seed=0) - df = DataFrame() - df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) - df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) - sorted_df = df.sort_values(by="a") - # Check - sorted_index = np.argsort(aa, kind="mergesort") - assert_eq(sorted_df.index.values, sorted_index) - assert_eq(sorted_df["a"].values, aa[sorted_index]) - assert_eq(sorted_df["b"].values, bb[sorted_index]) - - -def test_sort_values_nans_pandas_compat(): - data = {"a": [0, 0, 2, -1], "b": [1, 3, 2, None]} - with option_context("mode.pandas_compatible", True): - result = DataFrame(data).sort_values("b", na_position="first") - expected = pd.DataFrame(data).sort_values("b", na_position="first") - assert_eq(result, expected) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("index", ["a", "b", ["a", "b"]]) -def test_dataframe_sort_values_ignore_index(index, ignore_index): - if ( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and isinstance(index, list) - and not ignore_index - ): - pytest.skip( - reason="Unstable sorting by pandas(numpy): https://github.com/pandas-dev/pandas/issues/57531" - ) - - gdf = DataFrame( - {"a": [1, 3, 5, 2, 4], "b": [1, 1, 2, 2, 3], "c": [9, 7, 7, 7, 1]} - ) - gdf = gdf.set_index(index) - - pdf = gdf.to_pandas() - - expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index) - got = gdf.sort_values((gdf.columns), ignore_index=ignore_index) - - assert_eq(expect, got) - - @pytest.mark.parametrize("ignore_index", [True, False]) def test_series_sort_values_ignore_index(ignore_index): gsr = Series([1, 3, 5, 2, 4]) @@ -98,18 +46,6 @@ def test_series_sort_values_ignore_index(ignore_index): assert_eq(expect, got) -@pytest.mark.parametrize("nelem", [10, 100]) -def test_dataframe_sort_values_sliced(nelem, sliceobj): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame() - df["a"] = rng.random(nelem) - - expect = df[sliceobj]["a"].sort_values() - gdf = DataFrame.from_pandas(df) - got = gdf[sliceobj]["a"].sort_values() - assert (got.to_pandas() == expect).all() - - @pytest.mark.parametrize("asc", [True, False]) def test_series_argsort(nelem, dtype, asc): rng = np.random.default_rng(seed=0) @@ -171,262 +107,3 @@ def test_series_nsmallest(data, n): lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), ) - - -@pytest.mark.parametrize("nelem,n", [(1, 1), (100, 100), (10, 5), (100, 10)]) -@pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) -@pytest.mark.parametrize("columns", ["a", ["b", "a"]]) -def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): - rng = np.random.default_rng(seed=0) - aa = rng.random(nelem) - bb = rng.random(nelem) - - df = DataFrame({"a": aa, "b": bb}) - pdf = df.to_pandas() - assert_eq(getattr(df, op)(n, columns), getattr(pdf, op)(n, columns)) - - -@pytest.mark.parametrize("counts", [(10, 5), (100, 10)]) -def test_dataframe_nlargest_sliced(counts, sliceobj): - nelem, n = counts - rng = 
np.random.default_rng(seed=0) - df = pd.DataFrame() - df["a"] = rng.random(nelem) - df["b"] = rng.random(nelem) - - expect = df[sliceobj].nlargest(n, "a") - gdf = DataFrame.from_pandas(df) - got = gdf[sliceobj].nlargest(n, "a") - assert (got.to_pandas() == expect).all().all() - - -@pytest.mark.parametrize("counts", [(10, 5), (100, 10)]) -def test_dataframe_nsmallest_sliced(counts, sliceobj): - nelem, n = counts - rng = np.random.default_rng(seed=0) - df = pd.DataFrame() - df["a"] = rng.random(nelem) - df["b"] = rng.random(nelem) - - expect = df[sliceobj].nsmallest(n, "a") - gdf = DataFrame.from_pandas(df) - got = gdf[sliceobj].nsmallest(n, "a") - assert (got.to_pandas() == expect).all().all() - - -@pytest.mark.parametrize("num_cols", [1, 2, 3, 5]) -@pytest.mark.parametrize("num_rows", [0, 1, 2, 1000]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column( - num_cols, num_rows, dtype, ascending, na_position -): - rng = np.random.default_rng(seed=0) - by = list(string.ascii_lowercase[:num_cols]) - pdf = pd.DataFrame() - - for i in range(5): - colname = string.ascii_lowercase[i] - data = rng.integers(0, 26, num_rows).astype(dtype) - pdf[colname] = data - - gdf = DataFrame.from_pandas(pdf) - - got = gdf.sort_values(by, ascending=ascending, na_position=na_position) - expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) - - assert_eq( - got[by].reset_index(drop=True), expect[by].reset_index(drop=True) - ) - - -@pytest.mark.parametrize("num_cols", [1, 2, 3]) -@pytest.mark.parametrize("num_rows", [0, 1, 2, 3, 5]) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -@pytest.mark.parametrize("nulls", ["some", "all"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column_nulls( - num_cols, num_rows, dtype, nulls, ascending, na_position -): - rng = np.random.default_rng(seed=0) - by = list(string.ascii_lowercase[:num_cols]) - pdf = pd.DataFrame() - - for i in range(3): - colname = string.ascii_lowercase[i] - data = rng.integers(0, 26, num_rows).astype(dtype) - if nulls == "some": - idx = np.array([], dtype="int64") - if num_rows > 0: - idx = rng.choice( - num_rows, size=int(num_rows / 4), replace=False - ) - data[idx] = np.nan - elif nulls == "all": - data[:] = np.nan - pdf[colname] = data - - gdf = DataFrame.from_pandas(pdf) - - got = gdf.sort_values(by, ascending=ascending, na_position=na_position) - expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) - - assert_eq( - got[by].reset_index(drop=True), expect[by].reset_index(drop=True) - ) - - -@pytest.mark.parametrize("ascending1", [True, False]) -@pytest.mark.parametrize("ascending2", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column_nulls_multiple_ascending( - ascending1, ascending2, na_position -): - ascending = (ascending1, ascending2) - pdf = pd.DataFrame( - {"a": [3, 1, None, 2, 2, None, 1], "b": [1, 2, 3, 4, 5, 6, 7]} - ) - gdf = DataFrame.from_pandas(pdf) - expect = pdf.sort_values( - by=["a", "b"], ascending=ascending, na_position=na_position - ) - actual = gdf.sort_values( - by=["a", "b"], ascending=ascending, na_position=na_position - ) - - assert_eq(actual, expect) - - -@pytest.mark.parametrize("nelem", [1, 100]) -def test_series_nlargest_nelem(nelem): - rng = 
np.random.default_rng(seed=0) - elems = rng.random(nelem) - gds = Series(elems).nlargest(nelem) - pds = pd.Series(elems).nlargest(nelem) - - assert (pds == gds.to_pandas()).all().all() - - -@pytest.mark.parametrize("map_size", [1, 2, 8]) -@pytest.mark.parametrize("nelem", [1, 10, 100]) -@pytest.mark.parametrize("keep", [True, False]) -def test_dataframe_scatter_by_map(map_size, nelem, keep): - strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"] - rng = np.random.default_rng(seed=0) - df = DataFrame( - { - "a": rng.choice(strlist[:map_size], nelem), - "b": rng.uniform(low=0, high=map_size, size=nelem), - "c": rng.integers(map_size, size=nelem), - } - ) - df["d"] = df["a"].astype("category") - - def _check_scatter_by_map(dfs, col): - assert len(dfs) == map_size - nrows = 0 - name = col.name - for i, df in enumerate(dfs): - nrows += len(df) - if len(df) > 0: - # Make sure the column types were preserved - assert isinstance(df[name]._column, type(col._column)) - try: - sr = df[name].astype(np.int32) - except ValueError: - sr = df[name] - assert sr.nunique() <= 1 - if sr.nunique() == 1: - if isinstance(df[name]._column, NumericalColumn): - assert sr.iloc[0] == i - assert nrows == nelem - - with pytest.warns(UserWarning): - _check_scatter_by_map( - df.scatter_by_map("a", map_size, keep_index=keep), df["a"] - ) - _check_scatter_by_map( - df.scatter_by_map("b", map_size, keep_index=keep), df["b"] - ) - _check_scatter_by_map( - df.scatter_by_map("c", map_size, keep_index=keep), df["c"] - ) - with pytest.warns(UserWarning): - _check_scatter_by_map( - df.scatter_by_map("d", map_size, keep_index=keep), df["d"] - ) - - if map_size == 2 and nelem == 100: - with pytest.warns(UserWarning): - df.scatter_by_map("a") # Auto-detect map_size - with pytest.raises(ValueError): - with pytest.warns(UserWarning): - df.scatter_by_map("a", map_size=1, debug=True) # Bad map_size - - # Test Index - df2 = df.set_index("c") - generic_result = df2.scatter_by_map("b", map_size, keep_index=keep) - _check_scatter_by_map(generic_result, df2["b"]) - if keep: - for frame in generic_result: - isinstance(frame.index, type(df2.index)) - - # Test MultiIndex - df2 = df.set_index(["a", "c"]) - multiindex_result = df2.scatter_by_map("b", map_size, keep_index=keep) - _check_scatter_by_map(multiindex_result, df2["b"]) - if keep: - for frame in multiindex_result: - isinstance(frame.index, type(df2.index)) - - -@pytest.mark.parametrize( - "kind", ["quicksort", "mergesort", "heapsort", "stable"] -) -def test_dataframe_sort_values_kind(nelem, dtype, kind): - rng = np.random.default_rng(seed=0) - df = DataFrame() - df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) - df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) - with expect_warning_if(kind != "quicksort", UserWarning): - sorted_df = df.sort_values(by="a", kind=kind) - # Check - sorted_index = np.argsort(aa, kind="mergesort") - assert_eq(sorted_df.index.values, sorted_index) - assert_eq(sorted_df["a"].values, aa[sorted_index]) - assert_eq(sorted_df["b"].values, bb[sorted_index]) - - -@pytest.mark.parametrize("ids", [[-1, 0, 1, 0], [0, 2, 3, 0]]) -def test_dataframe_scatter_by_map_7513(ids): - df = DataFrame({"id": ids, "val": [0, 1, 2, 3]}) - with pytest.raises(ValueError): - df.scatter_by_map(df["id"]) - - -def test_dataframe_scatter_by_map_empty(): - df = DataFrame({"a": [], "b": []}, dtype="float64") - scattered = df.scatter_by_map(df["a"]) - assert len(scattered) == 0 - - -def test_sort_values_by_index_level(): - df = pd.DataFrame({"a": [1, 3, 2]}, 
index=pd.Index([1, 3, 2], name="b")) - cudf_df = DataFrame.from_pandas(df) - result = cudf_df.sort_values("b") - expected = df.sort_values("b") - assert_eq(result, expected) - - -def test_sort_values_by_ambiguous(): - df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="a")) - cudf_df = DataFrame.from_pandas(df) - - assert_exceptions_equal( - lfunc=df.sort_values, - rfunc=cudf_df.sort_values, - lfunc_args_and_kwargs=(["a"], {}), - rfunc_args_and_kwargs=(["a"], {}), - ) From 4956cd75529ded809eb6408e977e4fa9a44b5451 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 30 Jul 2025 21:56:20 -0400 Subject: [PATCH 034/366] Make dividing a boolean column return f64 dtype in cudf-polars (#19443) Closes https://github.com/rapidsai/cudf/issues/19408 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19443 --- python/cudf_polars/cudf_polars/dsl/translate.py | 2 ++ python/cudf_polars/cudf_polars/testing/plugin.py | 8 -------- .../cudf_polars/tests/expressions/test_numeric_binops.py | 9 +++++++++ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 6db8085fab4..5c5537be43e 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -864,6 +864,8 @@ def _( dtype: DataType, schema: Schema, ) -> expr.Expr: + if plc.traits.is_boolean(dtype.plc) and node.op == pl_expr.Operator.TrueDivide: + dtype = DataType(pl.Float64()) return expr.BinOp( dtype, expr.BinOp._MAPPING[node.op], diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index f1d7820c809..e205a7299d3 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -127,14 +127,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match", "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", - "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "https://github.com/rapidsai/cudf/issues/19408", - "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "https://github.com/rapidsai/cudf/issues/19408", - "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "https://github.com/rapidsai/cudf/issues/19408", - "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "https://github.com/rapidsai/cudf/issues/19408", - "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "https://github.com/rapidsai/cudf/issues/19408", - 
"tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "https://github.com/rapidsai/cudf/issues/19408", - "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "https://github.com/rapidsai/cudf/issues/19408", - "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "https://github.com/rapidsai/cudf/issues/19408", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_shorthand_quantile": "libcudf quantiles are round to nearest ties to even, polars quantiles are round to nearest ties away from zero", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index 62e14a0f5f3..690b4173b8c 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -97,3 +97,12 @@ def test_floor_div_binop_by_zero(zero, ltype): q = df.select(pl.col("a") // zero) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("divisor", [1, 2.0]) +def test_true_div_boolean_column(divisor): + df = pl.LazyFrame({"a": [True, False]}) + + q = df.select(pl.col("a") / divisor) + + assert_gpu_result_equal(q) From dfba7d3fced0063914a837d77f4c1d3fa77a338a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 31 Jul 2025 09:39:36 -0400 Subject: [PATCH 035/366] Fix contiguous-split nvbench cmake build (#19534) Fixes the cmake configure line for `CONTIGUOUS_SPLIT_NVBENCH` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19534 --- cpp/benchmarks/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 12ebdf8ef2b..c8d2cae9fd2 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -151,7 +151,7 @@ ConfigureNVBench( # ################################################################################################## # * contiguous_split benchmark ------------------------------------------------------------------- -ConfigureBench(CONTIGUOUS_SPLIT_NVBENCH contiguous_split/contiguous_split.cpp) +ConfigureNVBench(CONTIGUOUS_SPLIT_NVBENCH contiguous_split/contiguous_split.cpp) # ################################################################################################## # * lists scatter benchmark ----------------------------------------------------------------------- From 2741661ea039b4277af14b129eae5e7361a1e0f7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 31 Jul 2025 09:39:57 -0400 Subject: [PATCH 036/366] Remove c++ stringview interop example (#19516) Removes the Arrow stringview interop example since libcudf supports this conversion in its interop APIs now. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Muhammad Haseeb (https://github.com/mhaseeb123) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/19516 --- cpp/examples/build.sh | 1 - cpp/examples/interop/CMakeLists.txt | 34 ------ cpp/examples/interop/interop.cpp | 177 ---------------------------- 3 files changed, 212 deletions(-) delete mode 100644 cpp/examples/interop/CMakeLists.txt delete mode 100644 cpp/examples/interop/interop.cpp diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index e6ceaf5b6e6..7296a1afd04 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -63,4 +63,3 @@ build_example string_transforms build_example nested_types build_example parquet_io build_example billion_rows -build_example interop diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt deleted file mode 100644 index f974735b979..00000000000 --- a/cpp/examples/interop/CMakeLists.txt +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. - -cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) - -include(../set_cuda_architecture.cmake) - -rapids_cuda_init_architectures(interop_example) - -project( - interop_example - VERSION 0.0.1 - LANGUAGES CXX CUDA -) - -include(../fetch_dependencies.cmake) - -include(rapids-cmake) -rapids_cmake_build_type("Release") - -# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the -# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with -# gcc>=14. We can remove this once we upgrade to a newer sccache version. -set(CMAKE_CXX_SCAN_FOR_MODULES OFF) - -# The Arrow CMake is currently broken if the build type is not set -set(CMAKE_BUILD_TYPE Release) -# No need to install Arrow libs when only the final example executable is shipped. -set(CUDF_EXCLUDE_ARROW_FROM_ALL ON) -include(../../cmake/thirdparty/get_arrow.cmake) - -add_executable(interop interop.cpp) -target_link_libraries(interop PRIVATE cudf::cudf) -target_compile_features(interop PRIVATE cxx_std_20) -target_link_libraries(interop PRIVATE ${ARROW_LIBRARIES}) diff --git a/cpp/examples/interop/interop.cpp b/cpp/examples/interop/interop.cpp deleted file mode 100644 index b01b04489a6..00000000000 --- a/cpp/examples/interop/interop.cpp +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include - -#include -#include - -// Helper functions to create StringViews -inline arrow::StringViewType::c_type to_inline_string_view(const void* data, int32_t const& size) -{ - arrow::StringViewType::c_type out; - out.inlined = {size, {}}; - memcpy(&out.inlined.data, data, size); - return out; -} -inline arrow::StringViewType::c_type to_inline_string_view(std::string_view const& v) -{ - return to_inline_string_view(v.data(), static_cast(v.size())); -} -inline arrow::StringViewType::c_type to_string_view(const void* data, - int32_t const& size, - int32_t const& buffer_index, - int32_t const& offset) -{ - if (size <= arrow::StringViewType::kInlineSize) { return to_inline_string_view(data, size); } - arrow::StringViewType::c_type out; - out.ref = {size, {}, buffer_index, offset}; - memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix)); - return out; -} -inline arrow::StringViewType::c_type to_string_view(std::string_view const& v, - int32_t const& buffer_index, - int32_t const& offset) -{ - return to_string_view(v.data(), static_cast(v.size()), buffer_index, offset); -} - -/** - * @brief Create a StringViewArray - * - * @param data_buffers The data buffers - * @param views The string views - * @param validate Whether to validate the array - */ -arrow::Result> make_string_view_array( - arrow::BufferVector const& data_buffers, - std::vector const& views, - bool validate = true) -{ - auto const length = static_cast(views.size()); - auto const arr = std::make_shared( - arrow::utf8_view(), length, arrow::Buffer::FromVector(views), std::move(data_buffers)); - if (validate) { RETURN_NOT_OK(arr->ValidateFull()); } - return arr; -} - -/** - * @brief Convert a vector of strings into a vector of the - * constituent chars and a vector of offsets. 
- * - * @param strings The vector of strings - */ -auto make_chars_and_offsets(std::vector const& strings) -{ - std::vector chars{}; - std::vector offsets(1, 0); - for (auto& str : strings) { - chars.insert(chars.end(), std::cbegin(str), std::cend(str)); - auto const last_offset = static_cast(offsets.back()); - auto const next_offset = last_offset + str.length(); - CUDF_EXPECTS( - next_offset < static_cast(std::numeric_limits::max()), - "Cannot use arrow_string_view_to_cudf_column to build a large strings column"); - offsets.push_back(static_cast(next_offset)); - } - return std::make_tuple(std::move(chars), std::move(offsets)); -}; - -/** - * @brief Convert an Arrow StringViewArray to a cudf::column - * - * @param array The Arrow StringViewArray - * @param stream The CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - */ -std::unique_ptr arrow_string_view_to_cudf_column( - std::shared_ptr const& array, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) -{ - // Convert the string views into chars and offsets - std::vector strings; - for (auto i = 0; i < array->length(); i++) { - strings.push_back(array->GetString(i)); - } - auto const [chars, offsets] = make_chars_and_offsets(strings); - - // Copy the chars vector to the device - rmm::device_uvector d_chars(chars.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_chars.data(), chars.data(), chars.size() * sizeof(char), cudaMemcpyDefault, stream.value())); - - // Copy the offsets vector to the device - // and wrap it in a cudf::column - rmm::device_uvector d_offsets(offsets.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_offsets.data(), - offsets.data(), - offsets.size() * sizeof(cudf::size_type), - cudaMemcpyDefault, - stream.value())); - auto offsets_col = - std::make_unique(std::move(d_offsets), rmm::device_buffer{0, stream, mr}, 0); - - // Create a string column out of the chars and offsets - return cudf::make_strings_column(array->length(), - std::move(offsets_col), - d_chars.release(), - 0, - rmm::device_buffer{0, stream, mr}); -} - -int main(int argc, char** argv) -{ - std::vector> data_buffers; - std::vector views; - - // Define the data buffers and string views - auto const buffer_a = - arrow::Buffer::FromString("hello rapids teamapache arrow interopnvidiacudf"); - data_buffers.push_back(buffer_a); - views.push_back(to_string_view("hello rapid steam", 0, 0)); - views.push_back(to_string_view("apache arrow interop", 0, 17)); - views.push_back(to_inline_string_view("nvidia")); - views.push_back(to_inline_string_view("cudf")); - - // Create a StringViewArray - auto const string_view_col = make_string_view_array(data_buffers, views, true).ValueOrDie(); - std::cout << string_view_col->ToString() << std::endl; - - // Convert the StringViewArray to a cudf::column - auto const cudf_col = arrow_string_view_to_cudf_column(string_view_col); - - // Write the cudf::column as CSV - auto const tbl_view = cudf::table_view({cudf_col->view()}); - std::vector const names = {"col_a"}; - - std::vector h_buffer; - cudf::io::csv_writer_options writer_options = - cudf::io::csv_writer_options::builder(cudf::io::sink_info(&h_buffer), tbl_view) - .include_header(not names.empty()) - .names(names); - - cudf::io::write_csv(writer_options); - auto const result = std::string(h_buffer.data(), h_buffer.size()); - std::cout << result << std::endl; - - return 0; 
-}

From 0b5a8949de6b4dcf80cee2166cba680676049b1d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Thu, 31 Jul 2025 10:48:29 -0700
Subject: [PATCH 037/366] Simplify clang dependency spec (#19546)

This is a slight simplification on top of
https://github.com/rapidsai/cudf/pull/19529/ by having clang versions
specified in only one place and splitting out include-what-you-use.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/19546
---
 dependencies.yaml | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index a03f5ac2c3f..0153304a4e2 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -10,8 +10,7 @@ files:
       - build_all
       - build_cpp
       - build_python_common
-      - clang_format
-      - clang_tidy
+      - clang
       - cuda
       - cuda_version
       - depends_on_cupy
@@ -22,6 +21,7 @@ files:
       - depends_on_rmm
       - develop
       - docs
+      - iwyu
       - notebooks
       - py_version
       - pyarrow_run
@@ -124,10 +124,11 @@ files:
     includes:
       - build_all
       - build_base
-      - clang_tidy
+      - clang
       - cuda
       - cuda_version
       - develop
+      - iwyu
       - py_version
   docs:
     output: none
@@ -558,18 +559,16 @@ dependencies:
       - output_types: conda
         packages:
           - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version.
-  clang_format:
+  clang:
     common:
       - output_types: conda
         packages:
          - clang==20.1.4
          - clang-tools==20.1.4
-  clang_tidy:
+  iwyu:
     common:
       - output_types: conda
         packages:
-          - clang==20.1.4
-          - clang-tools==20.1.4
          - include-what-you-use==0.24.0
   docs:
     common:

From de8b114f81cca754e8d188ccf9ff6f8543af553a Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 31 Jul 2025 14:46:22 -0700
Subject: [PATCH 038/366] Separate row mask and page mask computation and
 usage (#19537)

Contributes to #19526

This PR separates the computation of the row mask and the data page mask for
filter columns, providing more control to `materialize_filter_columns()`.
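A condensed sketch of the decoupled flow, adapted from the updated documentation in this PR; the reader, options, column chunk buffers, stream, and memory resource are assumed to be set up as in the existing examples:

```cpp
#include <cudf/io/experimental/hybrid_scan.hpp>
#include <cudf/io/parquet.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/resource_ref.hpp>

#include <utility>
#include <vector>

namespace hs = cudf::io::parquet::experimental;

// First reader pass: the row mask is now computed separately, and the data
// page mask is derived from it inside materialize_filter_columns() only when
// requested via use_data_page_mask.
cudf::io::table_with_metadata first_pass(hs::hybrid_scan_reader const& reader,
                                         cudf::io::parquet_reader_options const& options,
                                         std::vector<rmm::device_buffer> chunk_buffers,
                                         rmm::cuda_stream_view stream,
                                         rmm::device_async_resource_ref mr)
{
  auto const row_groups = reader.all_row_groups(options);

  // Step 1: build the row mask on its own (here from page index statistics).
  auto row_mask =
    reader.build_row_mask_with_page_index_stats(row_groups, options, stream, mr);

  // Step 2: materialize filter columns, opting in to data page masking.
  return reader.materialize_filter_columns(row_groups,
                                           std::move(chunk_buffers),
                                           row_mask->mutable_view(),
                                           hs::use_data_page_mask::YES,
                                           options,
                                           stream);
}
```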
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19537 --- .../cudf/io/experimental/hybrid_scan.hpp | 106 +++++++------ .../io/parquet/experimental/hybrid_scan.cpp | 27 ++-- .../experimental/hybrid_scan_helpers.hpp | 7 +- .../parquet/experimental/hybrid_scan_impl.cpp | 44 +++--- .../parquet/experimental/hybrid_scan_impl.hpp | 15 +- .../parquet/experimental/page_index_filter.cu | 2 +- .../experimental/hybrid_scan_filters_test.cpp | 140 ++++++++---------- .../io/experimental/hybrid_scan_test.cpp | 23 ++- 8 files changed, 184 insertions(+), 180 deletions(-) diff --git a/cpp/include/cudf/io/experimental/hybrid_scan.hpp b/cpp/include/cudf/io/experimental/hybrid_scan.hpp index f7f31874d5e..c6a8038d3eb 100644 --- a/cpp/include/cudf/io/experimental/hybrid_scan.hpp +++ b/cpp/include/cudf/io/experimental/hybrid_scan.hpp @@ -50,6 +50,15 @@ namespace io::parquet::experimental { * @file */ +/** + * @brief Whether to compute and use a page mask using the row mask to skip decompression and + * decoding of the masked pages + */ +enum class use_data_page_mask : bool { + YES = true, ///< Compute and use a data page mask + NO = false ///< Do not compute or use a data page mask +}; + /** * @brief The experimental parquet reader class to optimally read parquet files subject to * highly selective filters, called a Hybrid Scan operation @@ -171,15 +180,15 @@ namespace io::parquet::experimental { * } * @endcode * - * Filter column page pruning (OPTIONAL): Once the row groups are filtered, the next step is to - * optionally prune the data pages within the current span of row groups subject to the same filter - * expression using page statistics contained in the page index of the parquet file. To get started, - * first set up the page index using the `setup_page_index()` function if not previously done and - * then filter the data pages using the `filter_data_pages_with_stats()` function. This function - * returns a row mask. i.e. BOOL8 column indicating which rows may survive in the materialized table - * of filter columns (first reader pass), and a data page mask. i.e. a vector of boolean host - * vectors indicating which data pages for each filter column need to be processed to materialize - * the table filter columns (first reader pass). + * Build an initial row mask: Once the row groups are filtered, the next step is to build an + * initial row mask column to indicate which rows in the current span of row groups will survive in + * the read table. This initial row mask may be a BOOL8 cudf column of size equal to the + * total number of rows in the current span of row groups (computed by `total_rows_in_row_groups()`) + * containing all `true` values. Alternatively, the row mask may be built with + * the `build_row_mask_with_page_index_stats()` function and contain a `true` value for only the + * rows that survive the page-level statistics from the page index subject to the same filter as row + * groups. Note that this step requires the page index to be set up using the `setup_page_index()` + * function. 
* @code{.cpp} * // If not already done, get the page index byte range * auto page_index_byte_range = reader->page_index_byte_range(); @@ -190,24 +199,28 @@ namespace io::parquet::experimental { * // If not already done, Set up the page index now * reader->setup_page_index(page_index_bytes); * - * // Optional: Prune filter column data pages using statistics in page index - * auto [row_mask, data_page_mask] = - * reader->filter_data_pages_with_stats(current_row_group_indices, options, stream, mr); + * // Build a row mask column containing all `true` values + * auto const num_rows = reader->total_rows_in_row_groups(current_row_group_indices); + * auto row_mask = cudf::make_numeric_column( + * cudf::data_type{cudf::type_id::BOOL8}, num_rows, rmm::device_buffer{}, 0, stream, mr); + * + * // Alternatively, build a row mask column indicating only the rows that survive the page-level + * statistics in the page index + * row_mask = reader->build_row_mask_with_page_index_stats(current_row_group_indices, options, + * stream, mr); * @endcode * - * Materialize filter columns: Once we are finished with pruning row groups and filter column data - * pages, the next step is to materialize filter columns into a table (first reader pass). This is + * Materialize filter columns: Once we are done with pruning row groups and constructing the row + * mask, the next step is to materialize filter columns into a table (first reader pass). This is * done using the `materialize_filter_columns()` function. This function requires a vector of device - * buffers containing column chunk data for the current list of row groups, and the data page and - * row masks obtained from the page pruning step. The function returns a table of materialized - * filter columns and also updates the row mask column to only the valid rows that satisfy the - * filter expression. If no row group pruning is needed, pass a span of all row group indices from - * `all_row_groups()` function as the current list of row groups. Similarly, if no page pruning is - * desired, pass an empty span as data page mask and a mutable view of a BOOL8 column of size equal - * to total number of rows in the current row groups list (computed by `total_rows_in_row_groups()`) - * containing all `true` values as row mask. Further, the byte ranges for the required column chunk - * data may be obtained using the `filter_column_chunks_byte_ranges()` function and read into a - * corresponding vector of vectors of device buffers. + * buffers containing column chunk data for the current list of row groups, and a mutable view of + * the current row mask. The function optionally builds a mask for the current data pages using the + * input row mask to skip decompression and decoding of the pruned pages based on the + * `mask_data_pages` argument. The filter columns are then read into a table and filtered based on + * the filter expression and the row mask is updated to only indicate the rows that survive in the + * read table. The final table is returned. The byte ranges for the required column chunk data may + * be obtained using the `filter_column_chunks_byte_ranges()` function and read into a corresponding + * vector of vectors of device buffers. 
* @code{.cpp} * // Get byte ranges of column chunk byte ranges from the reader * auto const filter_column_chunk_byte_ranges = @@ -219,24 +232,21 @@ namespace io::parquet::experimental { * * // Materialize the table with only the filter columns * auto [filter_table, filter_metadata] = - * reader->materialize_filter_columns(data_page_mask, - * current_row_group_indices, + * reader->materialize_filter_columns(current_row_group_indices, * std::move(filter_column_chunk_buffers), * row_mask->mutable_view(), + * use_data_page_mask::YES/NO, * options, * stream); * @endcode * * Materialize payload columns: Once the filter columns are materialized, the final step is to * materialize the payload columns into another table (second reader pass). This is done using the - * `materialize_payload_columns()` function. This function requires a vector of device buffers - * containing column chunk data for the current list of row groups, and the updated row mask from - * the `materialize_filter_columns()`. The function uses the row mask - may be a BOOL8 column of - * size equal to total number of rows in the current row groups list containing all `true` values if - * no pruning is desired - to internally prune payload column data pages and mask the materialized - * payload columns to the desired rows. Similar to the first reader pass, the byte ranges for the - * required column chunk data may be obtained using the `payload_column_chunks_byte_ranges()` - * function and read into a corresponding vector of vectors of device buffers. + * `materialize_payload_columns()` function which is identical to the `materialize_filter_columns()` + * in terms of functionality except that it accepts an immutable view of the row mask and uses it to + * filter the read output table before returning it. The byte ranges for the required column chunk + * data may be obtained using the `payload_column_chunks_byte_ranges()` function and read into a + * corresponding vector of vectors of device buffers. * @code{.cpp} * // Get column chunk byte ranges from the reader * auto const payload_column_chunk_byte_ranges = @@ -251,6 +261,7 @@ namespace io::parquet::experimental { * reader->materialize_payload_columns(current_row_group_indices, * std::move(payload_column_chunk_buffers), * row_mask->view(), + * use_data_page_mask::YES/NO, * options, * stream); * @endcode @@ -258,7 +269,7 @@ namespace io::parquet::experimental { * Once both reader passes are complete, the filter and payload column tables may be trivially * combined by releasing the columns from both tables and moving them into a new cudf table. * - * @note The performance advantage of this reader is most pronounced when the filter expression + * @note The performance advantage of this reader is most prominent when the filter expression * is highly selective, i.e. when the data in filter columns are at least partially ordered and the * number of rows that survive the filter is small compared to the total number of rows in the * parquet file. Otherwise, the performance is identical to the `cudf::io::read_parquet()` function. 
@@ -390,21 +401,21 @@ class hybrid_scan_reader { rmm::cuda_stream_view stream) const; /** - * @brief Filter data pages of filter columns using page statistics from page index metadata + * @brief Builds a boolean column indicating which rows survive the page statistics in the page + * index * * @param row_group_indices Input row groups indices * @param options Parquet reader options * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return A pair of boolean column indicating rows corresponding to data pages after - * page-pruning, and a list of boolean vectors indicating which data pages are not pruned, - * one per filter column. + * @return A boolean column indicating which filter column rows survive the statistics in the page + * index */ - [[nodiscard]] std::pair, std::vector>> - filter_data_pages_with_stats(cudf::host_span row_group_indices, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const; + [[nodiscard]] std::unique_ptr build_row_mask_with_page_index_stats( + cudf::host_span row_group_indices, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; /** * @brief Get byte ranges of column chunks of filter columns @@ -421,20 +432,19 @@ class hybrid_scan_reader { * @brief Materializes filter columns and updates the input row mask to only the rows * that exist in the output table * - * @param page_mask Boolean vectors indicating which data pages are not pruned, one per filter - * column. All data pages considered not pruned if empty * @param row_group_indices Input row groups indices * @param column_chunk_buffers Device buffers containing column chunk data of filter columns * @param[in,out] row_mask Mutable boolean column indicating surviving rows from page pruning + * @param mask_data_pages Whether to build and use a data page mask using the row mask * @param options Parquet reader options * @param stream CUDA stream used for device memory operations and kernel launches * @return Table of materialized filter columns and metadata */ [[nodiscard]] table_with_metadata materialize_filter_columns( - cudf::host_span const> page_mask, cudf::host_span row_group_indices, std::vector column_chunk_buffers, cudf::mutable_column_view row_mask, + use_data_page_mask mask_data_pages, parquet_reader_options const& options, rmm::cuda_stream_view stream) const; @@ -455,6 +465,7 @@ class hybrid_scan_reader { * @param row_group_indices Input row groups indices * @param column_chunk_buffers Device buffers containing column chunk data of payload columns * @param row_mask Boolean column indicating which rows need to be read. 
All rows read if empty + * @param mask_data_pages Whether to build and use a data page mask using the row mask * @param options Parquet reader options * @param stream CUDA stream used for device memory operations and kernel launches * @return Table of materialized payload columns and metadata @@ -463,6 +474,7 @@ class hybrid_scan_reader { cudf::host_span row_group_indices, std::vector column_chunk_buffers, cudf::column_view row_mask, + use_data_page_mask mask_data_pages, parquet_reader_options const& options, rmm::cuda_stream_view stream) const; diff --git a/cpp/src/io/parquet/experimental/hybrid_scan.cpp b/cpp/src/io/parquet/experimental/hybrid_scan.cpp index fc8ef765a50..f14ff508561 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan.cpp @@ -123,17 +123,17 @@ std::vector hybrid_scan_reader::filter_row_groups_with_bloom_fi .front(); } -std::pair, std::vector>> -hybrid_scan_reader::filter_data_pages_with_stats(cudf::host_span row_group_indices, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const +std::unique_ptr hybrid_scan_reader::build_row_mask_with_page_index_stats( + cudf::host_span row_group_indices, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { // Temporary vector with row group indices from the first source auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; - return _impl->filter_data_pages_with_stats(input_row_group_indices, options, stream, mr); + return _impl->build_row_mask_with_page_index_stats(input_row_group_indices, options, stream, mr); } [[nodiscard]] std::vector @@ -148,10 +148,10 @@ hybrid_scan_reader::filter_column_chunks_byte_ranges( } table_with_metadata hybrid_scan_reader::materialize_filter_columns( - cudf::host_span const> data_page_mask, cudf::host_span row_group_indices, std::vector column_chunk_buffers, cudf::mutable_column_view row_mask, + use_data_page_mask mask_data_pages, parquet_reader_options const& options, rmm::cuda_stream_view stream) const { @@ -159,10 +159,10 @@ table_with_metadata hybrid_scan_reader::materialize_filter_columns( auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; - return _impl->materialize_filter_columns(data_page_mask, - input_row_group_indices, + return _impl->materialize_filter_columns(input_row_group_indices, std::move(column_chunk_buffers), row_mask, + mask_data_pages, options, stream); } @@ -181,6 +181,7 @@ table_with_metadata hybrid_scan_reader::materialize_payload_columns( cudf::host_span row_group_indices, std::vector column_chunk_buffers, cudf::column_view row_mask, + use_data_page_mask mask_data_pages, parquet_reader_options const& options, rmm::cuda_stream_view stream) const { @@ -188,8 +189,12 @@ table_with_metadata hybrid_scan_reader::materialize_payload_columns( auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; - return _impl->materialize_payload_columns( - input_row_group_indices, std::move(column_chunk_buffers), row_mask, options, stream); + return _impl->materialize_payload_columns(input_row_group_indices, + std::move(column_chunk_buffers), + row_mask, + mask_data_pages, + options, + stream); } } // namespace cudf::io::parquet::experimental diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp 
index 4372da85451..a60ad2a79ba 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -237,19 +237,20 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { rmm::cuda_stream_view stream) const; /** - * @brief Filter data pages using statistics page-level statistics based on predicate filter + * @brief Builds a row mask based on the data pages that survive page-level statistics based on + * predicate filter * * @param row_group_indices Input row groups indices * @param output_dtypes Datatypes of output columns * @param output_column_schemas schema indices of output columns - * @param filter AST expression to filter data pages based on `PageIndex` statistics + * @param filter AST expression to filter data pages based on page index statistics * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return A boolean column representing a mask of rows surviving the predicate filter at * page-level */ - [[nodiscard]] std::unique_ptr filter_data_pages_with_stats( + [[nodiscard]] std::unique_ptr build_row_mask_with_page_index_stats( cudf::host_span const> row_group_indices, cudf::host_span output_dtypes, cudf::host_span output_column_schemas, diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 48aa1d6c40c..354eb2adde1 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -322,8 +322,7 @@ std::vector> hybrid_scan_reader_impl::filter_row_groups_w stream); } -std::pair, std::vector>> -hybrid_scan_reader_impl::filter_data_pages_with_stats( +std::unique_ptr hybrid_scan_reader_impl::build_row_mask_with_page_index_stats( cudf::host_span const> row_group_indices, parquet_reader_options const& options, rmm::cuda_stream_view stream, @@ -343,18 +342,13 @@ hybrid_scan_reader_impl::filter_data_pages_with_stats( "Columns names in filter expression must be convertible to index references"); auto output_dtypes = get_output_types(_output_buffers_template); - auto row_mask = - _extended_metadata->filter_data_pages_with_stats(row_group_indices, - output_dtypes, - _output_column_schemas, - expr_conv.get_converted_expr().value(), - stream, - mr); - - auto data_page_mask = _extended_metadata->compute_data_page_mask( - row_mask->view(), row_group_indices, output_dtypes, _output_column_schemas, stream); - - return {std::move(row_mask), std::move(data_page_mask)}; + return _extended_metadata->build_row_mask_with_page_index_stats( + row_group_indices, + output_dtypes, + _output_column_schemas, + expr_conv.get_converted_expr().value(), + stream, + mr); } std::pair, std::vector> @@ -434,10 +428,10 @@ hybrid_scan_reader_impl::payload_column_chunks_byte_ranges( } table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( - cudf::host_span const> data_page_mask, cudf::host_span const> row_group_indices, std::vector column_chunk_buffers, cudf::mutable_column_view row_mask, + use_data_page_mask mask_data_pages, parquet_reader_options const& options, rmm::cuda_stream_view stream) { @@ -458,11 +452,13 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( select_columns(read_columns_mode::FILTER_COLUMNS, options); - // If the data page mask is empty, fill the row mask with all true values - if (data_page_mask.empty()) { - auto const 
value = cudf::numeric_scalar(true, true, stream); - cudf::fill_in_place(row_mask, 0, row_mask.size(), value, stream); - } + auto output_dtypes = get_output_types(_output_buffers_template); + + auto data_page_mask = + (mask_data_pages == use_data_page_mask::YES) + ? _extended_metadata->compute_data_page_mask( + row_mask, row_group_indices, output_dtypes, _output_column_schemas, stream) + : std::vector>{}; prepare_data(row_group_indices, std::move(column_chunk_buffers), data_page_mask, options); @@ -473,6 +469,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( cudf::host_span const> row_group_indices, std::vector column_chunk_buffers, cudf::column_view row_mask, + use_data_page_mask mask_data_pages, parquet_reader_options const& options, rmm::cuda_stream_view stream) { @@ -490,8 +487,11 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( auto output_dtypes = get_output_types(_output_buffers_template); - auto data_page_mask = _extended_metadata->compute_data_page_mask( - row_mask, row_group_indices, output_dtypes, _output_column_schemas, stream); + auto data_page_mask = + (mask_data_pages == use_data_page_mask::YES) + ? _extended_metadata->compute_data_page_mask( + row_mask, row_group_indices, output_dtypes, _output_column_schemas, stream) + : std::vector>{}; prepare_data(row_group_indices, std::move(column_chunk_buffers), data_page_mask, options); diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index b2c0d40a7d1..feca87aeef4 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -120,13 +120,13 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { rmm::cuda_stream_view stream); /** - * @copydoc cudf::io::experimental::hybrid_scan::filter_data_pages_with_stats + * @copydoc cudf::io::experimental::hybrid_scan::build_row_mask_with_page_index_stats */ - [[nodiscard]] std::pair, std::vector>> - filter_data_pages_with_stats(cudf::host_span const> row_group_indices, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + [[nodiscard]] std::unique_ptr build_row_mask_with_page_index_stats( + cudf::host_span const> row_group_indices, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Fetches byte ranges of column chunks of filter columns @@ -144,10 +144,10 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * @copydoc cudf::io::experimental::hybrid_scan::materialize_filter_columns */ [[nodiscard]] table_with_metadata materialize_filter_columns( - cudf::host_span const> data_page_pask, cudf::host_span const> row_group_indices, std::vector column_chunk_buffers, cudf::mutable_column_view row_mask, + use_data_page_mask mask_data_pages, parquet_reader_options const& options, rmm::cuda_stream_view stream); @@ -170,6 +170,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { cudf::host_span const> row_group_indices, std::vector column_chunk_buffers, cudf::column_view row_mask, + use_data_page_mask mask_data_pages, parquet_reader_options const& options, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 42a27e3af48..91311322c8d 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ 
b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -603,7 +603,7 @@ struct is_row_required_fn { } // namespace -std::unique_ptr aggregate_reader_metadata::filter_data_pages_with_stats( +std::unique_ptr aggregate_reader_metadata::build_row_mask_with_page_index_stats( cudf::host_span const> row_group_indices, cudf::host_span output_dtypes, cudf::host_span output_column_schemas, diff --git a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp index 1870adc8dea..6f47a0e2351 100644 --- a/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp +++ b/cpp/tests/io/experimental/hybrid_scan_filters_test.cpp @@ -248,63 +248,55 @@ TYPED_TEST(PageFilteringWithPageIndexStats, FilterPagesWithPageIndexStats) reinterpret_cast(file_buffer.data()), file_buffer.size()); // Helper function to test data page filteration using page index stats - auto const test_filter_data_pages_with_stats = - [&](cudf::ast::operation const& filter_expression, - cudf::size_type const num_filter_columns, - cudf::size_type const expected_num_pages_after_page_index_filter, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { - // Create reader options with empty source info - cudf::io::parquet_reader_options options = - cudf::io::parquet_reader_options::builder().filter(filter_expression); - - // Fetch footer and page index bytes from the buffer. - auto const footer_buffer = fetch_footer_bytes(file_buffer_span); - - // Create hybrid scan reader with footer bytes - auto const reader = std::make_unique( - footer_buffer, options); - - // Get all row groups from the reader - auto input_row_group_indices = reader->all_row_groups(options); - - // Span to track current row group indices - auto current_row_group_indices = cudf::host_span(input_row_group_indices); - - // Calling `filter_data_pages_with_stats` before setting up the page index should raise an - // error - EXPECT_THROW(std::ignore = reader->filter_data_pages_with_stats( - current_row_group_indices, options, stream, mr), - std::runtime_error); - - // Set up the page index - auto const page_index_byte_range = reader->page_index_byte_range(); - auto const page_index_buffer = - fetch_page_index_bytes(file_buffer_span, page_index_byte_range); - reader->setup_page_index(page_index_buffer); - - // Filter the data pages with page index stats - auto const [row_mask, data_page_mask] = - reader->filter_data_pages_with_stats(current_row_group_indices, options, stream, mr); - EXPECT_EQ(data_page_mask.size(), num_filter_columns); - - auto const expected_num_rows = reader->total_rows_in_row_groups(current_row_group_indices); - EXPECT_EQ(row_mask->type().id(), cudf::type_id::BOOL8); - EXPECT_EQ(row_mask->size(), expected_num_rows); - EXPECT_EQ(row_mask->null_count(), 0); - - // Half the pages should survive the page index filter - - // Count the number of pages that survive the page index filter - auto const num_pages_after_page_index_filter = - std::accumulate(data_page_mask.begin(), - data_page_mask.end(), - cudf::size_type{0}, - [](auto sum, auto const& page_mask) { - return sum + std::count(page_mask.cbegin(), page_mask.cend(), true); - }); - EXPECT_EQ(num_pages_after_page_index_filter, expected_num_pages_after_page_index_filter); - }; + auto const test_filter_data_pages_with_stats = [&](cudf::ast::operation const& filter_expression, + cudf::size_type const expected_surviving_rows, + rmm::cuda_stream_view stream = + 
cudf::get_default_stream(),
+                                                     rmm::device_async_resource_ref mr =
+                                                       cudf::get_current_device_resource_ref()) {
+    // Create reader options with empty source info
+    cudf::io::parquet_reader_options options =
+      cudf::io::parquet_reader_options::builder().filter(filter_expression);
+
+    // Fetch footer and page index bytes from the buffer.
+    auto const footer_buffer = fetch_footer_bytes(file_buffer_span);
+
+    // Create hybrid scan reader with footer bytes
+    auto const reader =
+      std::make_unique(footer_buffer, options);
+
+    // Get all row groups from the reader
+    auto input_row_group_indices = reader->all_row_groups(options);
+
+    // Span to track current row group indices
+    auto current_row_group_indices = cudf::host_span(input_row_group_indices);
+
+    // Calling `build_row_mask_with_page_index_stats` before setting up the page index should
+    // raise an error
+    EXPECT_THROW(std::ignore = reader->build_row_mask_with_page_index_stats(
+                   current_row_group_indices, options, stream, mr),
+                 std::runtime_error);
+
+    // Set up the page index
+    auto const page_index_byte_range = reader->page_index_byte_range();
+    auto const page_index_buffer = fetch_page_index_bytes(file_buffer_span, page_index_byte_range);
+    reader->setup_page_index(page_index_buffer);
+
+    // Build the row mask with page index stats
+    auto const row_mask =
+      reader->build_row_mask_with_page_index_stats(current_row_group_indices, options, stream, mr);
+
+    auto const expected_num_rows = reader->total_rows_in_row_groups(current_row_group_indices);
+    EXPECT_EQ(row_mask->type().id(), cudf::type_id::BOOL8);
+    EXPECT_EQ(row_mask->size(), expected_num_rows);
+    EXPECT_EQ(row_mask->null_count(), 0);
+
+    // Copy the row mask to the host and count the number of surviving rows
+    auto const host_row_mask = cudf::detail::make_host_vector(
+      {row_mask->view().data(), static_cast(row_mask->view().size())}, stream);
+    EXPECT_EQ(std::count(host_row_mask.begin(), host_row_mask.end(), true),
+              expected_surviving_rows);
+  };
 
   // Filtering AST - table[0] < 100
   {
@@ -312,12 +304,11 @@ TYPED_TEST(PageFilteringWithPageIndexStats, FilterPagesWithPageIndexStats)
     auto const literal = cudf::ast::literal(literal_value);
     auto const col_ref = cudf::ast::column_name_reference("col0");
     auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref, literal);
-    auto constexpr num_filter_columns = 1;
-    // Half the pages should be filtered out by the page index filter
-    auto constexpr expected_num_pages_after_page_index_filter =
-      num_concat * (num_ordered_rows / page_size_for_ordered_tests) / 2;
-    test_filter_data_pages_with_stats(
-      filter_expression, num_filter_columns, expected_num_pages_after_page_index_filter);
+    // Half the pages (unsigned) or 3/4th the pages (signed) should be filtered out by the page
+    // index filter
+    auto constexpr expected_surviving_rows =
+      (num_concat * num_ordered_rows) / (std::is_signed_v ? 
4 : 2); + test_filter_data_pages_with_stats(filter_expression, expected_surviving_rows); } // Filtering AST - table[2] >= 10000 @@ -327,12 +318,11 @@ TYPED_TEST(PageFilteringWithPageIndexStats, FilterPagesWithPageIndexStats) auto col_ref = cudf::ast::column_name_reference("col2"); auto filter_expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, col_ref, literal); - auto constexpr num_filter_columns = 1; - // Half the pages should be filtered out by the page index filter - auto constexpr expected_num_pages_after_page_index_filter = - num_concat * (num_ordered_rows / page_size_for_ordered_tests) / 2; - test_filter_data_pages_with_stats( - filter_expression, num_filter_columns, expected_num_pages_after_page_index_filter); + // Half the pages (unsigned) or 3/4th the pages (signed) should be filtered out by the page + // index filter + auto constexpr expected_surviving_rows = + (num_concat * num_ordered_rows) / (std::is_signed_v ? 4 : 2); + test_filter_data_pages_with_stats(filter_expression, expected_surviving_rows); } // Filtering AST - table[0] < 50 AND table[2] < "000010000" @@ -351,11 +341,9 @@ TYPED_TEST(PageFilteringWithPageIndexStats, FilterPagesWithPageIndexStats) auto filter_expression = cudf::ast::operation( cudf::ast::ast_operator::LOGICAL_AND, filter_expression1, filter_expression2); - auto constexpr num_filter_columns = 2; // Only one page per num_concat per filter column should survive - auto constexpr expected_num_pages_after_page_index_filter = 1 * num_concat * num_filter_columns; - test_filter_data_pages_with_stats( - filter_expression, num_filter_columns, expected_num_pages_after_page_index_filter); + auto constexpr expected_surviving_rows = num_concat * page_size_for_ordered_tests; + test_filter_data_pages_with_stats(filter_expression, expected_surviving_rows); } // Filtering AST - table[0] > 150 OR table[2] < "000005000" @@ -374,12 +362,10 @@ TYPED_TEST(PageFilteringWithPageIndexStats, FilterPagesWithPageIndexStats) auto filter_expression = cudf::ast::operation( cudf::ast::ast_operator::LOGICAL_OR, filter_expression1, filter_expression2); - auto constexpr num_filter_columns = 2; // Two pages (3rd and 0th from respective conditions) per num_concat per filter column should // survive - auto constexpr expected_num_pages_after_page_index_filter = 2 * num_concat * num_filter_columns; - test_filter_data_pages_with_stats( - filter_expression, num_filter_columns, expected_num_pages_after_page_index_filter); + auto constexpr expected_surviving_rows = 2 * num_concat * page_size_for_ordered_tests; + test_filter_data_pages_with_stats(filter_expression, expected_surviving_rows); } } diff --git a/cpp/tests/io/experimental/hybrid_scan_test.cpp b/cpp/tests/io/experimental/hybrid_scan_test.cpp index 316e55a02eb..6ec572d1e49 100644 --- a/cpp/tests/io/experimental/hybrid_scan_test.cpp +++ b/cpp/tests/io/experimental/hybrid_scan_test.cpp @@ -141,10 +141,8 @@ auto hybrid_scan(std::vector& buffer, } // Filter data pages with page index stats - auto [row_mask, data_page_mask] = - reader->filter_data_pages_with_stats(current_row_group_indices, options, stream, mr); - - EXPECT_EQ(data_page_mask.size(), num_filter_columns); + auto row_mask = + reader->build_row_mask_with_page_index_stats(current_row_group_indices, options, stream, mr); // Get column chunk byte ranges from the reader auto const filter_column_chunk_byte_ranges = @@ -156,10 +154,10 @@ auto hybrid_scan(std::vector& buffer, // Materialize the table with only the filter columns auto [filter_table, filter_metadata] = - 
reader->materialize_filter_columns(data_page_mask, - current_row_group_indices, + reader->materialize_filter_columns(current_row_group_indices, std::move(filter_column_chunk_buffers), row_mask->mutable_view(), + cudf::io::parquet::experimental::use_data_page_mask::YES, options, stream); @@ -168,14 +166,15 @@ auto hybrid_scan(std::vector& buffer, reader->payload_column_chunks_byte_ranges(current_row_group_indices, options); // Fetch column chunk device buffers from the input buffer - [[maybe_unused]] auto payload_column_chunk_buffers = + auto payload_column_chunk_buffers = fetch_byte_ranges(file_buffer_span, payload_column_chunk_byte_ranges, stream, mr); // Materialize the table with only the payload columns - [[maybe_unused]] auto [payload_table, payload_metadata] = + auto [payload_table, payload_metadata] = reader->materialize_payload_columns(current_row_group_indices, std::move(payload_column_chunk_buffers), row_mask->view(), + cudf::io::parquet::experimental::use_data_page_mask::YES, options, stream); @@ -228,7 +227,7 @@ TEST_F(HybridScanTest, PruneRowGroupsOnlyAndScanAllColumns) cudf::io::parquet_reader_options::builder( cudf::io::source_info(cudf::host_span(parquet_buffer.data(), parquet_buffer.size()))) .filter(filter_expression); - auto [expected_tbl, expected_meta] = cudf::io::read_parquet(options, stream); + auto [expected_tbl, expected_meta] = read_parquet(options, stream); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({0}), read_filter_table->view()); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({1, 2}), read_payload_table->view()); } @@ -275,7 +274,7 @@ TEST_F(HybridScanTest, PruneRowGroupsOnlyAndScanSelectColumns) cudf::io::parquet_reader_options::builder( cudf::io::source_info(cudf::host_span(parquet_buffer.data(), parquet_buffer.size()))) .filter(filter_expression); - auto [expected_tbl, expected_meta] = cudf::io::read_parquet(options, stream); + auto [expected_tbl, expected_meta] = read_parquet(options, stream); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({0}), read_filter_table->view()); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({2}), read_payload_table->view()); } @@ -298,7 +297,7 @@ TEST_F(HybridScanTest, PruneRowGroupsOnlyAndScanSelectColumns) cudf::io::parquet_reader_options::builder( cudf::io::source_info(cudf::host_span(parquet_buffer.data(), parquet_buffer.size()))) .filter(filter_expression); - auto [expected_tbl, expected_meta] = cudf::io::read_parquet(options, stream); + auto [expected_tbl, expected_meta] = read_parquet(options, stream); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({0}), read_filter_table->view()); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({2, 1}), read_payload_table->view()); } @@ -341,7 +340,7 @@ TEST_F(HybridScanTest, PruneDataPagesOnlyAndScanAllColumns) cudf::io::parquet_reader_options::builder( cudf::io::source_info(cudf::host_span(buffer.data(), buffer.size()))) .filter(filter_expression); - auto [expected_tbl, expected_meta] = cudf::io::read_parquet(options, stream); + auto [expected_tbl, expected_meta] = read_parquet(options, stream); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({0}), read_filter_table->view()); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl->select({1, 2}), read_payload_table->view()); } From f16d1d007bb3cb3d423d4f7f43fadf0b71261bc8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 31 Jul 2025 18:44:49 -0700 Subject: [PATCH 039/366] Add support for streams to all copying APIs. 
(#19553) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19553 --- python/pylibcudf/pylibcudf/column.pyx | 3 +- python/pylibcudf/pylibcudf/copying.pxd | 31 +++-- python/pylibcudf/pylibcudf/copying.pyi | 48 +++++-- python/pylibcudf/pylibcudf/copying.pyx | 124 +++++++++++++----- .../pylibcudf/pylibcudf/libcudf/copying.pxd | 56 +++++--- .../libcudf/utilities/default_stream.pxd | 3 + 6 files changed, 196 insertions(+), 69 deletions(-) diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index eb0ed33f1e8..40ca7b0e4c3 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -32,6 +32,7 @@ from pylibcudf.libcudf.strings.strings_column_view cimport strings_column_view from pylibcudf.libcudf.types cimport size_type, size_of as cpp_size_of, bitmask_type from pylibcudf.libcudf.utilities.traits cimport is_fixed_width from pylibcudf.libcudf.copying cimport get_element +from pylibcudf.libcudf.utilities.default_stream cimport get_default_stream from rmm.pylibrmm.device_buffer cimport DeviceBuffer @@ -719,7 +720,7 @@ cdef class Column: cdef unique_ptr[scalar] result with nogil: - result = get_element(cv, 0) + result = get_element(cv, 0, get_default_stream()) return Scalar.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/copying.pxd b/python/pylibcudf/pylibcudf/copying.pxd index 7dfed437673..892769582c0 100644 --- a/python/pylibcudf/pylibcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
from libcpp cimport bool as cbool from pylibcudf.libcudf.copying cimport ( @@ -7,6 +7,8 @@ from pylibcudf.libcudf.copying cimport ( ) from pylibcudf.libcudf.types cimport size_type +from rmm.pylibrmm.stream cimport Stream + from .column cimport Column from .scalar cimport Scalar from .table cimport Table @@ -35,14 +37,19 @@ ctypedef fused RightCopyIfElseOperand: cpdef Table gather( Table source_table, Column gather_map, - out_of_bounds_policy bounds_policy + out_of_bounds_policy bounds_policy, + Stream stream=* ) -cpdef Table scatter(TableOrListOfScalars source, Column scatter_map, Table target_table) +cpdef Table scatter( + TableOrListOfScalars source, Column scatter_map, Table target_table, Stream stream=* +) cpdef ColumnOrTable empty_like(ColumnOrTable input) -cpdef Column allocate_like(Column input_column, mask_allocation_policy policy, size=*) +cpdef Column allocate_like( + Column input_column, mask_allocation_policy policy, size=*, Stream stream=* +) cpdef Column copy_range_in_place( Column input_column, @@ -50,6 +57,7 @@ cpdef Column copy_range_in_place( size_type input_begin, size_type input_end, size_type target_begin, + Stream stream=* ) cpdef Column copy_range( @@ -58,24 +66,27 @@ cpdef Column copy_range( size_type input_begin, size_type input_end, size_type target_begin, + Stream stream=* ) -cpdef Column shift(Column input, size_type offset, Scalar fill_value) +cpdef Column shift(Column input, size_type offset, Scalar fill_value, Stream stream=*) -cpdef list slice(ColumnOrTable input, list indices) +cpdef list slice(ColumnOrTable input, list indices, Stream stream=*) -cpdef list split(ColumnOrTable input, list splits) +cpdef list split(ColumnOrTable input, list splits, Stream stream=*) cpdef Column copy_if_else( LeftCopyIfElseOperand lhs, RightCopyIfElseOperand rhs, - Column boolean_mask + Column boolean_mask, + Stream stream=* ) cpdef Table boolean_mask_scatter( TableOrListOfScalars input, Table target, - Column boolean_mask + Column boolean_mask, + Stream stream=* ) -cpdef Scalar get_element(Column input_column, size_type index) +cpdef Scalar get_element(Column input_column, size_type index, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/copying.pyi b/python/pylibcudf/pylibcudf/copying.pyi index 6cf4ed48724..864f1170993 100644 --- a/python/pylibcudf/pylibcudf/copying.pyi +++ b/python/pylibcudf/pylibcudf/copying.pyi @@ -3,6 +3,8 @@ from enum import IntEnum from typing import TypeVar +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.table import Table @@ -19,14 +21,23 @@ class OutOfBoundsPolicy(IntEnum): ColumnOrTable = TypeVar("ColumnOrTable", Column, Table) def gather( - source_table: Table, gather_map: Column, bounds_policy: OutOfBoundsPolicy + source_table: Table, + gather_map: Column, + bounds_policy: OutOfBoundsPolicy, + stream: Stream | None = None, ) -> Table: ... def scatter( - source: Table | list[Scalar], scatter_map: Column, target_table: Table + source: Table | list[Scalar], + scatter_map: Column, + target_table: Table, + stream: Stream | None = None, ) -> Table: ... def empty_like(input: ColumnOrTable) -> ColumnOrTable: ... def allocate_like( - input_column: Column, policy: MaskAllocationPolicy, size: int | None = None + input_column: Column, + policy: MaskAllocationPolicy, + size: int | None = None, + stream: Stream | None = None, ) -> Column: ... 
def copy_range_in_place( input_column: Column, @@ -34,6 +45,7 @@ def copy_range_in_place( input_begin: int, input_end: int, target_begin: int, + stream: Stream | None = None, ) -> Column: ... def copy_range( input_column: Column, @@ -41,14 +53,32 @@ def copy_range( input_begin: int, input_end: int, target_begin: int, + stream: Stream | None = None, +) -> Column: ... +def shift( + input: Column, + offset: int, + fill_value: Scalar, + stream: Stream | None = None, ) -> Column: ... -def shift(input: Column, offset: int, fill_value: Scalar) -> Column: ... -def slice(input: ColumnOrTable, indices: list[int]) -> list[ColumnOrTable]: ... -def split(input: ColumnOrTable, splits: list[int]) -> list[ColumnOrTable]: ... +def slice( + input: ColumnOrTable, indices: list[int], stream: Stream | None = None +) -> list[ColumnOrTable]: ... +def split( + input: ColumnOrTable, splits: list[int], stream: Stream | None = None +) -> list[ColumnOrTable]: ... def copy_if_else( - lhs: Column | Scalar, rhs: Column | Scalar, boolean_mask: Column + lhs: Column | Scalar, + rhs: Column | Scalar, + boolean_mask: Column, + stream: Stream | None = None, ) -> Column: ... def boolean_mask_scatter( - input: Table | list[Scalar], target: Table, boolean_mask: Column + input: Table | list[Scalar], + target: Table, + boolean_mask: Column, + stream: Stream | None = None, ) -> Table: ... -def get_element(input_column: Column, index: int) -> Scalar: ... +def get_element( + input_column: Column, index: int, stream: Stream | None = None +) -> Scalar: ... diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index 3b0ba0d9555..d7f7fa14068 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -32,10 +32,12 @@ from pylibcudf.libcudf.copying import \ from pylibcudf.libcudf.copying import \ sample_with_replacement as SampleWithReplacement # no-cython-lint +from rmm.pylibrmm.stream cimport Stream + from .column cimport Column from .scalar cimport Scalar from .table cimport Table -from .utils cimport _as_vector +from .utils cimport _as_vector, _get_stream __all__ = [ @@ -59,7 +61,8 @@ __all__ = [ cpdef Table gather( Table source_table, Column gather_map, - out_of_bounds_policy bounds_policy + out_of_bounds_policy bounds_policy, + Stream stream=None ): """Select rows from source_table according to the provided gather_map. @@ -74,6 +77,8 @@ cpdef Table gather( bounds_policy : out_of_bounds_policy Controls whether out of bounds indices are checked and nullified in the output or if indices are assumed to be in bounds. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -86,20 +91,24 @@ cpdef Table gather( If the gather_map contains nulls. """ cdef unique_ptr[table] c_result + stream = _get_stream(stream) + with nogil: c_result = cpp_copying.gather( source_table.view(), gather_map.view(), - bounds_policy + bounds_policy, + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef Table scatter( TableOrListOfScalars source, Column scatter_map, - Table target_table + Table target_table, + Stream stream=None ): """Scatter from source into target_table according to scatter_map. @@ -116,6 +125,8 @@ cpdef Table scatter( A mapping from rows in source to rows in target_table. target_table : Table The table object into which to scatter data. + stream : Stream | None + CUDA stream on which to perform the operation. 
Returns ------- @@ -138,12 +149,15 @@ cpdef Table scatter( """ cdef unique_ptr[table] c_result cdef vector[reference_wrapper[const scalar]] source_scalars + stream = _get_stream(stream) + if TableOrListOfScalars is Table: with nogil: c_result = cpp_copying.scatter( source.view(), scatter_map.view(), target_table.view(), + stream.view() ) else: source_scalars = _as_vector(source) @@ -152,8 +166,9 @@ cpdef Table scatter( source_scalars, scatter_map.view(), target_table.view(), + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef ColumnOrTable empty_like(ColumnOrTable input): @@ -184,7 +199,7 @@ cpdef ColumnOrTable empty_like(ColumnOrTable input): cpdef Column allocate_like( - Column input_column, mask_allocation_policy policy, size=None + Column input_column, mask_allocation_policy policy, size=None, Stream stream=None ): """Allocate a column with the same type as input_column. @@ -199,6 +214,8 @@ cpdef Column allocate_like( size : int, optional The number of elements to allocate in the output column. If not specified, the size of the input column is used. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -208,15 +225,17 @@ cpdef Column allocate_like( cdef unique_ptr[column] c_result cdef size_type c_size = size if size is not None else input_column.size() + stream = _get_stream(stream) with nogil: c_result = cpp_copying.allocate_like( input_column.view(), c_size, policy, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column copy_range_in_place( @@ -225,6 +244,7 @@ cpdef Column copy_range_in_place( size_type input_begin, size_type input_end, size_type target_begin, + Stream stream=None ): """Copy a range of elements from input_column to target_column. @@ -244,6 +264,8 @@ cpdef Column copy_range_in_place( The index of the last element in input_column to copy. target_begin : int The index of the first element in target_column to overwrite. + stream : Stream | None + CUDA stream on which to perform the operation. Raises ------ @@ -261,13 +283,16 @@ cpdef Column copy_range_in_place( # try and pass a temporary that decays to an rvalue reference in where the # function requires an lvalue reference. cdef mutable_column_view target_view = target_column.mutable_view() + stream = _get_stream(stream) + with nogil: cpp_copying.copy_range_in_place( input_column.view(), target_view, input_begin, input_end, - target_begin + target_begin, + stream.view() ) @@ -277,6 +302,7 @@ cpdef Column copy_range( size_type input_begin, size_type input_end, size_type target_begin, + Stream stream=None ): """Copy a range of elements from input_column to target_column. @@ -294,6 +320,8 @@ cpdef Column copy_range( The index of the last element in input_column to copy. target_begin : int The index of the first element in target_column to overwrite. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -309,6 +337,7 @@ cpdef Column copy_range( If target and source have different types. 
""" cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: c_result = cpp_copying.copy_range( @@ -316,13 +345,16 @@ cpdef Column copy_range( target_column.view(), input_begin, input_end, - target_begin + target_begin, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column shift(Column input, size_type offset, Scalar fill_value): +cpdef Column shift( + Column input, size_type offset, Scalar fill_value, Stream stream=None +): """Shift the elements of input by offset. For details on the implementation, see :cpp:func:`shift`. @@ -336,6 +368,8 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_value): fill_values : Scalar The value to use for elements that are shifted in from outside the bounds of the input column. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -349,16 +383,19 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_value): of fixed width or string type. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) + with nogil: c_result = cpp_copying.shift( input.view(), offset, - dereference(fill_value.c_obj) + dereference(fill_value.c_obj), + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef list slice(ColumnOrTable input, list indices): +cpdef list slice(ColumnOrTable input, list indices, Stream stream=None): """Slice input according to indices. For details on the implementation, see :cpp:func:`slice`. @@ -369,6 +406,8 @@ cpdef list slice(ColumnOrTable input, list indices): The column or table to slice. indices : List[int] The indices to select from input. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -387,9 +426,11 @@ cpdef list slice(ColumnOrTable input, list indices): cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i + stream = _get_stream(stream) + if ColumnOrTable is Column: with nogil: - c_col_result = cpp_copying.slice(input.view(), c_indices) + c_col_result = cpp_copying.slice(input.view(), c_indices, stream.view()) return [ Column.from_column_view(c_col_result[i], input) @@ -397,7 +438,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] else: with nogil: - c_tbl_result = cpp_copying.slice(input.view(), c_indices) + c_tbl_result = cpp_copying.slice(input.view(), c_indices, stream.view()) return [ Table.from_table_view(c_tbl_result[i], input) @@ -405,7 +446,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] -cpdef list split(ColumnOrTable input, list splits): +cpdef list split(ColumnOrTable input, list splits, Stream stream=None): """Split input into multiple. For details on the implementation, see :cpp:func:`split`. @@ -416,6 +457,8 @@ cpdef list split(ColumnOrTable input, list splits): The column to split. splits : List[int] The indices at which to split the column. + stream : Stream | None + CUDA stream on which to perform the operation. 
Returns ------- @@ -426,10 +469,11 @@ cpdef list split(ColumnOrTable input, list splits): cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i + stream = _get_stream(stream) if ColumnOrTable is Column: with nogil: - c_col_result = cpp_copying.split(input.view(), c_splits) + c_col_result = cpp_copying.split(input.view(), c_splits, stream.view()) return [ Column.from_column_view(c_col_result[i], input) @@ -437,7 +481,7 @@ cpdef list split(ColumnOrTable input, list splits): ] else: with nogil: - c_tbl_result = cpp_copying.split(input.view(), c_splits) + c_tbl_result = cpp_copying.split(input.view(), c_splits, stream.view()) return [ Table.from_table_view(c_tbl_result[i], input) @@ -448,7 +492,8 @@ cpdef list split(ColumnOrTable input, list splits): cpdef Column copy_if_else( LeftCopyIfElseOperand lhs, RightCopyIfElseOperand rhs, - Column boolean_mask + Column boolean_mask, + Stream stream=None ): """Copy elements from lhs or rhs into a new column according to boolean_mask. @@ -464,6 +509,8 @@ cpdef Column copy_if_else( boolean_mask is False. boolean_mask : Column The boolean mask to use to select elements from lhs and rhs. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -480,37 +527,43 @@ cpdef Column copy_if_else( columns), or if lhs and rhs are not of the same length (if both are columns). """ cdef unique_ptr[column] result + stream = _get_stream(stream) if LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Column: with nogil: result = cpp_copying.copy_if_else( lhs.view(), rhs.view(), - boolean_mask.view() + boolean_mask.view(), + stream.view() ) elif LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Scalar: with nogil: result = cpp_copying.copy_if_else( - lhs.view(), dereference(rhs.c_obj), boolean_mask.view() + lhs.view(), dereference(rhs.c_obj), boolean_mask.view(), stream.view() ) elif LeftCopyIfElseOperand is Scalar and RightCopyIfElseOperand is Column: with nogil: result = cpp_copying.copy_if_else( - dereference(lhs.c_obj), rhs.view(), boolean_mask.view() + dereference(lhs.c_obj), rhs.view(), boolean_mask.view(), stream.view() ) else: with nogil: result = cpp_copying.copy_if_else( - dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() + dereference(lhs.c_obj), + dereference(rhs.c_obj), + boolean_mask.view(), + stream.view() ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef Table boolean_mask_scatter( TableOrListOfScalars input, Table target, - Column boolean_mask + Column boolean_mask, + Stream stream=None ): """Scatter rows from input into target according to boolean_mask. @@ -527,6 +580,8 @@ cpdef Table boolean_mask_scatter( The table object into which to scatter data. boolean_mask : Column A mapping from rows in input to rows in target. + stream : Stream | None + CUDA stream on which to perform the operation. 
Returns ------- @@ -545,13 +600,15 @@ cpdef Table boolean_mask_scatter( """ cdef unique_ptr[table] result cdef vector[reference_wrapper[const scalar]] source_scalars + stream = _get_stream(stream) if TableOrListOfScalars is Table: with nogil: result = cpp_copying.boolean_mask_scatter( input.view(), target.view(), - boolean_mask.view() + boolean_mask.view(), + stream.view() ) else: source_scalars = _as_vector(input) @@ -560,12 +617,13 @@ cpdef Table boolean_mask_scatter( source_scalars, target.view(), boolean_mask.view(), + stream.view() ) - return Table.from_libcudf(move(result)) + return Table.from_libcudf(move(result), stream) -cpdef Scalar get_element(Column input_column, size_type index): +cpdef Scalar get_element(Column input_column, size_type index, Stream stream=None): """Get the element at index from input_column. For details on the implementation, see :cpp:func:`get_element`. @@ -576,6 +634,8 @@ cpdef Scalar get_element(Column input_column, size_type index): The column from which to get the element. index : int The index of the element to get. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -588,8 +648,10 @@ cpdef Scalar get_element(Column input_column, size_type index): If index is out of bounds. """ cdef unique_ptr[scalar] c_output + stream = _get_stream(stream) + with nogil: - c_output = cpp_copying.get_element(input_column.view(), index) + c_output = cpp_copying.get_element(input_column.view(), index, stream.view()) return Scalar.from_libcudf(move(c_output)) diff --git a/python/pylibcudf/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd index 5a05284e86a..5ee4b9879eb 100644 --- a/python/pylibcudf/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool from libcpp.functional cimport reference_wrapper @@ -16,6 +16,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer +from rmm.librmm.cuda_stream_view cimport cuda_stream_view ctypedef const scalar constscalar @@ -27,25 +28,29 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: cdef unique_ptr[table] gather ( const table_view& source_table, const column_view& gather_map, - out_of_bounds_policy policy + out_of_bounds_policy policy, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] shift( const column_view& input, size_type offset, - const scalar& fill_values + const scalar& fill_values, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] scatter ( const table_view& source_table, const column_view& scatter_map, const table_view& target_table, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] scatter ( const vector[reference_wrapper[constscalar]]& source_scalars, const column_view& indices, const table_view& target, + cuda_stream_view stream ) except +libcudf_exception_handler cpdef enum class mask_allocation_policy(int32_t): @@ -59,13 +64,15 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: cdef unique_ptr[column] allocate_like ( const column_view& input_column, - mask_allocation_policy policy + mask_allocation_policy policy, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] allocate_like ( const column_view& input_column, size_type size, - mask_allocation_policy policy + mask_allocation_policy policy, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] empty_like ( @@ -77,7 +84,8 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: mutable_column_view& target_column, size_type input_begin, size_type input_end, - size_type target_begin + size_type target_begin, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] copy_range ( @@ -85,68 +93,80 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const column_view& target_column, size_type input_begin, size_type input_end, - size_type target_begin + size_type target_begin, + cuda_stream_view stream ) except +libcudf_exception_handler cdef vector[column_view] slice ( const column_view& input_column, - vector[size_type] indices + vector[size_type] indices, + cuda_stream_view stream ) except +libcudf_exception_handler cdef vector[table_view] slice ( const table_view& input_table, - vector[size_type] indices + vector[size_type] indices, + cuda_stream_view stream ) except +libcudf_exception_handler cdef vector[column_view] split ( const column_view& input_column, - vector[size_type] splits + vector[size_type] splits, + cuda_stream_view stream ) except +libcudf_exception_handler cdef vector[table_view] split ( const table_view& input_table, - vector[size_type] splits + vector[size_type] splits, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const column_view& rhs, - const column_view& boolean_mask + const column_view& boolean_mask, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const column_view& rhs, - const column_view& boolean_mask + const column_view& boolean_mask, + 
cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const column_view& lhs, const scalar& rhs, - const column_view boolean_mask + const column_view boolean_mask, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] copy_if_else ( const scalar& lhs, const scalar& rhs, - const column_view boolean_mask + const column_view boolean_mask, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const table_view& input, const table_view& target, - const column_view& boolean_mask + const column_view& boolean_mask, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] boolean_mask_scatter ( const vector[reference_wrapper[constscalar]]& input, const table_view& target, - const column_view& boolean_mask + const column_view& boolean_mask, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[scalar] get_element ( const column_view& input, - size_type index + size_type index, + cuda_stream_view stream ) except +libcudf_exception_handler cpdef enum class sample_with_replacement(bool): diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/default_stream.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/default_stream.pxd index d94915a419f..538d233b63d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/default_stream.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/default_stream.pxd @@ -1,6 +1,9 @@ # Copyright (c) 2025, NVIDIA CORPORATION. from libcpp cimport bool +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/utilities/default_stream.hpp" namespace "cudf" nogil: cdef bool is_ptds_enabled() + cdef cuda_stream_view get_default_stream() From c80d49c5fbf821bd11a3d997be3aadfd7a002ab9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 1 Aug 2025 10:59:24 -0500 Subject: [PATCH 040/366] Capture commit hashes in pdsh benchmarks (#19548) This updates the software versions we record in the pdsh benchmarks to include the commit hash for cudf-polars and rapidsmpf. Note that this changes the serialized representation. 
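Downstream readers of the benchmark JSON should be prepared for both the old and the new shape shown below, since only cudf-polars and rapidsmpf gain the nested form. A minimal sketch of a tolerant reader (the helper name is illustrative, not part of this change):

```python
def software_version(entry):
    # Older records store a bare version string; newer ones store a
    # {"version": ..., "commit": ...} object for cudf_polars/rapidsmpf.
    if isinstance(entry, dict):
        return entry["version"]
    return entry
```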
Instead of ```json "software": { "cudf_polars": "{version}" } ``` it will now be: ```json "software": { "cudf_polars": { "version": "{version}", "commit": "{commit}" } } ``` Authors: - Tom Augspurger (https://github.com/TomAugspurger) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19548 --- .../experimental/benchmarks/utils.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py index a63f1025d60..7e56e3c57d3 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py @@ -54,14 +54,22 @@ class Record: duration: float +@dataclasses.dataclass +class VersionInfo: + """Information about the commit of the software used to run the query.""" + + version: str + commit: str + + @dataclasses.dataclass class PackageVersions: """Information about the versions of the software used to run the query.""" - cudf_polars: str + cudf_polars: str | VersionInfo polars: str python: str - rapidsmpf: str | None + rapidsmpf: str | VersionInfo | None @classmethod def collect(cls) -> PackageVersions: @@ -71,15 +79,24 @@ def collect(cls) -> PackageVersions: "polars", "rapidsmpf", ] - versions = {} + versions: dict[str, str | VersionInfo | None] = {} for name in packages: try: package = importlib.import_module(name) - versions[name] = package.__version__ except (AttributeError, ImportError): # noqa: PERF203 versions[name] = None + else: + if name in ("cudf_polars", "rapidsmpf"): + versions[name] = VersionInfo( + version=package.__version__, + commit=package.__git_commit__, + ) + else: + versions[name] = package.__version__ + versions["python"] = ".".join(str(v) for v in sys.version_info[:3]) - return cls(**versions) + # we manually ensure that only cudf-polars and rapidsmpf have a VersionInfo + return cls(**versions) # type: ignore[arg-type] @dataclasses.dataclass From 37a496796dc056a8801075bb16112823f00aadcf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 1 Aug 2025 09:53:46 -0700 Subject: [PATCH 041/366] Move str accessor tests in test_string.py to new cudf classic test directory structure (#19557) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/19557 --- .../cudf/tests/series/accessors/__init__.py | 0 .../cudf/tests/series/accessors/test_str.py | 2847 ++++++++++++++++ python/cudf/cudf/tests/test_string.py | 2874 +---------------- 3 files changed, 2871 insertions(+), 2850 deletions(-) create mode 100644 python/cudf/cudf/tests/series/accessors/__init__.py create mode 100644 python/cudf/cudf/tests/series/accessors/test_str.py diff --git a/python/cudf/cudf/tests/series/accessors/__init__.py b/python/cudf/cudf/tests/series/accessors/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/series/accessors/test_str.py b/python/cudf/cudf/tests/series/accessors/test_str.py new file mode 100644 index 00000000000..509ade12ce1 --- /dev/null +++ b/python/cudf/cudf/tests/series/accessors/test_str.py @@ -0,0 +1,2847 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
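+#
+# Tests for the cudf ``Series``/``Index`` ``.str`` accessor, moved here from
+# test_string.py as part of the classic test directory restructuring. Each
+# test builds matching pandas and cudf objects and compares results with
+# ``assert_eq``.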
+ +import json +import re +import urllib.parse +from contextlib import nullcontext as does_not_raise + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import cudf +from cudf import concat +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, + expect_warning_if, +) + + +def raise_builder(flags, exceptions): + if any(flags): + return pytest.raises(exceptions) + else: + return does_not_raise() + + +@pytest.fixture( + params=[ + ["AbC", "de", "FGHI", "j", "kLm"], + ["nOPq", None, "RsT", None, "uVw"], + [None, None, None, None, None], + ], + ids=["no_nulls", "some_nulls", "all_nulls"], +) +def data(request): + return request.param + + +@pytest.fixture( + params=[None, [10, 11, 12, 13, 14]], ids=["None_index", "Set_index"] +) +def index(request): + return request.param + + +@pytest.fixture +def ps_gs(data, index): + ps = pd.Series(data, index=index, dtype="str", name="nice name") + gs = cudf.Series(data, index=index, dtype="str", name="nice name") + return (ps, gs) + + +def test_getitem_out_of_bounds(): + data = ["123", "12", "1"] + pd_ser = pd.Series(data) + cudf_ser = cudf.Series(data) + expected = pd_ser.str[2] + result = cudf_ser.str[2] + assert_eq(result, expected) + + expected = pd_ser.str[-2] + result = cudf_ser.str[-2] + assert_eq(result, expected) + + +@pytest.mark.parametrize("method", ["startswith", "endswith"]) +@pytest.mark.parametrize("pat", [None, (1, 2), pd.Series([1])]) +def test_startsendwith_invalid_pat(method, pat): + ser = cudf.Series(["1"]) + with pytest.raises(TypeError): + getattr(ser.str, method)(pat) + + +@pytest.mark.parametrize("method", ["rindex", "index"]) +def test_index_int64_pandas_compat(method): + data = ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"] + with cudf.option_context("mode.pandas_compatible", True): + result = getattr(cudf.Series(data).str, method)("E", 4, 8) + expected = getattr(pd.Series(data).str, method)("E", 4, 8) + assert_eq(result, expected) + + +def test_replace_invalid_scalar_repl(): + ser = cudf.Series(["1"]) + with pytest.raises(TypeError): + ser.str.replace("1", 2) + + +def test_string_methods_setattr(): + ser = cudf.Series(["ab", "cd", "ef"]) + pser = ser.to_pandas() + + assert_exceptions_equal( + lfunc=ser.str.__setattr__, + rfunc=pser.str.__setattr__, + lfunc_args_and_kwargs=(("a", "b"),), + rfunc_args_and_kwargs=(("a", "b"),), + ) + + +@pytest.mark.parametrize( + "data", + [ + [ + """ + { + "store":{ + "book":[ + { + "category":"reference", + "author":"Nigel Rees", + "title":"Sayings of the Century", + "price":8.95 + }, + { + "category":"fiction", + "author":"Evelyn Waugh", + "title":"Sword of Honour", + "price":12.99 + } + ] + } + } + """ + ], + [ + """ + { + "store":{ + "book":[ + { + "category":"reference", + "author":"Nigel Rees", + "title":"Sayings of the Century", + "price":8.95 + } + ] + } + } + """, + """ + { + "store":{ + "book":[ + { + "category":"fiction", + "author":"Evelyn Waugh", + "title":"Sword of Honour", + "price":12.99 + } + ] + } + } + """, + ], + ], +) +def test_string_get_json_object_n(data): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq( + json.loads(gs.str.get_json_object("$.store")[0]), + ps.apply(lambda x: json.loads(x)["store"])[0], + ) + assert_eq( + json.loads(gs.str.get_json_object("$.store.book")[0]), + ps.apply(lambda x: json.loads(x)["store"]["book"])[0], + ) + assert_eq( + gs.str.get_json_object("$.store.book[0].category"), + ps.apply(lambda x: json.loads(x)["store"]["book"][0]["category"]), + ) + + 
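+# A brief usage sketch for get_json_object (illustrative data, not part of
+# the test suite): the JSONPath expression is applied to each element and
+# the matching JSON value is returned as a string.
+#
+#     >>> cudf.Series(['{"a": {"b": 1}}']).str.get_json_object("$.a.b")
+#     0    1
+#     dtype: object
+
+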
+@pytest.mark.parametrize( + "json_path", ["$.store", "$.store.book", "$.store.book[*].category", " "] +) +def test_string_get_json_object_empty_json_strings(json_path): + gs = cudf.Series( + [ + """ + { + "":{ + "":[ + { + "":"", + "":"", + "":"" + }, + { + "":"fiction", + "":"", + "title":"" + } + ] + } + } + """ + ] + ) + + got = gs.str.get_json_object(json_path) + expect = cudf.Series([None], dtype="object") + + assert_eq(got, expect) + + +@pytest.mark.parametrize("json_path", ["a", ".", "/.store"]) +def test_string_get_json_object_invalid_JSONPath(json_path): + gs = cudf.Series( + [ + """ + { + "store":{ + "book":[ + { + "category":"reference", + "author":"Nigel Rees", + "title":"Sayings of the Century", + "price":8.95 + }, + { + "category":"fiction", + "author":"Evelyn Waugh", + "title":"Sword of Honour", + "price":12.99 + } + ] + } + } + """ + ] + ) + + with pytest.raises(ValueError): + gs.str.get_json_object(json_path) + + +def test_string_get_json_object_allow_single_quotes(): + gs = cudf.Series( + [ + """ + { + "store":{ + "book":[ + { + 'author':"Nigel Rees", + "title":'Sayings of the Century', + "price":8.95 + }, + { + "category":"fiction", + "author":"Evelyn Waugh", + 'title':"Sword of Honour", + "price":12.99 + } + ] + } + } + """ + ] + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[0].author", allow_single_quotes=True + ), + cudf.Series(["Nigel Rees"]), + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[*].title", allow_single_quotes=True + ), + cudf.Series(["['Sayings of the Century',\"Sword of Honour\"]"]), + ) + + assert_eq( + gs.str.get_json_object( + "$.store.book[0].author", allow_single_quotes=False + ), + cudf.Series([None]), + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[*].title", allow_single_quotes=False + ), + cudf.Series([None]), + ) + + +def test_string_get_json_object_strip_quotes_from_single_strings(): + gs = cudf.Series( + [ + """ + { + "store":{ + "book":[ + { + "author":"Nigel Rees", + "title":"Sayings of the Century", + "price":8.95 + }, + { + "category":"fiction", + "author":"Evelyn Waugh", + "title":"Sword of Honour", + "price":12.99 + } + ] + } + } + """ + ] + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[0].author", strip_quotes_from_single_strings=True + ), + cudf.Series(["Nigel Rees"]), + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[*].title", strip_quotes_from_single_strings=True + ), + cudf.Series(['["Sayings of the Century","Sword of Honour"]']), + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[0].author", strip_quotes_from_single_strings=False + ), + cudf.Series(['"Nigel Rees"']), + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[*].title", strip_quotes_from_single_strings=False + ), + cudf.Series(['["Sayings of the Century","Sword of Honour"]']), + ) + + +def test_string_get_json_object_missing_fields_as_nulls(): + gs = cudf.Series( + [ + """ + { + "store":{ + "book":[ + { + "author":"Nigel Rees", + "title":"Sayings of the Century", + "price":8.95 + }, + { + "category":"fiction", + "author":"Evelyn Waugh", + "title":"Sword of Honour", + "price":12.99 + } + ] + } + } + """ + ] + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[0].category", missing_fields_as_nulls=True + ), + cudf.Series(["null"]), + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[*].category", missing_fields_as_nulls=True + ), + cudf.Series(['[null,"fiction"]']), + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[0].category", missing_fields_as_nulls=False + ), + 
cudf.Series([None]), + ) + assert_eq( + gs.str.get_json_object( + "$.store.book[*].category", missing_fields_as_nulls=False + ), + cudf.Series(['["fiction"]']), + ) + + +def test_str_join_lists_error(): + sr = cudf.Series([["a", "a"], ["b"], ["c"]]) + + with pytest.raises( + ValueError, match="sep_na_rep cannot be defined when `sep` is scalar." + ): + sr.str.join(sep="-", sep_na_rep="-") + + with pytest.raises( + TypeError, + match=re.escape( + "string_na_rep should be a string scalar, got [10, 20] of type " + ": " + ), + ): + sr.str.join(string_na_rep=[10, 20]) + + with pytest.raises( + ValueError, + match=re.escape( + "sep should be of similar size to the series, got: 2, expected: 3" + ), + ): + sr.str.join(sep=["=", "-"]) + + with pytest.raises( + TypeError, + match=re.escape( + "sep_na_rep should be a string scalar, got " + "['na'] of type: " + ), + ): + sr.str.join(sep=["-", "+", "."], sep_na_rep=["na"]) + + with pytest.raises( + TypeError, + match=re.escape( + "sep should be an str, array-like or Series object, " + "found " + ), + ): + sr.str.join(sep=cudf.DataFrame()) + + +@pytest.mark.parametrize( + "sr,sep,string_na_rep,sep_na_rep,expected", + [ + ( + [["a", "a"], ["b"], ["c"]], + "-", + None, + None, + ["a-a", "b", "c"], + ), + ( + [["a", "b"], [None], [None, "hello", None, "world"]], + "__", + "=", + None, + ["a__b", None, "=__hello__=__world"], + ), + ( + [ + ["a", None, "b"], + [None], + [None, "hello", None, "world"], + None, + ], + ["-", "_", "**", "!"], + None, + None, + ["a--b", None, "**hello****world", None], + ), + ( + [ + ["a", None, "b"], + [None], + [None, "hello", None, "world"], + None, + ], + ["-", "_", "**", None], + "rep_str", + "sep_str", + ["a-rep_str-b", None, "rep_str**hello**rep_str**world", None], + ), + ( + [[None, "a"], [None], None], + ["-", "_", None], + "rep_str", + None, + ["rep_str-a", None, None], + ), + ( + [[None, "a"], [None], None], + ["-", "_", None], + None, + "sep_str", + ["-a", None, None], + ), + ], +) +def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected): + sr = cudf.Series(sr) + actual = sr.str.join( + sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep + ) + expected = cudf.Series(expected) + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "patterns, expected", + [ + ( + lambda: ["a", "s", "g", "i", "o", "r"], + [ + [-1, 0, 5, 3, -1, 2], + [-1, -1, -1, -1, 1, -1], + [2, 0, -1, -1, -1, 3], + [-1, -1, -1, 0, -1, -1], + ], + ), + ( + lambda: cudf.Series(["a", "string", "g", "inn", "o", "r", "sea"]), + [ + [-1, 0, 5, -1, -1, 2, -1], + [-1, -1, -1, -1, 1, -1, -1], + [2, -1, -1, -1, -1, 3, 0], + [-1, -1, -1, -1, -1, -1, -1], + ], + ), + ], +) +def test_str_find_multiple(patterns, expected): + s = cudf.Series(["strings", "to", "search", "in"]) + t = patterns() + + expected = cudf.Series(expected) + + # We convert to pandas because find_multiple returns ListDtype(int32) + # and expected is ListDtype(int64). + # Currently there is no easy way to type-cast these to match. 
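+    # Each row of the result lists, for every pattern, the character
+    # position of its first occurrence in that string (-1 when absent).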
+ assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas()) + + s = cudf.Index(s) + t = cudf.Index(t) + + expected.index = s + + assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas()) + + +def test_str_find_multiple_error(): + s = cudf.Series(["strings", "to", "search", "in"]) + with pytest.raises( + TypeError, + match=re.escape( + "patterns should be an array-like or a Series object, found " + "" + ), + ): + s.str.find_multiple("a") + + t = cudf.Series([1, 2, 3]) + with pytest.raises( + TypeError, + match=re.escape("patterns can only be of 'string' dtype, got: int64"), + ): + s.str.find_multiple(t) + + +def test_str_iterate_error(): + s = cudf.Series(["abc", "xyz"]) + with pytest.raises(TypeError): + iter(s.str) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "xyz", "pqr", "tuv"], + ["aaaaaaaaaaaa", None], + ], +) +@pytest.mark.parametrize( + "index", + [ + 0, + 1, + 2, + slice(0, 1, 2), + slice(0, 5, 2), + slice(-1, -2, 1), + slice(-1, -2, -1), + slice(-2, -1, -1), + slice(-2, -1, 1), + slice(0), + slice(None), + ], +) +def test_string_str_subscriptable(data, index): + psr = pd.Series(data) + gsr = cudf.Series(data) + + assert_eq(psr.str[index], gsr.str[index]) + + psi = pd.Index(data) + gsi = cudf.Index(data) + + assert_eq(psi.str[index], gsi.str[index]) + + +@pytest.mark.parametrize( + "data,expected", + [ + (["aaaaaaaaaaaa"], [12]), + (["abc", "d", "ef"], [3, 1, 2]), + (["Hello", "Bye", "Thanks 😊"], [5, 3, 11]), + (["\n\t", "Bye", "Thanks 😊"], [2, 3, 11]), + ], +) +def test_string_str_byte_count(data, expected): + sr = cudf.Series(data) + expected = cudf.Series(expected, dtype="int32") + actual = sr.str.byte_count() + assert_eq(expected, actual) + + si = cudf.Index(data) + expected = cudf.Index(expected, dtype="int32") + actual = si.str.byte_count() + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data,expected", + [ + (["1", "2", "3", "4", "5"], [True, True, True, True, True]), + ( + ["1.1", "2.0", "3.2", "4.3", "5."], + [False, False, False, False, False], + ), + ( + [".12312", "213123.", ".3223.", "323423.."], + [False, False, False, False], + ), + ([""], [False]), + ( + ["1..1", "+2", "++3", "4++", "-5"], + [False, True, False, False, True], + ), + ( + [ + "24313345435345 ", + "+2632726478", + "++367293674326", + "4382493264392746.237649274692++", + "-578239479238469264", + ], + [False, True, False, False, True], + ), + ( + ["2a2b", "a+b", "++a", "a.b++", "-b"], + [False, False, False, False, False], + ), + ( + ["2a2b", "1+3", "9.0++a", "+", "-"], + [False, False, False, False, False], + ), + ], +) +def test_str_isinteger(data, expected): + sr = cudf.Series(data, dtype="str") + expected = cudf.Series(expected) + actual = sr.str.isinteger() + assert_eq(expected, actual) + + sr = cudf.Index(data) + expected = cudf.Index(expected) + actual = sr.str.isinteger() + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data,expected", + [ + (["1", "2", "3", "4", "5"], [True, True, True, True, True]), + (["1.1", "2.0", "3.2", "4.3", "5."], [True, True, True, True, True]), + ([""], [False]), + ( + [".12312", "213123.", ".3223.", "323423.."], + [True, True, False, False], + ), + ( + ["1.00.323.1", "+2.1", "++3.30", "4.9991++", "-5.3"], + [False, True, False, False, True], + ), + ( + [ + "24313345435345 ", + "+2632726478", + "++367293674326", + "4382493264392746.237649274692++", + "-578239479238469264", + ], + [False, True, False, False, True], + ), + ( + [ + "24313345435345.32732 ", + "+2632726478.3627638276", + 
"++0.326294632367293674326", + "4382493264392746.237649274692++", + "-57823947923.8469264", + ], + [False, True, False, False, True], + ), + ( + ["2a2b", "a+b", "++a", "a.b++", "-b"], + [False, False, False, False, False], + ), + ( + ["2a2b", "1+3", "9.0++a", "+", "-"], + [False, False, False, False, False], + ), + ], +) +def test_str_isfloat(data, expected): + sr = cudf.Series(data, dtype="str") + expected = cudf.Series(expected) + actual = sr.str.isfloat() + assert_eq(expected, actual) + + sr = cudf.Index(data) + expected = cudf.Index(expected) + actual = sr.str.isfloat() + assert_eq(expected, actual) + + +def test_string_isipv4(): + gsr = cudf.Series( + [ + "", + None, + "1...1", + "141.168.0.1", + "127.0.0.1", + "1.255.0.1", + "256.27.28.26", + "25.257.28.26", + "25.27.258.26", + "25.27.28.256", + "-1.0.0.0", + ] + ) + got = gsr.str.isipv4() + expected = cudf.Series( + [ + False, + None, + False, + True, + True, + True, + False, + False, + False, + False, + False, + ] + ) + assert_eq(expected, got) + + +def test_string_ip4_to_int(): + gsr = cudf.Series( + ["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"] + ) + expected = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) + + got = gsr.str.ip2int() + assert_eq(expected, got) + + got = gsr.str.ip_to_int() # alias + assert_eq(expected, got) + + +def test_string_istimestamp(): + gsr = cudf.Series( + [ + "", + None, + "20201009 123456.987654AM+0100", + "1920111 012345.000001", + "18201235 012345.1", + "20201009 250001.2", + "20201009 129901.3", + "20201009 123499.4", + "20201009 000000.500000PM-0130", + "20201009:000000.600000", + "20201009 010203.700000PM-2500", + "20201009 010203.800000AM+0590", + "20201009 010203.900000AP-0000", + ] + ) + got = gsr.str.istimestamp(r"%Y%m%d %H%M%S.%f%p%z") + expected = cudf.Series( + [ + False, + None, + True, + False, + False, + False, + False, + False, + True, + False, + False, + False, + False, + ] + ) + assert_eq(expected, got) + + +def test_istimestamp_empty(): + gsr = cudf.Series([], dtype="object") + result = gsr.str.istimestamp("%Y%m%d") + expected = cudf.Series([], dtype="bool") + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + ["f0:18:98:22:c2:e4", "00:00:00:00:00:00", "ff:ff:ff:ff:ff:ff"], + ["f0189822c2e4", "000000000000", "ffffffffffff"], + ["0xf0189822c2e4", "0x000000000000", "0xffffffffffff"], + ["0Xf0189822c2e4", "0X000000000000", "0Xffffffffffff"], + ], +) +def test_string_hex_to_int(data): + gsr = cudf.Series(data) + + expected = cudf.Series([263988422296292, 0, 281474976710655]) + + got = gsr.str.htoi() + assert_eq(expected, got) + + got = gsr.str.hex_to_int() # alias + assert_eq(expected, got) + + +def test_string_ishex(): + gsr = cudf.Series(["", None, "0x01a2b3c4d5e6f", "0789", "ABCDEF0"]) + got = gsr.str.ishex() + expected = cudf.Series([False, None, True, True, True]) + assert_eq(expected, got) + + +def test_string_str_code_points(): + data = [ + "abc", + "Def", + None, + "jLl", + "dog and cat", + "accénted", + "", + " 1234 ", + "XYZ", + ] + gs = cudf.Series(data) + expected = [ + 97, + 98, + 99, + 68, + 101, + 102, + 106, + 76, + 108, + 100, + 111, + 103, + 32, + 97, + 110, + 100, + 32, + 99, + 97, + 116, + 97, + 99, + 99, + 50089, + 110, + 116, + 101, + 100, + 32, + 49, + 50, + 51, + 52, + 32, + 88, + 89, + 90, + ] + expected = cudf.Series(expected) + + assert_eq(expected, gs.str.code_points(), check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + ["http://www.hellow.com", "/home/nvidia/nfs", "123.45 ~ABCDEF"], + ["23", 
"³", "⅕", ""], + [" ", "\t\r\n ", ""], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ], +) +def test_string_str_url_encode(data): + gs = cudf.Series(data) + + got = gs.str.url_encode() + expected = pd.Series([urllib.parse.quote(url, safe="~") for url in data]) + assert_eq(expected, got) + + +def test_string_str_decode_url(): + data = [ + "http://www.hellow.com?k1=acc%C3%A9nted&k2=a%2F/b.c", + "%2Fhome%2fnfs", + "987%20ZYX", + ] + gs = cudf.Series(data) + + got = gs.str.url_decode() + expected = pd.Series([urllib.parse.unquote(url) for url in data]) + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "xyz", "a", "ab", "123", "097"], + ["A B", "1.5", "3,000"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ["line to be wrapped", "another line to be wrapped"], + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], + ], +) +def test_string_str_translate(data): + ps = pd.Series(data) + gs = cudf.Series(data) + + assert_eq( + ps.str.translate(str.maketrans({"a": "z"})), + gs.str.translate(str.maketrans({"a": "z"})), + ) + assert_eq( + pd.Index(ps).str.translate(str.maketrans({"a": "z"})), + cudf.Index(gs).str.translate(str.maketrans({"a": "z"})), + ) + assert_eq( + ps.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), + gs.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), + ) + assert_eq( + pd.Index(ps).str.translate( + str.maketrans({"a": "z", "i": "$", "z": "1"}) + ), + cudf.Index(gs).str.translate( + str.maketrans({"a": "z", "i": "$", "z": "1"}) + ), + ) + assert_eq( + ps.str.translate( + str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) + ), + gs.str.translate( + str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) + ), + ) + assert_eq( + pd.Index(ps).str.translate( + str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) + ), + cudf.Index(gs).str.translate( + str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) + ), + ) + assert_eq( + ps.str.translate(str.maketrans({"é": "É"})), + gs.str.translate(str.maketrans({"é": "É"})), + ) + + +def test_string_str_filter_characters(): + data = [ + "hello world", + "A+B+C+D", + "?!@#$%^&*()", + "accént", + None, + "$1.50", + "", + ] + gs = cudf.Series(data) + expected = cudf.Series( + ["helloworld", "ABCD", "", "accnt", None, "150", ""] + ) + filter = {"a": "z", "A": "Z", "0": "9"} + assert_eq(expected, gs.str.filter_characters(filter)) + + expected = cudf.Series([" ", "+++", "?!@#$%^&*()", "é", None, "$.", ""]) + assert_eq(expected, gs.str.filter_characters(filter, False)) + + expected = cudf.Series( + ["hello world", "A B C D", " ", "acc nt", None, " 1 50", ""] + ) + assert_eq(expected, gs.str.filter_characters(filter, True, " ")) + + with pytest.raises(TypeError): + gs.str.filter_characters(filter, True, ["a"]) + + +@pytest.mark.parametrize( + "data,sub,er", + [ + (["abc", "xyz", "a", "ab", "123", "097"], "a", ValueError), + (["A B", "1.5", "3,000"], "abc", ValueError), + (["23", "³", "⅕", ""], "⅕", ValueError), + ([" ", "\t\r\n ", ""], "\n", ValueError), + (["$", "B", "Aab$", "$$ca", "C$B$", "cat"], "$", ValueError), + (["line to be wrapped", "another line to be wrapped"], " ", None), + ( + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + "+", + ValueError, + ), + (["line to be wrapped", "another line to be wrapped"], "", None), + ], +) +def test_string_str_rindex(data, sub, er): + ps = pd.Series(data) + gs = cudf.Series(data) + + if er is None: + 
assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False) + assert_eq( + pd.Index(ps).str.rindex(sub), + cudf.Index(gs).str.rindex(sub), + exact=False, + ) + + try: + ps.str.rindex(sub) + except er: + pass + else: + assert not er + + try: + gs.str.rindex(sub) + except er: + pass + else: + assert not er + + +@pytest.mark.parametrize( + "data,sub,expect", + [ + ( + ["abc", "xyz", "a", "ab", "123", "097"], + ["b", "y", "a", "c", "4", "8"], + [True, True, True, False, False, False], + ), + ( + ["A B", "1.5", "3,000", "23", "³", "⅕"], + ["A B", ".", ",", "1", " ", " "], + [True, True, True, False, False, False], + ), + ( + [" ", "\t", "\r", "\f ", "\n", ""], + ["", "\t", "\r", "xx", "yy", "zz"], + [True, True, True, False, False, False], + ), + ( + ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ["$", "B", "ab", "*", "@", "dog"], + [True, True, True, False, False, False], + ), + ( + ["hello", "there", "world", "-1234", None, "accént"], + ["lo", "e", "o", "+1234", " ", "e"], + [True, True, True, False, None, False], + ), + ( + ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", "", "x", None], + ["A", "B", "C", " ", "y", "e"], + [True, True, True, False, False, None], + ), + ], +) +def test_string_contains_multi(data, sub, expect): + gs = cudf.Series(data) + sub = cudf.Series(sub) + got = gs.str.contains(sub) + expect = cudf.Series(expect) + assert_eq(expect, got, check_dtype=False) + + +# Pandas does not allow 'case' or 'flags' if 'pat' is re.Pattern +# This covers contains, match, count, and replace +@pytest.mark.parametrize( + "pat", + [re.compile("[n-z]"), re.compile("[A-Z]"), re.compile("de"), "A"], +) +@pytest.mark.parametrize("repl", ["xyz", "", " "]) +def test_string_compiled_re(ps_gs, pat, repl): + ps, gs = ps_gs + + expect = ps.str.contains(pat, regex=True) + got = gs.str.contains(pat, regex=True) + assert_eq(expect, got) + + expect = ps.str.match(pat) + got = gs.str.match(pat) + assert_eq(expect, got) + + expect = ps.str.count(pat) + got = gs.str.count(pat) + assert_eq(expect, got, check_dtype=False) + + expect = ps.str.replace(pat, repl, regex=True) + got = gs.str.replace(pat, repl, regex=True) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "xyz", "a", "ab", "123", "097"], + ["A B", "1.5", "3,000"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ["line to be wrapped", "another line to be wrapped"], + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], + ], +) +@pytest.mark.parametrize("pat", ["", " ", "a", "abc", "cat", "$", "\n"]) +def test_string_str_match(data, pat): + ps = pd.Series(data) + gs = cudf.Series(data) + + assert_eq(ps.str.match(pat), gs.str.match(pat)) + assert_eq( + pd.Index(pd.Index(ps).str.match(pat)), cudf.Index(gs).str.match(pat) + ) + + +@pytest.mark.parametrize( + "data,sub,er", + [ + (["abc", "xyz", "a", "ab", "123", "097"], "a", ValueError), + (["A B", "1.5", "3,000"], "abc", ValueError), + (["23", "³", "⅕", ""], "⅕", ValueError), + ([" ", "\t\r\n ", ""], "\n", ValueError), + (["$", "B", "Aab$", "$$ca", "C$B$", "cat"], "$", ValueError), + (["line to be wrapped", "another line to be wrapped"], " ", None), + ( + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + "+", + ValueError, + ), + (["line to be wrapped", "another line to be wrapped"], "", None), + ], +) +def test_string_str_index(data, sub, er): + ps = pd.Series(data) + gs = cudf.Series(data) + + if er is None: + assert_eq(ps.str.index(sub), gs.str.index(sub), check_dtype=False) + + try: + ps.str.index(sub) + except er: + pass + else: + assert not er + + try: + gs.str.index(sub) + except er: + pass + else: + assert not er + + +@pytest.mark.parametrize( + "data", + [ + ["str_foo", "str_bar", "no_prefix", "", None], + ["foo_str", "bar_str", "no_suffix", "", None], + ], +) +def test_string_remove_suffix_prefix(data): + ps = pd.Series(data) + gs = cudf.Series(data) + + got = gs.str.removeprefix("str_") + expect = ps.str.removeprefix("str_") + assert_eq( + expect, + got, + check_dtype=False, + ) + got = gs.str.removesuffix("_str") + expect = ps.str.removesuffix("_str") + assert_eq( + expect, + got, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "xyz", "a", "ab", "123", "097"], + ["A B", "1.5", "3,000"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ["line to be wrapped", "another line to be wrapped"], + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], + ], +) +@pytest.mark.parametrize( + "sub", + ["", " ", "a", "abc", "cat", "$", "\n"], +) +def test_string_find(data, sub): + ps = pd.Series(data) + gs = cudf.Series(data) + + got = gs.str.find(sub) + expect = ps.str.find(sub) + assert_eq( + expect, + got, + check_dtype=False, + ) + + got = gs.str.find(sub, start=1) + expect = ps.str.find(sub, start=1) + assert_eq( + expect, + got, + check_dtype=False, + ) + + got = gs.str.find(sub, end=10) + expect = ps.str.find(sub, end=10) + assert_eq( + expect, + got, + check_dtype=False, + ) + + got = gs.str.find(sub, start=2, end=10) + expect = ps.str.find(sub, start=2, end=10) + assert_eq( + expect, + got, + check_dtype=False, + ) + + got = gs.str.rfind(sub) + expect = ps.str.rfind(sub) + assert_eq( + expect, + got, + check_dtype=False, + ) + + got = gs.str.rfind(sub, start=1) + expect = ps.str.rfind(sub, start=1) + assert_eq( + expect, + got, + check_dtype=False, + ) + + got = gs.str.rfind(sub, end=10) + expect = ps.str.rfind(sub, end=10) + assert_eq( + expect, + got, + check_dtype=False, + ) + + got = gs.str.rfind(sub, start=2, end=10) + expect = ps.str.rfind(sub, start=2, end=10) + assert_eq( + expect, + got, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "xyz", "a", "ab", "123", "097"], + ["A B", "1.5", "3,000"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ["line to be wrapped", "another line to be wrapped"], + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], + ], +) +@pytest.mark.parametrize( + "pat", + ["", None, " ", "a", "abc", "cat", "$", "\n"], +) +def test_string_starts_ends(data, pat): + ps = pd.Series(data) + gs = cudf.Series(data) + + if pat is None: + assert_exceptions_equal( + lfunc=ps.str.startswith, + rfunc=gs.str.startswith, + lfunc_args_and_kwargs=([pat],), + rfunc_args_and_kwargs=([pat],), + ) + assert_exceptions_equal( + lfunc=ps.str.endswith, + rfunc=gs.str.endswith, + lfunc_args_and_kwargs=([pat],), + rfunc_args_and_kwargs=([pat],), + ) + else: + assert_eq( + ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False + ) + assert_eq( + ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False + ) + + +@pytest.mark.parametrize( + "data,pat", + [ + ( + ["abc", "xyz", "a", "ab", "123", "097"], + ("abc", "x", "a", "b", "3", "7"), + ), + (["A B", "1.5", "3,000"], ("A ", ".", ",")), + (["23", "³", "⅕", ""], ("23", "³", "⅕", "")), + ([" ", "\t\r\n ", ""], ("d", "\n ", "")), + ( + ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ("$", "$", "a", "<", "(", "#"), + ), + ( + ["line to be wrapped", "another line to be wrapped"], + ("another", "wrapped"), + ), + ( + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + ("hsdjfk", "", "ll", "+", "-", "w", "-", "én"), + ), + ( + ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], + ("1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", ""), + ), + ], +) +def test_string_starts_ends_list_like_pat(data, pat): + gs = cudf.Series(data) + + starts_expected = [] + ends_expected = [] + for i in range(len(pat)): + if data[i] is None: + starts_expected.append(None) + ends_expected.append(None) + else: + if pat[i] is None: + starts_expected.append(False) + ends_expected.append(False) + else: + starts_expected.append(data[i].startswith(pat[i])) + ends_expected.append(data[i].endswith(pat[i])) + starts_expected = pd.Series(starts_expected) + ends_expected = pd.Series(ends_expected) + assert_eq(starts_expected, gs.str.startswith(pat), check_dtype=False) + assert_eq(ends_expected, gs.str.endswith(pat), check_dtype=False) + + +@pytest.mark.parametrize( + "find", + [ + "(\\d)(\\d)", + "(\\d)(\\d)", + "(\\d)(\\d)", + "(\\d)(\\d)", + "([a-z])-([a-z])", + "([a-z])-([a-zé])", + "([a-z])-([a-z])", + "([a-z])-([a-zé])", + re.compile("([A-Z])(\\d)"), + ], +) +@pytest.mark.parametrize( + "replace", + ["\\1-\\2", "V\\2-\\1", "\\1 \\2", "\\2 \\1", "X\\1+\\2Z", "X\\1+\\2Z"], +) +def test_string_replace_with_backrefs(find, replace): + s = [ + "A543", + "Z756", + "", + None, + "tést-string", + "two-thréé four-fivé", + "abcd-éfgh", + "tést-string-again", + ] + ps = pd.Series(s) + gs = cudf.Series(s) + got = gs.str.replace_with_backrefs(find, replace) + expected = ps.str.replace(find, replace, regex=True) + assert_eq(got, expected) + + got = cudf.Index(gs).str.replace_with_backrefs(find, replace) + expected = pd.Index(ps).str.replace(find, replace, regex=True) + assert_eq(got, expected) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "xyz", "a", "ab", "123", "097"], + ["A B", "1.5", "3,000"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat", "cat\ndog"], + ["line\nto be wrapped", "another\nline\nto be wrapped"], + ], +) +@pytest.mark.parametrize( + "pat", + ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be", "cat$"], +) +@pytest.mark.parametrize("flags", [0, re.MULTILINE, re.DOTALL]) +def test_string_count(data, pat, flags): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq( + gs.str.count(pat=pat, flags=flags), + ps.str.count(pat=pat, flags=flags), + check_dtype=False, + ) + assert_eq( + cudf.Index(gs).str.count(pat=pat), + pd.Index(ps).str.count(pat=pat), + exact=False, + ) + + +@pytest.mark.parametrize( + "pat, flags", + [ + ("Monkey", 0), + ("on", 0), + ("b", 0), + ("on$", 0), + ("on$", re.MULTILINE), + ("o.*k", re.DOTALL), + ], +) +def test_string_findall(pat, flags): + test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] + ps = pd.Series(test_data) + gs = cudf.Series(test_data) + + expected = ps.str.findall(pat, flags) + actual = gs.str.findall(pat, flags) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "pat, flags, pos", + [ + ("Monkey", 0, [-1, 0, -1, -1]), + ("on", 0, [2, 1, -1, 1]), + ("bit", 0, [-1, -1, 3, -1]), + ("on$", 0, [2, -1, -1, -1]), + ("on$", re.MULTILINE, [2, -1, -1, 1]), + ("o.*k", re.DOTALL, [-1, 1, -1, 1]), + ], +) +def test_string_find_re(pat, flags, pos): + test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] + gs = cudf.Series(test_data) + + expected = pd.Series(pos, dtype=np.int32) + actual = gs.str.find_re(pat, flags) + assert_eq(expected, actual) + + +def test_string_replace_multi(): + ps = pd.Series(["hello", "goodbye"]) + gs = cudf.Series(["hello", "goodbye"]) + expect = ps.str.replace("e", "E").str.replace("o", "O") + got = gs.str.replace(["e", "o"], ["E", "O"]) + + assert_eq(expect, got) + + ps = pd.Series(["foo", "fuz", 
np.nan]) + gs = cudf.Series.from_pandas(ps) + + expect = ps.str.replace("f.", "ba", regex=True) + got = gs.str.replace(["f."], ["ba"], regex=True) + assert_eq(expect, got) + + ps = pd.Series(["f.o", "fuz", np.nan]) + gs = cudf.Series.from_pandas(ps) + + expect = ps.str.replace("f.", "ba", regex=False) + got = gs.str.replace(["f."], ["ba"], regex=False) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ["A,,B", "1,,5", "3,00,0"], + ["Linda van der Berg", "George Pitt-Rivers"], + ["+23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], + ], +) +@pytest.mark.parametrize("width", [0, 1, 25]) +@pytest.mark.parametrize("side", ["left", "right", "both"]) +@pytest.mark.parametrize("fillchar", [" ", ".", "\n", "+", "\t"]) +def test_strings_pad_tests(data, width, side, fillchar): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq( + ps.str.pad(width=width, side=side, fillchar=fillchar), + gs.str.pad(width=width, side=side, fillchar=fillchar), + ) + + gi = cudf.Index(data) + pi = pd.Index(data) + + assert_eq( + pi.str.pad(width=width, side=side, fillchar=fillchar), + gi.str.pad(width=width, side=side, fillchar=fillchar), + ) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "xyz", "a", "ab", "123", "097"], + ["A B", "1.5", "3,000"], + ["23", "³", "⅕", ""], + pytest.param([" ", "\t\r\n ", ""], marks=pytest.mark.xfail), + ["leopard", "Golden Eagle", "SNAKE", ""], + ["line to be wrapped", "another line to be wrapped"], + ], +) +@pytest.mark.parametrize("width", [1, 20]) +def test_string_wrap(data, width): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq( + gs.str.wrap( + width=width, + break_long_words=False, + expand_tabs=False, + replace_whitespace=True, + drop_whitespace=True, + break_on_hyphens=False, + ), + ps.str.wrap( + width=width, + break_long_words=False, + expand_tabs=False, + replace_whitespace=True, + drop_whitespace=True, + break_on_hyphens=False, + ), + ) + + gi = cudf.Index(data) + pi = pd.Index(data) + + assert_eq( + gi.str.wrap( + width=width, + break_long_words=False, + expand_tabs=False, + replace_whitespace=True, + drop_whitespace=True, + break_on_hyphens=False, + ), + pi.str.wrap( + width=width, + break_long_words=False, + expand_tabs=False, + replace_whitespace=True, + drop_whitespace=True, + break_on_hyphens=False, + ), + ) + + +@pytest.mark.parametrize( + "data", + [ + ["A,,B", "1,,5", "3,00,0"], + ["Linda van der Berg", "George Pitt-Rivers"], + ["³", "⅕", ""], + ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], + [" ", "\t\r\n ", ""], + ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], + ], +) +@pytest.mark.parametrize("width", [0, 20]) +def test_strings_zfill_tests(data, width): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq(ps.str.zfill(width=width), gs.str.zfill(width=width)) + + gi = cudf.Index(data) + pi = pd.Index(data) + + assert_eq(pi.str.zfill(width=width), gi.str.zfill(width=width)) + + +def test_string_strip_fail(): + gs = cudf.Series(["a", "aa", ""]) + with pytest.raises(TypeError): + gs.str.strip(["a"]) + with pytest.raises(TypeError): + gs.str.lstrip(["a"]) + with pytest.raises(TypeError): + gs.str.rstrip(["a"]) + + +@pytest.mark.parametrize( + "data", + [ + ["koala", "fox", "chameleon"], + ["A,,B", "1,,5", "3,00,0"], + ["Linda van der Berg", "George Pitt-Rivers"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + [ + "this is a regular sentence", + "https://docs.python.org/3/tutorial/index.html", + None, + ], + ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], + ], +) +@pytest.mark.parametrize("width", [0, 20]) +@pytest.mark.parametrize("fillchar", ["⅕", "1", ".", "t", " ", ","]) +def test_strings_filling_tests(data, width, fillchar): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq( + ps.str.center(width=width, fillchar=fillchar), + gs.str.center(width=width, fillchar=fillchar), + ) + assert_eq( + ps.str.ljust(width=width, fillchar=fillchar), + gs.str.ljust(width=width, fillchar=fillchar), + ) + assert_eq( + ps.str.rjust(width=width, fillchar=fillchar), + gs.str.rjust(width=width, fillchar=fillchar), + ) + + gi = cudf.Index(data) + pi = pd.Index(data) + + assert_eq( + pi.str.center(width=width, fillchar=fillchar), + gi.str.center(width=width, fillchar=fillchar), + ) + assert_eq( + pi.str.ljust(width=width, fillchar=fillchar), + gi.str.ljust(width=width, fillchar=fillchar), + ) + assert_eq( + pi.str.rjust(width=width, fillchar=fillchar), + gi.str.rjust(width=width, fillchar=fillchar), + ) + + +@pytest.mark.parametrize("n", [-1, 0, 1, 4]) +@pytest.mark.parametrize("expand", [True, False]) +def test_string_rsplit_re(n, expand): + data = ["a b", " c ", " d", "e ", "f"] + ps = pd.Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") + + # Pandas does not yet support the regex parameter for rsplit + import inspect + + assert ( + "regex" + not in inspect.signature(pd.Series.str.rsplit).parameters.keys() + ) + + expect = ps.str.rsplit(pat=" ", n=n, expand=expand) + got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ["koala", "fox", "chameleon"], + ["A,,B", "1,,5", "3,00,0"], + ["Linda van der Berg", "George Pitt-Rivers"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + [ + "this is a regular sentence", + "https://docs.python.org/3/tutorial/index.html", + None, + ], + ], +) +@pytest.mark.parametrize("n", [-1, 0, 1, 4]) +@pytest.mark.parametrize("expand", [True, False]) +def test_strings_split(data, n, expand): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq( + ps.str.split(n=n, expand=expand).reset_index(), + gs.str.split(n=n, expand=expand).reset_index(), + check_index_type=False, + ) + + assert_eq( + ps.str.split(",", n=n, expand=expand), + gs.str.split(",", n=n, expand=expand), + ) + assert_eq( + ps.str.split("-", n=n, expand=expand), + gs.str.split("-", n=n, expand=expand), + ) + + +@pytest.mark.parametrize( + "data", + [ + ["koala", "fox", "chameleon"], + ["A,,B", "1,,5", "3,00,0"], + ["Linda van der Berg", "George Pitt-Rivers"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + [ + "this is a regular sentence", 
+ "https://docs.python.org/3/tutorial/index.html", + None, + ], + ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], + ], +) +@pytest.mark.parametrize( + "to_strip", ["⅕", None, "123.", ".!? \n\t", "123.!? \n\t", " ", ".", ","] +) +def test_strings_strip_tests(data, to_strip): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq(ps.str.strip(to_strip=to_strip), gs.str.strip(to_strip=to_strip)) + assert_eq( + ps.str.rstrip(to_strip=to_strip), gs.str.rstrip(to_strip=to_strip) + ) + assert_eq( + ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip) + ) + + gi = cudf.Index(data) + pi = pd.Index(data) + + assert_eq(pi.str.strip(to_strip=to_strip), gi.str.strip(to_strip=to_strip)) + assert_eq( + pi.str.rstrip(to_strip=to_strip), gi.str.rstrip(to_strip=to_strip) + ) + assert_eq( + pi.str.lstrip(to_strip=to_strip), gi.str.lstrip(to_strip=to_strip) + ) + + +def test_string_is_title(): + data = [ + "leopard", + "Golden Eagle", + "SNAKE", + "", + "!A", + "hello World", + "A B C", + "#", + "AƻB", + "Ⓑⓖ", + "Art of War", + ] + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq(gs.str.istitle(), ps.str.istitle()) + + +@pytest.mark.parametrize( + "data", + [ + ["koala", "fox", "chameleon"], + ["A,,B", "1,,5", "3,00,0"], + ["Linda van der Berg", "George Pitt-Rivers"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + ], +) +def test_strings_rpartition(data): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq(ps.str.rpartition(), gs.str.rpartition()) + assert_eq(ps.str.rpartition("-"), gs.str.rpartition("-")) + assert_eq(ps.str.rpartition(","), gs.str.rpartition(",")) + + +@pytest.mark.parametrize( + "data", + [ + ["koala", "fox", "chameleon"], + ["A,,B", "1,,5", "3,00,0"], + ["Linda van der Berg", "George Pitt-Rivers"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + ], +) +def test_strings_partition(data): + gs = cudf.Series(data, name="str_name") + ps = pd.Series(data, name="str_name") + + assert_eq(ps.str.partition(), gs.str.partition()) + assert_eq(ps.str.partition(","), gs.str.partition(",")) + assert_eq(ps.str.partition("-"), gs.str.partition("-")) + + gi = cudf.Index(data, name="new name") + pi = pd.Index(data, name="new name") + assert_eq(pi.str.partition(), gi.str.partition()) + assert_eq(pi.str.partition(","), gi.str.partition(",")) + assert_eq(pi.str.partition("-"), gi.str.partition("-")) + + +def test_string_partition_fail(): + gs = cudf.Series(["abc", "aa", "cba"]) + with pytest.raises(TypeError): + gs.str.partition(["a"]) + with pytest.raises(TypeError): + gs.str.rpartition(["a"]) + + +@pytest.mark.parametrize( + "data", + [ + ["koala", "fox", "chameleon"], + ["A,,B", "1,,5", "3,00,0"], + ["Linda van der Berg", "George Pitt-Rivers"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + [ + "this is a regular sentence", + "https://docs.python.org/3/tutorial/index.html", + None, + ], + ], +) +@pytest.mark.parametrize("n", [-1, 2, 9]) +@pytest.mark.parametrize("expand", [True, False]) +def test_strings_rsplit(data, n, expand): + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq( + ps.str.rsplit(n=n, expand=expand).reset_index(), + gs.str.rsplit(n=n, expand=expand).reset_index(), + check_index_type=False, + ) + assert_eq( + ps.str.rsplit(",", n=n, expand=expand), + gs.str.rsplit(",", n=n, expand=expand), + ) + assert_eq( + ps.str.rsplit("-", n=n, expand=expand), + gs.str.rsplit("-", n=n, expand=expand), + ) + + +@pytest.fixture( + params=[ + ["abc", "xyz", "a", "ab", "123", "097"], + ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], + 
["koala", "fox", "chameleon"], + [ + "1234567890", + "de", + "1.75", + "-34", + "+9.8", + "7¼", + "x³", + "2³", + "12⅝", + "", + "\t\r\n ", + ], + ["one", "one1", "1", ""], + ["A B", "1.5", "3,000"], + ["23", "³", "⅕", ""], + [" ", "\t\r\n ", ""], + ["leopard", "Golden Eagle", "SNAKE", ""], + [r"¯\_(ツ)_/¯", "(╯°□°)╯︵ ┻━┻", "┬─┬ノ( º _ ºノ)"], + ["a1", "A1", "a!", "A!", "!1", "aA"], + [ + None, + "The quick bRoWn fox juMps over the laze DOG", + '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', + "accénted", + ], + ] +) +def data_char_types(request): + return request.param + + +@pytest.mark.parametrize( + "type_op", + [ + "isdecimal", + "isalnum", + "isalpha", + "isdigit", + "isnumeric", + "isupper", + "islower", + ], +) +def test_string_char_types(type_op, data_char_types): + gs = cudf.Series(data_char_types) + ps = pd.Series(data_char_types) + + assert_eq(getattr(gs.str, type_op)(), getattr(ps.str, type_op)()) + + +def test_string_filter_alphanum(): + data = ["1234567890", "!@#$%^&*()", ",./<>?;:[]}{|+=", "abc DEF"] + expected = [] + for st in data: + rs = "" + for c in st: + if str.isalnum(c): + rs = rs + c + expected.append(rs) + + gs = cudf.Series(data) + assert_eq(gs.str.filter_alphanum(), cudf.Series(expected)) + + expected = [] + for st in data: + rs = "" + for c in st: + if not str.isalnum(c): + rs = rs + c + expected.append(rs) + assert_eq(gs.str.filter_alphanum(keep=False), cudf.Series(expected)) + + expected = [] + for st in data: + rs = "" + for c in st: + if str.isalnum(c): + rs = rs + c + else: + rs = rs + "*" + expected.append(rs) + assert_eq(gs.str.filter_alphanum("*"), cudf.Series(expected)) + + expected = [] + for st in data: + rs = "" + for c in st: + if not str.isalnum(c): + rs = rs + c + else: + rs = rs + "*" + expected.append(rs) + assert_eq(gs.str.filter_alphanum("*", keep=False), cudf.Series(expected)) + + with pytest.raises(TypeError): + gs.str.filter_alphanum(["a"]) + + +@pytest.mark.parametrize( + "case_op", + [ + "title", + "capitalize", + "lower", + "upper", + "swapcase", + "isdecimal", + "isalnum", + "isalpha", + "isdigit", + "isnumeric", + "isspace", + ], +) +def test_string_char_case(case_op, data_char_types): + gs = cudf.Series(data_char_types) + ps = pd.Series(data_char_types) + assert_eq(getattr(gs.str, case_op)(), getattr(ps.str, case_op)()) + + +def test_string_isempty(data_char_types): + gs = cudf.Series(data_char_types) + ps = pd.Series(data_char_types) + assert_eq(gs.str.isempty(), ps == "") + + +@pytest.mark.parametrize( + "string", + [ + ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], + ["abc", "xyz", "a", "ab", "123", "097"], + ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], + ], +) +@pytest.mark.parametrize("index", [-100, -3, -1, 0, 1, 4, 50]) +def test_string_get(string, index): + pds = pd.Series(string) + gds = cudf.Series(string) + + assert_eq( + pds.str.get(index).fillna(""), + gds.str.get(index).fillna(""), + ) + + +@pytest.mark.parametrize( + "string", + [ + ["abc", "xyz", "a", "ab", "123", "097"], + ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], + ["koala", "fox", "chameleon"], + ], +) +@pytest.mark.parametrize("number", [-10, 0, 1, 3, 10]) +@pytest.mark.parametrize("diff", [0, 3]) +def test_string_slice_str(string, number, diff): + pds = pd.Series(string) + gds = cudf.Series(string) + + assert_eq(pds.str.slice(start=number), gds.str.slice(start=number)) + assert_eq(pds.str.slice(stop=number), gds.str.slice(stop=number)) + assert_eq(pds.str.slice(), gds.str.slice()) + assert_eq( + pds.str.slice(start=number, 
stop=number + diff), + gds.str.slice(start=number, stop=number + diff), + ) + if diff != 0: + assert_eq(pds.str.slice(step=diff), gds.str.slice(step=diff)) + assert_eq( + pds.str.slice(start=number, stop=number + diff, step=diff), + gds.str.slice(start=number, stop=number + diff, step=diff), + ) + + +def test_string_slice_from(): + gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) + d_starts = cudf.Series([2, 3, 0, -1, -1], dtype=np.int32) + d_stops = cudf.Series([-1, -1, 0, -1, -1], dtype=np.int32) + got = gs.str.slice_from(starts=d_starts, stops=d_stops) + expected = cudf.Series(["llo world", "y accéntéd", "", None, ""]) + assert_eq(got, expected) + + +@pytest.mark.parametrize( + "string", + [ + ["abc", "xyz", "a", "ab", "123", "097"], + ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], + ["koala", "fox", "chameleon"], + ], +) +@pytest.mark.parametrize("number", [0, 1, 10]) +@pytest.mark.parametrize("diff", [0, 3]) +@pytest.mark.parametrize("repl", ["2", "!!"]) +def test_string_slice_replace(string, number, diff, repl): + pds = pd.Series(string) + gds = cudf.Series(string) + + assert_eq( + pds.str.slice_replace(start=number, repl=repl), + gds.str.slice_replace(start=number, repl=repl), + check_dtype=False, + ) + assert_eq( + pds.str.slice_replace(stop=number, repl=repl), + gds.str.slice_replace(stop=number, repl=repl), + ) + assert_eq(pds.str.slice_replace(), gds.str.slice_replace()) + assert_eq( + pds.str.slice_replace(start=number, stop=number + diff), + gds.str.slice_replace(start=number, stop=number + diff), + ) + assert_eq( + pds.str.slice_replace(start=number, stop=number + diff, repl=repl), + gds.str.slice_replace(start=number, stop=number + diff, repl=repl), + check_dtype=False, + ) + + +def test_string_slice_replace_fail(): + gs = cudf.Series(["abc", "xyz", ""]) + with pytest.raises(TypeError): + gs.str.slice_replace(0, 1, ["_"]) + + +def test_string_insert(): + gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) + + ps = pd.Series(["hello world", "holy accéntéd", "batman", None, ""]) + + assert_eq(gs.str.insert(0, ""), gs) + assert_eq(gs.str.insert(0, "+"), "+" + ps) + assert_eq(gs.str.insert(-1, "---"), ps + "---") + assert_eq( + gs.str.insert(5, "---"), + ps.str.slice(stop=5) + "---" + ps.str.slice(start=5), + ) + + with pytest.raises(TypeError): + gs.str.insert(0, ["+"]) + + +def test_string_slice(): + df = cudf.DataFrame({"a": ["hello", "world"]}) + pdf = pd.DataFrame({"a": ["hello", "world"]}) + a_slice_got = df.a.str.slice(0, 2) + a_slice_expected = pdf.a.str.slice(0, 2) + + assert isinstance(a_slice_got, cudf.Series) + assert_eq(a_slice_expected, a_slice_got) + + +@pytest.mark.parametrize("pat", [None, "\\s+"]) +@pytest.mark.parametrize("regex", [False, True]) +@pytest.mark.parametrize("expand", [False, True]) +def test_string_split_all_empty(pat, regex, expand): + ps = pd.Series(["", "", "", ""], dtype="str") + gs = cudf.Series(["", "", "", ""], dtype="str") + + expect = ps.str.split(pat=pat, expand=expand, regex=regex) + got = gs.str.split(pat=pat, expand=expand, regex=regex) + + if isinstance(got, cudf.DataFrame): + assert_eq(expect, got, check_column_type=False) + else: + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ["a b", " c ", " d", "e ", "f"], + ["a-b", "-c-", "---d", "e---", "f"], + ["ab", "c", "d", "e", "f"], + [None, None, None, None, None], + ], +) +@pytest.mark.parametrize("pat", [None, " ", "\\-+", "\\s+"]) +@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) 
+@pytest.mark.parametrize("expand", [True, False]) +def test_string_split_re(data, pat, n, expand): + ps = pd.Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") + + expect = ps.str.split(pat=pat, n=n, expand=expand, regex=True) + got = gs.str.split(pat=pat, n=n, expand=expand, regex=True) + + assert_eq(expect, got) + + +def test_string_lower(ps_gs): + ps, gs = ps_gs + + expect = ps.str.lower() + got = gs.str.lower() + + assert_eq(expect, got) + + +def test_string_upper(ps_gs): + ps, gs = ps_gs + + expect = ps.str.upper() + got = gs.str.upper() + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ["a b", " c ", " d", "e ", "f"], + ["a-b", "-c-", "---d", "e---", "f"], + ["ab", "c", "d", "e", "f"], + [None, None, None, None, None], + ], +) +@pytest.mark.parametrize("pat", [None, " ", "-"]) +@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) +@pytest.mark.parametrize("expand", [True, False]) +def test_string_split(data, pat, n, expand): + ps = pd.Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") + + expect = ps.str.split(pat=pat, n=n, expand=expand) + got = gs.str.split(pat=pat, n=n, expand=expand) + + assert_eq(expect, got) + + +# Pandas doesn't respect the `n` parameter so ignoring it in test parameters +@pytest.mark.parametrize( + "pat,regex", + [("a", False), ("f", False), (r"[a-z]", True), (r"[A-Z]", True)], +) +@pytest.mark.parametrize("repl", ["qwerty", "", " "]) +@pytest.mark.parametrize("case,case_raise", [(None, 0), (True, 1), (False, 1)]) +@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (re.U, 1)]) +def test_string_replace( + ps_gs, pat, repl, case, case_raise, flags, flags_raise, regex +): + ps, gs = ps_gs + + expectation = raise_builder([case_raise, flags_raise], NotImplementedError) + + with expectation: + expect = ps.str.replace(pat, repl, case=case, flags=flags, regex=regex) + got = gs.str.replace(pat, repl, case=case, flags=flags, regex=regex) + + assert_eq(expect, got) + + +@pytest.mark.parametrize("pat", ["A*", "F?H?"]) +def test_string_replace_zero_length(ps_gs, pat): + ps, gs = ps_gs + + expect = ps.str.replace(pat, "_", regex=True) + got = gs.str.replace(pat, "_", regex=True) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "pat,regex", + [ + ("a", False), + ("a", True), + ("f", False), + (r"[a-z]", True), + (r"[A-Z]", True), + ("hello", False), + ("FGHI", False), + ], +) +@pytest.mark.parametrize( + "flags,flags_raise", + [(0, 0), (re.MULTILINE | re.DOTALL, 0), (re.I, 1), (re.I | re.DOTALL, 1)], +) +@pytest.mark.parametrize("na,na_raise", [(np.nan, 0), (None, 1), ("", 1)]) +def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): + ps, gs = ps_gs + + expectation = does_not_raise() + if flags_raise or na_raise: + expectation = pytest.raises(NotImplementedError) + + with expectation: + with expect_warning_if( + na == "" or (na is None and not (flags_raise or na_raise)), + match=( + "Allowing a non-bool 'na' in obj.str.contains is deprecated " + "and will raise in a future version." 
+ ),
+        ):
+            expect = ps.str.contains(pat, flags=flags, na=na, regex=regex)
+            got = gs.str.contains(pat, flags=flags, na=na, regex=regex)
+    assert_eq(expect, got)
+
+
+def test_string_contains_case(ps_gs):
+    ps, gs = ps_gs
+    with pytest.raises(NotImplementedError):
+        gs.str.contains("A", case=False)
+    expected = ps.str.contains("A", regex=False, case=False)
+    got = gs.str.contains("A", regex=False, case=False)
+    assert_eq(expected, got)
+    got = gs.str.contains("a", regex=False, case=False)
+    assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    "pat,esc,expect",
+    [
+        ("abc", "", [True, False, False, False, False, False]),
+        ("b%", "/", [False, True, False, False, False, False]),
+        ("%b", ":", [False, True, False, False, False, False]),
+        ("%b%", "*", [True, True, False, False, False, False]),
+        ("___", "", [True, True, True, False, False, False]),
+        ("__/%", "/", [False, False, True, False, False, False]),
+        ("55/____", "/", [False, False, False, True, False, False]),
+        ("%:%%", ":", [False, False, True, False, False, False]),
+        ("55*_100", "*", [False, False, False, True, False, False]),
+        ("abc", "abc", [True, False, False, False, False, False]),
+    ],
+)
+def test_string_like(pat, esc, expect):
+    expectation = does_not_raise()
+    if len(esc) > 1:
+        expectation = pytest.raises(ValueError)
+
+    with expectation:
+        gs = cudf.Series(["abc", "bab", "99%", "55_100", "", "556100"])
+        got = gs.str.like(pat, esc)
+        expect = cudf.Series(expect)
+        assert_eq(expect, got, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "repeats",
+    [
+        2,
+        0,
+        -3,
+        [5, 4, 3, 2, 6],
+        [5, None, 3, 2, 6],
+        [0, 0, 0, 0, 0],
+        [-1, -2, -3, -4, -5],
+        [None, None, None, None, None],
+    ],
+)
+def test_string_repeat(repeats):
+    ps = pd.Series(["hello", "world", None, "", "!"])
+    gs = cudf.from_pandas(ps)
+
+    expect = ps.str.repeat(repeats)
+    got = gs.str.repeat(repeats)
+
+    assert_eq(expect, got)
+
+
+def test_string_cat_str_error():
+    gs = cudf.Series(["a", "v", "s"])
+    # https://github.com/pandas-dev/pandas/issues/28277
+    # the ability to pass StringMethods is being removed in a future version.
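+    # The re.escape'd match text must reproduce the raised message exactly,
+    # so the "np.ndarrary" misspelling below is intentional.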
+ with pytest.raises( + TypeError, + match=re.escape( + "others must be Series, Index, DataFrame, np.ndarrary " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ), + ): + gs.str.cat(gs.str) + + +@pytest.mark.parametrize("sep", ["", " ", ",", "|||"]) +def test_string_join(ps_gs, sep): + ps, gs = ps_gs + + expect = ps.str.join(sep) + got = gs.str.join(sep) + + assert_eq(expect, got) + + +@pytest.mark.parametrize("pat", [r"(a)", r"(f)", r"([a-z])", r"([A-Z])"]) +@pytest.mark.parametrize("expand", [True, False]) +@pytest.mark.parametrize( + "flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)] +) +def test_string_extract(ps_gs, pat, expand, flags, flags_raise): + ps, gs = ps_gs + expectation = raise_builder([flags_raise], NotImplementedError) + + with expectation: + expect = ps.str.extract(pat, flags=flags, expand=expand) + got = gs.str.extract(pat, flags=flags, expand=expand) + + assert_eq(expect, got) + + +def test_string_invalid_regex(): + gs = cudf.Series(["a"]) + with pytest.raises(RuntimeError): + gs.str.extract(r"{\}") + + +def _cat_convert_seq_to_cudf(others): + pd_others = others + if isinstance(pd_others, (pd.Series, pd.Index)): + gd_others = cudf.from_pandas(pd_others) + else: + gd_others = pd_others + if isinstance(gd_others, (list, tuple)): + temp_tuple = [ + cudf.from_pandas(elem) + if isinstance(elem, (pd.Series, pd.Index)) + else elem + for elem in gd_others + ] + + if isinstance(gd_others, tuple): + gd_others = tuple(temp_tuple) + else: + gd_others = list(temp_tuple) + return gd_others + + +@pytest.mark.parametrize( + "data", + [["a", None, "c", None, "e"], ["a", "b", "c", "d", "a"]], +) +@pytest.mark.parametrize( + "others", + [ + None, + ["f", "g", "h", "i", "j"], + pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), + pd.Index(["f", "g", "h", "i", "j"]), + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), + [ + np.array(["f", "g", "h", "i", "j"]), + np.array(["f", "g", "h", "i", "j"]), + ], + [ + pd.Series(["f", "g", "h", "i", "j"]), + pd.Series(["f", "g", "h", "i", "j"]), + ], + pytest.param( + [ + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "g", "h", "i", "j"]), + ], + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/5862" + ), + ), + pytest.param( + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["1", "2", "3", "4", "5"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ), + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/33436" + ), + ), + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=["a", "b", "c", "d", "e"], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=["a", "b", "c", "d", "e"], + ), + ], + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=[10, 11, 12, 13, 14], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=[10, 15, 11, 13, 14], + ), + ], + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=["1", "2", "3", "4", "5"], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=["1", "2", "3", "4", "5"], + ), + ], + ], +) +@pytest.mark.parametrize("sep", [None, "", " ", ",", "|||"]) +@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) +@pytest.mark.parametrize("name", [None, "This is the name"]) +def 
test_string_index_duplicate_str_cat(data, others, sep, na_rep, name): + pi, gi = pd.Index(data, name=name), cudf.Index(data, name=name) + + pd_others = others + gd_others = _cat_convert_seq_to_cudf(others) + + got = gi.str.cat(others=gd_others, sep=sep, na_rep=na_rep) + expect = pi.str.cat(others=pd_others, sep=sep, na_rep=na_rep) + + # TODO: Remove got.sort_values call once we have `join` param support + # in `.str.cat` + # https://github.com/rapidsai/cudf/issues/5862 + + assert_eq( + expect.sort_values() if not isinstance(expect, str) else expect, + got.sort_values() if not isinstance(got, str) else got, + exact=False, + ) + + +@pytest.mark.parametrize( + "others", + [ + None, + ["f", "g", "h", "i", "j"], + ("f", "g", "h", "i", "j"), + pd.Series(["f", "g", "h", "i", "j"]), + pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), + pd.Index(["f", "g", "h", "i", "j"]), + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), + ( + np.array(["f", "g", "h", "i", "j"]), + np.array(["f", "g", "h", "i", "j"]), + ), + [ + np.array(["f", "g", "h", "i", "j"]), + np.array(["f", "g", "h", "i", "j"]), + ], + [ + pd.Series(["f", "g", "h", "i", "j"]), + pd.Series(["f", "g", "h", "i", "j"]), + ], + ( + pd.Series(["f", "g", "h", "i", "j"]), + pd.Series(["f", "g", "h", "i", "j"]), + ), + [ + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "g", "h", "i", "j"]), + ], + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "g", "h", "i", "j"]), + ), + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["1", "2", "3", "4", "5"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ), + [ + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ], + [ + pd.Series(["hello", "world", "abc", "xyz", "pqr"]), + pd.Series(["abc", "xyz", "hello", "pqr", "world"]), + ], + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=[10, 11, 12, 13, 14], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=[10, 15, 11, 13, 14], + ), + ], + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=["10", "11", "12", "13", "14"], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=["10", "11", "12", "13", "14"], + ), + ], + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=["10", "11", "12", "13", "14"], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=["10", "15", "11", "13", "14"], + ), + ], + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=["1", "2", "3", "4", "5"], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=["10", "11", "12", "13", "14"], + ), + ], + ], +) +@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) +@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) +@pytest.mark.parametrize( + "index", + [["1", "2", "3", "4", "5"]], +) +def test_string_cat(ps_gs, others, sep, na_rep, index): + ps, gs = ps_gs + + pd_others = others + gd_others = _cat_convert_seq_to_cudf(others) + + expect = ps.str.cat(others=pd_others, sep=sep, na_rep=na_rep) + got = gs.str.cat(others=gd_others, sep=sep, na_rep=na_rep) + assert_eq(expect, got) + + ps.index = index 
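+    # Mirror the same custom index onto the cudf series so the following
+    # cat(others=<index>) calls compare pandas and cudf like-for-like.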
+ gs.index = index + + expect = ps.str.cat(others=ps.index, sep=sep, na_rep=na_rep) + got = gs.str.cat(others=gs.index, sep=sep, na_rep=na_rep) + + assert_eq(expect, got) + + expect = ps.str.cat(others=[ps.index, ps.index], sep=sep, na_rep=na_rep) + got = gs.str.cat(others=[gs.index, gs.index], sep=sep, na_rep=na_rep) + + assert_eq(expect, got) + + expect = ps.str.cat(others=(ps.index, ps.index), sep=sep, na_rep=na_rep) + got = gs.str.cat(others=(gs.index, gs.index), sep=sep, na_rep=na_rep) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ["1", "2", "3", "4", "5"], + ["a", "b", "c", "d", "e"], + ["a", "b", "c", None, "e"], + ], +) +@pytest.mark.parametrize( + "others", + [ + None, + ["f", "g", "h", "i", "j"], + ("f", "g", "h", "i", "j"), + pd.Series(["f", "g", "h", "i", "j"]), + pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), + pd.Index(["f", "g", "h", "i", "j"]), + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), + ( + np.array(["f", "g", "h", "i", "j"]), + np.array(["f", "g", "h", "i", "j"]), + ), + [ + np.array(["f", "g", "h", "i", "j"]), + np.array(["f", "g", "h", "i", "j"]), + ], + [ + pd.Series(["f", "g", "h", "i", "j"]), + pd.Series(["f", "g", "h", "i", "j"]), + ], + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["1", "2", "3", "4", "5"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ), + [ + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ], + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=["a", "b", "c", "d", "e"], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=["a", "b", "c", "d", "e"], + ), + ], + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=[10, 11, 12, 13, 14], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=[10, 15, 11, 13, 14], + ), + ], + [ + pd.Series( + ["hello", "world", "abc", "xyz", "pqr"], + index=["1", "2", "3", "4", "5"], + ), + pd.Series( + ["abc", "xyz", "hello", "pqr", "world"], + index=["1", "2", "3", "4", "5"], + ), + ], + ], +) +@pytest.mark.parametrize("sep", [None, "", " ", "|", "|||"]) +@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) +@pytest.mark.parametrize("name", [None, "This is the name"]) +def test_string_index_str_cat(data, others, sep, na_rep, name): + pi, gi = pd.Index(data, name=name), cudf.Index(data, name=name) + + pd_others = others + gd_others = _cat_convert_seq_to_cudf(others) + + expect = pi.str.cat(others=pd_others, sep=sep, na_rep=na_rep) + got = gi.str.cat(others=gd_others, sep=sep, na_rep=na_rep) + + assert_eq( + expect, + got, + exact=False, + ) + + +def test_string_len(ps_gs): + ps, gs = ps_gs + + expect = ps.str.len() + got = gs.str.len() + + # Can't handle nulls in Pandas so use PyArrow instead + # Pandas will return as a float64 so need to typecast to int32 + expect = pa.array(expect, from_pandas=True).cast(pa.int32()) + got = got.to_arrow() + assert pa.Array.equals(expect, got) + + +def test_string_concat(): + data1 = ["a", "b", "c", "d", "e"] + data2 = ["f", "g", "h", "i", "j"] + index = [1, 2, 3, 4, 5] + + ps1 = pd.Series(data1, index=index) + ps2 = 
pd.Series(data2, index=index) + gs1 = cudf.Series(data1, index=index) + gs2 = cudf.Series(data2, index=index) + + expect = pd.concat([ps1, ps2]) + got = concat([gs1, gs2]) + + assert_eq(expect, got) + + expect = ps1.str.cat(ps2) + got = gs1.str.cat(gs2) + + assert_eq(expect, got) + + +@pytest.mark.parametrize("name", [None, "new name", 123]) +def test_string_misc_name(ps_gs, name): + ps, gs = ps_gs + ps.name = name + gs.name = name + + expect = ps.str.slice(0, 1) + got = gs.str.slice(0, 1) + + assert_eq(expect, got) + assert_eq(ps + ps, gs + gs) + assert_eq(ps + "RAPIDS", gs + "RAPIDS") + assert_eq("RAPIDS" + ps, "RAPIDS" + gs) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 7a66b2bbfcf..4b9e47ed275 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1,9 +1,5 @@ # Copyright (c) 2018-2025, NVIDIA CORPORATION. -import json -import re -import urllib.parse -from contextlib import nullcontext as does_not_raise from decimal import Decimal from sys import getsizeof @@ -16,7 +12,6 @@ import rmm import cudf -from cudf import concat from cudf.core.buffer import as_buffer from cudf.core.column.string import StringColumn from cudf.core.index import Index @@ -25,18 +20,10 @@ DATETIME_TYPES, NUMERIC_TYPES, assert_exceptions_equal, - expect_warning_if, ) from cudf.utils import dtypes as dtypeutils -def raise_builder(flags, exceptions): - if any(flags): - return pytest.raises(exceptions) - else: - return does_not_raise() - - @pytest.fixture( params=[ ["AbC", "de", "FGHI", "j", "kLm"], @@ -347,27 +334,6 @@ def test_string_empty_numeric_astype(dtype): assert_eq(expect, got) -def test_string_concat(): - data1 = ["a", "b", "c", "d", "e"] - data2 = ["f", "g", "h", "i", "j"] - index = [1, 2, 3, 4, 5] - - ps1 = pd.Series(data1, index=index) - ps2 = pd.Series(data2, index=index) - gs1 = cudf.Series(data1, index=index) - gs2 = cudf.Series(data2, index=index) - - expect = pd.concat([ps1, ps2]) - got = concat([gs1, gs2]) - - assert_eq(expect, got) - - expect = ps1.str.cat(ps2) - got = gs1.str.cat(gs2) - - assert_eq(expect, got) - - @pytest.mark.parametrize("ascending", [True, False]) def test_string_sort(ps_gs, ascending): ps, gs = ps_gs @@ -378,635 +344,6 @@ def test_string_sort(ps_gs, ascending): assert_eq(expect, got) -def test_string_len(ps_gs): - ps, gs = ps_gs - - expect = ps.str.len() - got = gs.str.len() - - # Can't handle nulls in Pandas so use PyArrow instead - # Pandas will return as a float64 so need to typecast to int32 - expect = pa.array(expect, from_pandas=True).cast(pa.int32()) - got = got.to_arrow() - assert pa.Array.equals(expect, got) - - -def _cat_convert_seq_to_cudf(others): - pd_others = others - if isinstance(pd_others, (pd.Series, pd.Index)): - gd_others = cudf.from_pandas(pd_others) - else: - gd_others = pd_others - if isinstance(gd_others, (list, tuple)): - temp_tuple = [ - cudf.from_pandas(elem) - if isinstance(elem, (pd.Series, pd.Index)) - else elem - for elem in gd_others - ] - - if isinstance(gd_others, tuple): - gd_others = tuple(temp_tuple) - else: - gd_others = list(temp_tuple) - return gd_others - - -@pytest.mark.parametrize( - "others", - [ - None, - ["f", "g", "h", "i", "j"], - ("f", "g", "h", "i", "j"), - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pd.Index(["f", "g", "h", "i", "j"]), - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - ( - np.array(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ), - [ - np.array(["f", 
"g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - [ - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["f", "g", "h", "i", "j"]), - ], - ( - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["f", "g", "h", "i", "j"]), - ), - [ - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ), - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ], - [ - pd.Series(["hello", "world", "abc", "xyz", "pqr"]), - pd.Series(["abc", "xyz", "hello", "pqr", "world"]), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=[10, 11, 12, 13, 14], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=[10, 15, 11, 13, 14], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["10", "11", "12", "13", "14"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["10", "11", "12", "13", "14"], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["10", "11", "12", "13", "14"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["10", "15", "11", "13", "14"], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["1", "2", "3", "4", "5"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["10", "11", "12", "13", "14"], - ), - ], - ], -) -@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) -@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) -@pytest.mark.parametrize( - "index", - [["1", "2", "3", "4", "5"]], -) -def test_string_cat(ps_gs, others, sep, na_rep, index): - ps, gs = ps_gs - - pd_others = others - gd_others = _cat_convert_seq_to_cudf(others) - - expect = ps.str.cat(others=pd_others, sep=sep, na_rep=na_rep) - got = gs.str.cat(others=gd_others, sep=sep, na_rep=na_rep) - assert_eq(expect, got) - - ps.index = index - gs.index = index - - expect = ps.str.cat(others=ps.index, sep=sep, na_rep=na_rep) - got = gs.str.cat(others=gs.index, sep=sep, na_rep=na_rep) - - assert_eq(expect, got) - - expect = ps.str.cat(others=[ps.index, ps.index], sep=sep, na_rep=na_rep) - got = gs.str.cat(others=[gs.index, gs.index], sep=sep, na_rep=na_rep) - - assert_eq(expect, got) - - expect = ps.str.cat(others=(ps.index, ps.index), sep=sep, na_rep=na_rep) - got = gs.str.cat(others=(gs.index, gs.index), sep=sep, na_rep=na_rep) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["1", "2", "3", "4", "5"], - ["a", "b", "c", "d", "e"], - ["a", "b", "c", None, "e"], - ], -) -@pytest.mark.parametrize( - "others", - [ - None, - ["f", "g", "h", "i", "j"], - ("f", "g", "h", "i", "j"), - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pd.Index(["f", "g", "h", "i", "j"]), - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - ( - np.array(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), 
- ), - [ - np.array(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - [ - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["f", "g", "h", "i", "j"]), - ], - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["a", "b", "c", "d", "e"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["a", "b", "c", "d", "e"], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=[10, 11, 12, 13, 14], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=[10, 15, 11, 13, 14], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["1", "2", "3", "4", "5"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["1", "2", "3", "4", "5"], - ), - ], - ], -) -@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) -@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) -@pytest.mark.parametrize("name", [None, "This is the name"]) -def test_string_index_str_cat(data, others, sep, na_rep, name): - pi, gi = pd.Index(data, name=name), cudf.Index(data, name=name) - - pd_others = others - gd_others = _cat_convert_seq_to_cudf(others) - - expect = pi.str.cat(others=pd_others, sep=sep, na_rep=na_rep) - got = gi.str.cat(others=gd_others, sep=sep, na_rep=na_rep) - - assert_eq( - expect, - got, - exact=False, - ) - - -@pytest.mark.parametrize( - "data", - [["a", None, "c", None, "e"], ["a", "b", "c", "d", "a"]], -) -@pytest.mark.parametrize( - "others", - [ - None, - ["f", "g", "h", "i", "j"], - pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pd.Index(["f", "g", "h", "i", "j"]), - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - [ - np.array(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - [ - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["f", "g", "h", "i", "j"]), - ], - pytest.param( - [ - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/5862" - ), - ), - pytest.param( - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["a", "b", "c", "d", "e"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["a", "b", "c", "d", "e"], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=[10, 11, 12, 13, 14], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=[10, 15, 11, 13, 14], - ), - ], - [ - pd.Series( - ["hello", "world", 
"abc", "xyz", "pqr"], - index=["1", "2", "3", "4", "5"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["1", "2", "3", "4", "5"], - ), - ], - ], -) -@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) -@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) -@pytest.mark.parametrize("name", [None, "This is the name"]) -def test_string_index_duplicate_str_cat(data, others, sep, na_rep, name): - pi, gi = pd.Index(data, name=name), cudf.Index(data, name=name) - - pd_others = others - gd_others = _cat_convert_seq_to_cudf(others) - - got = gi.str.cat(others=gd_others, sep=sep, na_rep=na_rep) - expect = pi.str.cat(others=pd_others, sep=sep, na_rep=na_rep) - - # TODO: Remove got.sort_values call once we have `join` param support - # in `.str.cat` - # https://github.com/rapidsai/cudf/issues/5862 - - assert_eq( - expect.sort_values() if not isinstance(expect, str) else expect, - got.sort_values() if not isinstance(got, str) else got, - exact=False, - ) - - -def test_string_cat_str_error(): - gs = cudf.Series(["a", "v", "s"]) - # https://github.com/pandas-dev/pandas/issues/28277 - # ability to pass StringMethods is being removed in future. - with pytest.raises( - TypeError, - match=re.escape( - "others must be Series, Index, DataFrame, np.ndarrary " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ), - ): - gs.str.cat(gs.str) - - -@pytest.mark.parametrize("sep", ["", " ", "|", ",", "|||"]) -def test_string_join(ps_gs, sep): - ps, gs = ps_gs - - expect = ps.str.join(sep) - got = gs.str.join(sep) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("pat", [r"(a)", r"(f)", r"([a-z])", r"([A-Z])"]) -@pytest.mark.parametrize("expand", [True, False]) -@pytest.mark.parametrize( - "flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)] -) -def test_string_extract(ps_gs, pat, expand, flags, flags_raise): - ps, gs = ps_gs - expectation = raise_builder([flags_raise], NotImplementedError) - - with expectation: - expect = ps.str.extract(pat, flags=flags, expand=expand) - got = gs.str.extract(pat, flags=flags, expand=expand) - - assert_eq(expect, got) - - -def test_string_invalid_regex(): - gs = cudf.Series(["a"]) - with pytest.raises(RuntimeError): - gs.str.extract(r"{\}") - - -@pytest.mark.parametrize( - "pat,regex", - [ - ("a", False), - ("a", True), - ("f", False), - (r"[a-z]", True), - (r"[A-Z]", True), - ("hello", False), - ("FGHI", False), - ], -) -@pytest.mark.parametrize( - "flags,flags_raise", - [(0, 0), (re.MULTILINE | re.DOTALL, 0), (re.I, 1), (re.I | re.DOTALL, 1)], -) -@pytest.mark.parametrize("na,na_raise", [(np.nan, 0), (None, 1), ("", 1)]) -def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): - ps, gs = ps_gs - - expectation = does_not_raise() - if flags_raise or na_raise: - expectation = pytest.raises(NotImplementedError) - - with expectation: - with expect_warning_if( - na == "" or (na is None and not (flags_raise or na_raise)), - match=( - "Allowing a non-bool 'na' in obj.str.contains is deprecated " - "and will raise in a future version." 
- ), - ): - expect = ps.str.contains(pat, flags=flags, na=na, regex=regex) - got = gs.str.contains(pat, flags=flags, na=na, regex=regex) - assert_eq(expect, got) - - -def test_string_contains_case(ps_gs): - ps, gs = ps_gs - with pytest.raises(NotImplementedError): - gs.str.contains("A", case=False) - expected = ps.str.contains("A", regex=False, case=False) - got = gs.str.contains("A", regex=False, case=False) - assert_eq(expected, got) - got = gs.str.contains("a", regex=False, case=False) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "pat,esc,expect", - [ - ("abc", "", [True, False, False, False, False, False]), - ("b%", "/", [False, True, False, False, False, False]), - ("%b", ":", [False, True, False, False, False, False]), - ("%b%", "*", [True, True, False, False, False, False]), - ("___", "", [True, True, True, False, False, False]), - ("__/%", "/", [False, False, True, False, False, False]), - ("55/____", "/", [False, False, False, True, False, False]), - ("%:%%", ":", [False, False, True, False, False, False]), - ("55*_100", "*", [False, False, False, True, False, False]), - ("abc", "abc", [True, False, False, False, False, False]), - ], -) -def test_string_like(pat, esc, expect): - expectation = does_not_raise() - if len(esc) > 1: - expectation = pytest.raises(ValueError) - - with expectation: - gs = cudf.Series(["abc", "bab", "99%", "55_100", "", "556100"]) - got = gs.str.like(pat, esc) - expect = cudf.Series(expect) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [["hello", "world", None, "", "!"]], -) -@pytest.mark.parametrize( - "repeats", - [ - 2, - 0, - -3, - [5, 4, 3, 2, 6], - [5, None, 3, 2, 6], - [0, 0, 0, 0, 0], - [-1, -2, -3, -4, -5], - [None, None, None, None, None], - ], -) -def test_string_repeat(data, repeats): - ps = pd.Series(data) - gs = cudf.from_pandas(ps) - - expect = ps.str.repeat(repeats) - got = gs.str.repeat(repeats) - - assert_eq(expect, got) - - -# Pandas doesn't respect the `n` parameter so ignoring it in test parameters -@pytest.mark.parametrize( - "pat,regex", - [("a", False), ("f", False), (r"[a-z]", True), (r"[A-Z]", True)], -) -@pytest.mark.parametrize("repl", ["qwerty", "", " "]) -@pytest.mark.parametrize("case,case_raise", [(None, 0), (True, 1), (False, 1)]) -@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (re.U, 1)]) -def test_string_replace( - ps_gs, pat, repl, case, case_raise, flags, flags_raise, regex -): - ps, gs = ps_gs - - expectation = raise_builder([case_raise, flags_raise], NotImplementedError) - - with expectation: - expect = ps.str.replace(pat, repl, case=case, flags=flags, regex=regex) - got = gs.str.replace(pat, repl, case=case, flags=flags, regex=regex) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("pat", ["A*", "F?H?"]) -def test_string_replace_zero_length(ps_gs, pat): - ps, gs = ps_gs - - expect = ps.str.replace(pat, "_", regex=True) - got = gs.str.replace(pat, "_", regex=True) - - assert_eq(expect, got) - - -def test_string_lower(ps_gs): - ps, gs = ps_gs - - expect = ps.str.lower() - got = gs.str.lower() - - assert_eq(expect, got) - - -def test_string_upper(ps_gs): - ps, gs = ps_gs - - expect = ps.str.upper() - got = gs.str.upper() - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["a b", " c ", " d", "e ", "f"], - ["a-b", "-c-", "---d", "e---", "f"], - ["ab", "c", "d", "e", "f"], - [None, None, None, None, None], - ], -) -@pytest.mark.parametrize("pat", [None, " ", "-"]) -@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) 
-@pytest.mark.parametrize("expand", [True, False]) -def test_string_split(data, pat, n, expand): - ps = pd.Series(data, dtype="str") - gs = cudf.Series(data, dtype="str") - - expect = ps.str.split(pat=pat, n=n, expand=expand) - got = gs.str.split(pat=pat, n=n, expand=expand) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["a b", " c ", " d", "e ", "f"], - ["a-b", "-c-", "---d", "e---", "f"], - ["ab", "c", "d", "e", "f"], - [None, None, None, None, None], - ], -) -@pytest.mark.parametrize("pat", [None, " ", "\\-+", "\\s+"]) -@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) -@pytest.mark.parametrize("expand", [True, False]) -def test_string_split_re(data, pat, n, expand): - ps = pd.Series(data, dtype="str") - gs = cudf.Series(data, dtype="str") - - expect = ps.str.split(pat=pat, n=n, expand=expand, regex=True) - got = gs.str.split(pat=pat, n=n, expand=expand, regex=True) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("pat", [None, "\\s+"]) -@pytest.mark.parametrize("regex", [False, True]) -@pytest.mark.parametrize("expand", [False, True]) -def test_string_split_all_empty(pat, regex, expand): - ps = pd.Series(["", "", "", ""], dtype="str") - gs = cudf.Series(["", "", "", ""], dtype="str") - - expect = ps.str.split(pat=pat, expand=expand, regex=regex) - got = gs.str.split(pat=pat, expand=expand, regex=regex) - - if isinstance(got, cudf.DataFrame): - assert_eq(expect, got, check_column_type=False) - else: - assert_eq(expect, got) - - @pytest.mark.parametrize( "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] ) @@ -1134,16 +471,6 @@ def test_string_unique(item): assert_eq(pres, gres) -def test_string_slice(): - df = cudf.DataFrame({"a": ["hello", "world"]}) - pdf = pd.DataFrame({"a": ["hello", "world"]}) - a_slice_got = df.a.str.slice(0, 2) - a_slice_expected = pdf.a.str.slice(0, 2) - - assert isinstance(a_slice_got, cudf.Series) - assert_eq(a_slice_expected, a_slice_got) - - def test_string_equality(): data1 = ["b", "c", "d", "a", "c"] data2 = ["a", None, "c", "a", "c"] @@ -1193,21 +520,6 @@ def test_string_binary_op_add(lhs, rhs): assert_eq(pds, gds) -@pytest.mark.parametrize("name", [None, "new name", 123]) -def test_string_misc_name(ps_gs, name): - ps, gs = ps_gs - ps.name = name - gs.name = name - - expect = ps.str.slice(0, 1) - got = gs.str.slice(0, 1) - - assert_eq(expect, got) - assert_eq(ps + ps, gs + gs) - assert_eq(ps + "RAPIDS", gs + "RAPIDS") - assert_eq("RAPIDS" + ps, "RAPIDS" + gs) - - def test_string_no_children_properties(): empty_col = StringColumn( as_buffer(rmm.DeviceBuffer(size=0)), @@ -1224,1378 +536,41 @@ def test_string_no_children_properties(): assert getsizeof(empty_col) >= 0 # Accounts for Python GC overhead -@pytest.mark.parametrize( - "string", - [ - ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], - ["abc", "xyz", "a", "ab", "123", "097"], - ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], - ], -) -@pytest.mark.parametrize( - "index", [-100, -5, -2, -6, -1, 0, 1, 2, 3, 9, 10, 100] -) -def test_string_get(string, index): - pds = pd.Series(string) - gds = cudf.Series(string) +def test_string_table_view_creation(): + data = ["hi"] * 25 + [None] * 2027 + psr = pd.Series(data) + gsr = cudf.Series.from_pandas(psr) - assert_eq( - pds.str.get(index).fillna(""), - gds.str.get(index).fillna(""), - ) + expect = psr[:1] + got = gsr[:1] + + assert_eq(expect, got) @pytest.mark.parametrize( - "string", + "data,dtype", [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["abcdefghij", "0123456789", "9876543210", 
None, "accénted", ""], - ["koala", "fox", "chameleon"], + (["0.1", "10.2", "10.876"], "float"), + (["-0.1", "10.2", "+10.876"], "float"), + (["1", "10.2", "10.876"], "float32"), + (["+123", "6344556789", "0"], "int"), + (["+123", "6344556789", "0"], "uint64"), + (["+123", "6344556789", "0"], "float"), + (["0.1", "-10.2", "10.876", None], "float"), ], ) -@pytest.mark.parametrize( - "number", - [-10, 0, 1, 3, 10], -) -@pytest.mark.parametrize( - "diff", - [0, 2, 5, 9], -) -def test_string_slice_str(string, number, diff): - pds = pd.Series(string) - gds = cudf.Series(string) - - assert_eq(pds.str.slice(start=number), gds.str.slice(start=number)) - assert_eq(pds.str.slice(stop=number), gds.str.slice(stop=number)) - assert_eq(pds.str.slice(), gds.str.slice()) - assert_eq( - pds.str.slice(start=number, stop=number + diff), - gds.str.slice(start=number, stop=number + diff), - ) - if diff != 0: - assert_eq(pds.str.slice(step=diff), gds.str.slice(step=diff)) - assert_eq( - pds.str.slice(start=number, stop=number + diff, step=diff), - gds.str.slice(start=number, stop=number + diff, step=diff), - ) - +@pytest.mark.parametrize("obj_type", [None, "str", "category"]) +def test_string_typecast(data, obj_type, dtype): + psr = pd.Series(data, dtype=obj_type) + gsr = cudf.Series(data, dtype=obj_type) -def test_string_slice_from(): - gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) - d_starts = cudf.Series([2, 3, 0, -1, -1], dtype=np.int32) - d_stops = cudf.Series([-1, -1, 0, -1, -1], dtype=np.int32) - got = gs.str.slice_from(starts=d_starts, stops=d_stops) - expected = cudf.Series(["llo world", "y accéntéd", "", None, ""]) - assert_eq(got, expected) + expect = psr.astype(dtype=dtype) + actual = gsr.astype(dtype=dtype) + assert_eq(expect, actual) @pytest.mark.parametrize( - "string", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], - ["koala", "fox", "chameleon"], - ], -) -@pytest.mark.parametrize("number", [0, 1, 10]) -@pytest.mark.parametrize("diff", [0, 2, 9]) -@pytest.mark.parametrize("repr", ["2", "!!"]) -def test_string_slice_replace(string, number, diff, repr): - pds = pd.Series(string) - gds = cudf.Series(string) - - assert_eq( - pds.str.slice_replace(start=number, repl=repr), - gds.str.slice_replace(start=number, repl=repr), - check_dtype=False, - ) - assert_eq( - pds.str.slice_replace(stop=number, repl=repr), - gds.str.slice_replace(stop=number, repl=repr), - ) - assert_eq(pds.str.slice_replace(), gds.str.slice_replace()) - assert_eq( - pds.str.slice_replace(start=number, stop=number + diff), - gds.str.slice_replace(start=number, stop=number + diff), - ) - assert_eq( - pds.str.slice_replace(start=number, stop=number + diff, repl=repr), - gds.str.slice_replace(start=number, stop=number + diff, repl=repr), - check_dtype=False, - ) - - -def test_string_slice_replace_fail(): - gs = cudf.Series(["abc", "xyz", ""]) - with pytest.raises(TypeError): - gs.str.slice_replace(0, 1, ["_"]) - - -def test_string_insert(): - gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) - - ps = pd.Series(["hello world", "holy accéntéd", "batman", None, ""]) - - assert_eq(gs.str.insert(0, ""), gs) - assert_eq(gs.str.insert(0, "+"), "+" + ps) - assert_eq(gs.str.insert(-1, "---"), ps + "---") - assert_eq( - gs.str.insert(5, "---"), - ps.str.slice(stop=5) + "---" + ps.str.slice(start=5), - ) - - with pytest.raises(TypeError): - gs.str.insert(0, ["+"]) - - -@pytest.fixture( - params=[ - ["abc", "xyz", "a", "ab", "123", 
"097"], - ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], - ["koala", "fox", "chameleon"], - [ - "1234567890", - "de", - "1.75", - "-34", - "+9.8", - "7¼", - "x³", - "2³", - "12⅝", - "", - "\t\r\n ", - ], - ["one", "one1", "1", ""], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["leopard", "Golden Eagle", "SNAKE", ""], - [r"¯\_(ツ)_/¯", "(╯°□°)╯︵ ┻━┻", "┬─┬ノ( º _ ºノ)"], - ["a1", "A1", "a!", "A!", "!1", "aA"], - [ - None, - "The quick bRoWn fox juMps over the laze DOG", - '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', - "accénted", - ], - ] -) -def data_char_types(request): - return request.param - - -@pytest.mark.parametrize( - "type_op", - [ - "isdecimal", - "isalnum", - "isalpha", - "isdigit", - "isnumeric", - "isupper", - "islower", - ], -) -def test_string_char_types(type_op, data_char_types): - gs = cudf.Series(data_char_types) - ps = pd.Series(data_char_types) - - assert_eq(getattr(gs.str, type_op)(), getattr(ps.str, type_op)()) - - -def test_string_filter_alphanum(): - data = ["1234567890", "!@#$%^&*()", ",./<>?;:[]}{|+=", "abc DEF"] - expected = [] - for st in data: - rs = "" - for c in st: - if str.isalnum(c): - rs = rs + c - expected.append(rs) - - gs = cudf.Series(data) - assert_eq(gs.str.filter_alphanum(), cudf.Series(expected)) - - expected = [] - for st in data: - rs = "" - for c in st: - if not str.isalnum(c): - rs = rs + c - expected.append(rs) - assert_eq(gs.str.filter_alphanum(keep=False), cudf.Series(expected)) - - expected = [] - for st in data: - rs = "" - for c in st: - if str.isalnum(c): - rs = rs + c - else: - rs = rs + "*" - expected.append(rs) - assert_eq(gs.str.filter_alphanum("*"), cudf.Series(expected)) - - expected = [] - for st in data: - rs = "" - for c in st: - if not str.isalnum(c): - rs = rs + c - else: - rs = rs + "*" - expected.append(rs) - assert_eq(gs.str.filter_alphanum("*", keep=False), cudf.Series(expected)) - - with pytest.raises(TypeError): - gs.str.filter_alphanum(["a"]) - - -@pytest.mark.parametrize( - "case_op", ["title", "capitalize", "lower", "upper", "swapcase"] -) -def test_string_char_case(case_op, data_char_types): - gs = cudf.Series(data_char_types) - ps = pd.Series(data_char_types) - - s = gs.str - a = getattr(s, case_op) - - assert_eq(a(), getattr(ps.str, case_op)()) - - assert_eq(gs.str.capitalize(), ps.str.capitalize()) - assert_eq(gs.str.isdecimal(), ps.str.isdecimal()) - assert_eq(gs.str.isalnum(), ps.str.isalnum()) - assert_eq(gs.str.isalpha(), ps.str.isalpha()) - assert_eq(gs.str.isdigit(), ps.str.isdigit()) - assert_eq(gs.str.isnumeric(), ps.str.isnumeric()) - assert_eq(gs.str.isspace(), ps.str.isspace()) - - assert_eq(gs.str.isempty(), ps == "") - - -def test_string_is_title(): - data = [ - "leopard", - "Golden Eagle", - "SNAKE", - "", - "!A", - "hello World", - "A B C", - "#", - "AƻB", - "Ⓑⓖ", - "Art of War", - ] - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(gs.str.istitle(), ps.str.istitle()) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ], -) -def test_strings_rpartition(data): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(ps.str.rpartition(), gs.str.rpartition()) - assert_eq(ps.str.rpartition("-"), gs.str.rpartition("-")) - assert_eq(ps.str.rpartition(","), gs.str.rpartition(",")) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - 
["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ], -) -def test_strings_partition(data): - gs = cudf.Series(data, name="str_name") - ps = pd.Series(data, name="str_name") - - assert_eq(ps.str.partition(), gs.str.partition()) - assert_eq(ps.str.partition(","), gs.str.partition(",")) - assert_eq(ps.str.partition("-"), gs.str.partition("-")) - - gi = cudf.Index(data, name="new name") - pi = pd.Index(data, name="new name") - assert_eq(pi.str.partition(), gi.str.partition()) - assert_eq(pi.str.partition(","), gi.str.partition(",")) - assert_eq(pi.str.partition("-"), gi.str.partition("-")) - - -def test_string_partition_fail(): - gs = cudf.Series(["abc", "aa", "cba"]) - with pytest.raises(TypeError): - gs.str.partition(["a"]) - with pytest.raises(TypeError): - gs.str.rpartition(["a"]) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - [ - "this is a regular sentence", - "https://docs.python.org/3/tutorial/index.html", - None, - ], - ], -) -@pytest.mark.parametrize("n", [-1, 2, 1, 9]) -@pytest.mark.parametrize("expand", [True, False]) -def test_strings_rsplit(data, n, expand): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - ps.str.rsplit(n=n, expand=expand).reset_index(), - gs.str.rsplit(n=n, expand=expand).reset_index(), - check_index_type=False, - ) - assert_eq( - ps.str.rsplit(",", n=n, expand=expand), - gs.str.rsplit(",", n=n, expand=expand), - ) - assert_eq( - ps.str.rsplit("-", n=n, expand=expand), - gs.str.rsplit("-", n=n, expand=expand), - ) - - -@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) -@pytest.mark.parametrize("expand", [True, False]) -def test_string_rsplit_re(n, expand): - data = ["a b", " c ", " d", "e ", "f"] - ps = pd.Series(data, dtype="str") - gs = cudf.Series(data, dtype="str") - - # Pandas does not yet support the regex parameter for rsplit - import inspect - - assert ( - "regex" - not in inspect.signature(pd.Series.str.rsplit).parameters.keys() - ) - - expect = ps.str.rsplit(pat=" ", n=n, expand=expand) - got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - [ - "this is a regular sentence", - "https://docs.python.org/3/tutorial/index.html", - None, - ], - ], -) -@pytest.mark.parametrize("n", [-1, 2, 1, 9]) -@pytest.mark.parametrize("expand", [True, False]) -def test_strings_split(data, n, expand): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - ps.str.split(n=n, expand=expand).reset_index(), - gs.str.split(n=n, expand=expand).reset_index(), - check_index_type=False, - ) - - assert_eq( - ps.str.split(",", n=n, expand=expand), - gs.str.split(",", n=n, expand=expand), - ) - assert_eq( - ps.str.split("-", n=n, expand=expand), - gs.str.split("-", n=n, expand=expand), - ) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - [ - "this is a regular sentence", - "https://docs.python.org/3/tutorial/index.html", - None, - ], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -@pytest.mark.parametrize( - "to_strip", ["⅕", None, "123.", ".!? \n\t", "123.!? 
\n\t", " ", ".", ","] -) -def test_strings_strip_tests(data, to_strip): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(ps.str.strip(to_strip=to_strip), gs.str.strip(to_strip=to_strip)) - assert_eq( - ps.str.rstrip(to_strip=to_strip), gs.str.rstrip(to_strip=to_strip) - ) - assert_eq( - ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip) - ) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq(pi.str.strip(to_strip=to_strip), gi.str.strip(to_strip=to_strip)) - assert_eq( - pi.str.rstrip(to_strip=to_strip), gi.str.rstrip(to_strip=to_strip) - ) - assert_eq( - pi.str.lstrip(to_strip=to_strip), gi.str.lstrip(to_strip=to_strip) - ) - - -def test_string_strip_fail(): - gs = cudf.Series(["a", "aa", ""]) - with pytest.raises(TypeError): - gs.str.strip(["a"]) - with pytest.raises(TypeError): - gs.str.lstrip(["a"]) - with pytest.raises(TypeError): - gs.str.rstrip(["a"]) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - [ - "this is a regular sentence", - "https://docs.python.org/3/tutorial/index.html", - None, - ], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -@pytest.mark.parametrize("width", [0, 1, 4, 9, 100]) -@pytest.mark.parametrize("fillchar", ["⅕", "1", ".", "t", " ", ","]) -def test_strings_filling_tests(data, width, fillchar): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - ps.str.center(width=width, fillchar=fillchar), - gs.str.center(width=width, fillchar=fillchar), - ) - assert_eq( - ps.str.ljust(width=width, fillchar=fillchar), - gs.str.ljust(width=width, fillchar=fillchar), - ) - assert_eq( - ps.str.rjust(width=width, fillchar=fillchar), - gs.str.rjust(width=width, fillchar=fillchar), - ) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq( - pi.str.center(width=width, fillchar=fillchar), - gi.str.center(width=width, fillchar=fillchar), - ) - assert_eq( - pi.str.ljust(width=width, fillchar=fillchar), - gi.str.ljust(width=width, fillchar=fillchar), - ) - assert_eq( - pi.str.rjust(width=width, fillchar=fillchar), - gi.str.rjust(width=width, fillchar=fillchar), - ) - - -@pytest.mark.parametrize( - "data", - [ - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["³", "⅕", ""], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - [" ", "\t\r\n ", ""], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -@pytest.mark.parametrize("width", [0, 1, 4, 6, 9, 100]) -def test_strings_zfill_tests(data, width): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(ps.str.zfill(width=width), gs.str.zfill(width=width)) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq(pi.str.zfill(width=width), gi.str.zfill(width=width)) - - -@pytest.mark.parametrize( - "data", - [ - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["+23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], - ], -) -@pytest.mark.parametrize("width", [0, 1, 4, 9, 100]) -@pytest.mark.parametrize( - "side", - ["left", "right", "both"], -) -@pytest.mark.parametrize("fillchar", [" ", ".", "\n", "+", "\t"]) -def test_strings_pad_tests(data, width, side, fillchar): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - ps.str.pad(width=width, side=side, fillchar=fillchar), - gs.str.pad(width=width, side=side, fillchar=fillchar), - ) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq( - pi.str.pad(width=width, side=side, fillchar=fillchar), - gi.str.pad(width=width, side=side, fillchar=fillchar), - ) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - # [" ", "\t\r\n ", ""], - ["leopard", "Golden Eagle", "SNAKE", ""], - ["line to be wrapped", "another line to be wrapped"], - ], -) -@pytest.mark.parametrize("width", [1, 4, 8, 12, 100]) -def test_string_wrap(data, width): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - gs.str.wrap( - width=width, - break_long_words=False, - expand_tabs=False, - replace_whitespace=True, - drop_whitespace=True, - break_on_hyphens=False, - ), - ps.str.wrap( - width=width, - break_long_words=False, - expand_tabs=False, - replace_whitespace=True, - drop_whitespace=True, - break_on_hyphens=False, - ), - ) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq( - gi.str.wrap( - width=width, - break_long_words=False, - expand_tabs=False, - replace_whitespace=True, - drop_whitespace=True, - break_on_hyphens=False, - ), - pi.str.wrap( - width=width, - break_long_words=False, - expand_tabs=False, - replace_whitespace=True, - drop_whitespace=True, - break_on_hyphens=False, - ), - ) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat", "cat\ndog"], - ["line\nto be wrapped", "another\nline\nto be wrapped"], - ], -) -@pytest.mark.parametrize( - "pat", - ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be", "cat$"], -) -@pytest.mark.parametrize("flags", [0, re.MULTILINE, re.DOTALL]) -def test_string_count(data, pat, flags): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - gs.str.count(pat=pat, flags=flags), - ps.str.count(pat=pat, flags=flags), - check_dtype=False, - ) - assert_eq( - cudf.Index(gs).str.count(pat=pat), - pd.Index(ps).str.count(pat=pat), - exact=False, - ) - - -@pytest.mark.parametrize( - "pat, flags", - [ - ("Monkey", 0), - ("on", 0), - ("b", 0), - ("on$", 0), - ("on$", re.MULTILINE), - ("o.*k", re.DOTALL), - ], -) -def test_string_findall(pat, flags): - test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] - ps = pd.Series(test_data) - gs = cudf.Series(test_data) - - expected = ps.str.findall(pat, flags) - actual = gs.str.findall(pat, flags) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "pat, flags, pos", - [ - ("Monkey", 0, [-1, 0, -1, -1]), - ("on", 0, [2, 1, -1, 1]), - ("bit", 0, [-1, -1, 3, -1]), - ("on$", 0, [2, -1, -1, -1]), - ("on$", re.MULTILINE, [2, -1, -1, 1]), - ("o.*k", re.DOTALL, [-1, 1, -1, 1]), - ], -) -def test_string_find_re(pat, flags, pos): - test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] - gs = cudf.Series(test_data) - - expected = pd.Series(pos, dtype=np.int32) - actual = gs.str.find_re(pat, flags) - assert_eq(expected, actual) - - -def test_string_replace_multi(): - ps = pd.Series(["hello", "goodbye"]) - 
gs = cudf.Series(["hello", "goodbye"]) - expect = ps.str.replace("e", "E").str.replace("o", "O") - got = gs.str.replace(["e", "o"], ["E", "O"]) - - assert_eq(expect, got) - - ps = pd.Series(["foo", "fuz", np.nan]) - gs = cudf.Series.from_pandas(ps) - - expect = ps.str.replace("f.", "ba", regex=True) - got = gs.str.replace(["f."], ["ba"], regex=True) - assert_eq(expect, got) - - ps = pd.Series(["f.o", "fuz", np.nan]) - gs = cudf.Series.from_pandas(ps) - - expect = ps.str.replace("f.", "ba", regex=False) - got = gs.str.replace(["f."], ["ba"], regex=False) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "find", - [ - "(\\d)(\\d)", - "(\\d)(\\d)", - "(\\d)(\\d)", - "(\\d)(\\d)", - "([a-z])-([a-z])", - "([a-z])-([a-zé])", - "([a-z])-([a-z])", - "([a-z])-([a-zé])", - re.compile("([A-Z])(\\d)"), - ], -) -@pytest.mark.parametrize( - "replace", - ["\\1-\\2", "V\\2-\\1", "\\1 \\2", "\\2 \\1", "X\\1+\\2Z", "X\\1+\\2Z"], -) -def test_string_replace_with_backrefs(find, replace): - s = [ - "A543", - "Z756", - "", - None, - "tést-string", - "two-thréé four-fivé", - "abcd-éfgh", - "tést-string-again", - ] - ps = pd.Series(s) - gs = cudf.Series(s) - got = gs.str.replace_with_backrefs(find, replace) - expected = ps.str.replace(find, replace, regex=True) - assert_eq(got, expected) - - got = cudf.Index(gs).str.replace_with_backrefs(find, replace) - expected = pd.Index(ps).str.replace(find, replace, regex=True) - assert_eq(got, expected) - - -def test_string_table_view_creation(): - data = ["hi"] * 25 + [None] * 2027 - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) - - expect = psr[:1] - got = gsr[:1] - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["line to be wrapped", "another line to be wrapped"], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -@pytest.mark.parametrize( - "pat", - ["", None, " ", "a", "abc", "cat", "$", "\n"], -) -def test_string_starts_ends(data, pat): - ps = pd.Series(data) - gs = cudf.Series(data) - - if pat is None: - assert_exceptions_equal( - lfunc=ps.str.startswith, - rfunc=gs.str.startswith, - lfunc_args_and_kwargs=([pat],), - rfunc_args_and_kwargs=([pat],), - ) - assert_exceptions_equal( - lfunc=ps.str.endswith, - rfunc=gs.str.endswith, - lfunc_args_and_kwargs=([pat],), - rfunc_args_and_kwargs=([pat],), - ) - else: - assert_eq( - ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False - ) - assert_eq( - ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False - ) - - -@pytest.mark.parametrize( - "data,pat", - [ - ( - ["abc", "xyz", "a", "ab", "123", "097"], - ("abc", "x", "a", "b", "3", "7"), - ), - (["A B", "1.5", "3,000"], ("A ", ".", ",")), - (["23", "³", "⅕", ""], ("23", "³", "⅕", "")), - ([" ", "\t\r\n ", ""], ("d", "\n ", "")), - ( - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ("$", "$", "a", "<", "(", "#"), - ), - ( - ["line to be wrapped", "another line to be wrapped"], - ("another", "wrapped"), - ), - ( - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ("hsdjfk", "", "ll", "+", "-", "w", "-", "én"), - ), - ( - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ("1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", ""), - ), - ], -) -def test_string_starts_ends_list_like_pat(data, pat): - gs = cudf.Series(data) - - starts_expected = [] - ends_expected = [] - for i in range(len(pat)): - if data[i] is None: - starts_expected.append(None) - ends_expected.append(None) - else: - if pat[i] is None: - starts_expected.append(False) - ends_expected.append(False) - else: - starts_expected.append(data[i].startswith(pat[i])) - ends_expected.append(data[i].endswith(pat[i])) - starts_expected = pd.Series(starts_expected) - ends_expected = pd.Series(ends_expected) - assert_eq(starts_expected, gs.str.startswith(pat), check_dtype=False) - assert_eq(ends_expected, gs.str.endswith(pat), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - ["str_foo", "str_bar", "no_prefix", "", None], - ["foo_str", "bar_str", "no_suffix", "", None], - ], -) -def test_string_remove_suffix_prefix(data): - ps = pd.Series(data) - gs = cudf.Series(data) - - got = gs.str.removeprefix("str_") - expect = ps.str.removeprefix("str_") - assert_eq( - expect, - got, - check_dtype=False, - ) - got = gs.str.removesuffix("_str") - expect = ps.str.removesuffix("_str") - assert_eq( - expect, - got, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["line to be wrapped", "another line to be wrapped"], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -@pytest.mark.parametrize( - "sub", - ["", " ", "a", "abc", "cat", "$", "\n"], -) -def test_string_find(data, sub): - ps = pd.Series(data) - gs = cudf.Series(data) - - got = gs.str.find(sub) - expect = ps.str.find(sub) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.find(sub, start=1) - expect = ps.str.find(sub, start=1) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.find(sub, end=10) - expect = ps.str.find(sub, end=10) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.find(sub, start=2, end=10) - expect = ps.str.find(sub, start=2, end=10) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.rfind(sub) - expect = ps.str.rfind(sub) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.rfind(sub, start=1) - expect = ps.str.rfind(sub, start=1) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.rfind(sub, end=10) - expect = ps.str.rfind(sub, end=10) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.rfind(sub, start=2, end=10) - expect = ps.str.rfind(sub, start=2, end=10) - assert_eq( - expect, - got, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "data,sub,er", - [ - (["abc", "xyz", "a", "ab", "123", "097"], "a", ValueError), - (["A B", "1.5", "3,000"], "abc", ValueError), - (["23", "³", "⅕", ""], "⅕", ValueError), - ([" ", "\t\r\n ", ""], "\n", ValueError), - (["$", "B", "Aab$", "$$ca", "C$B$", "cat"], "$", ValueError), - (["line to be wrapped", "another line to be wrapped"], " ", None), - ( - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - "+", - ValueError, - ), - (["line to be wrapped", "another line to be wrapped"], "", None), - ], -) -def test_string_str_index(data, sub, er): - ps = pd.Series(data) - gs = cudf.Series(data) - - if er is None: - assert_eq(ps.str.index(sub), gs.str.index(sub), check_dtype=False) - - try: - ps.str.index(sub) - except er: 
- pass - else: - assert not er - - try: - gs.str.index(sub) - except er: - pass - else: - assert not er - - -@pytest.mark.parametrize( - "data,sub,er", - [ - (["abc", "xyz", "a", "ab", "123", "097"], "a", ValueError), - (["A B", "1.5", "3,000"], "abc", ValueError), - (["23", "³", "⅕", ""], "⅕", ValueError), - ([" ", "\t\r\n ", ""], "\n", ValueError), - (["$", "B", "Aab$", "$$ca", "C$B$", "cat"], "$", ValueError), - (["line to be wrapped", "another line to be wrapped"], " ", None), - ( - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - "+", - ValueError, - ), - (["line to be wrapped", "another line to be wrapped"], "", None), - ], -) -def test_string_str_rindex(data, sub, er): - ps = pd.Series(data) - gs = cudf.Series(data) - - if er is None: - assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False) - assert_eq( - pd.Index(ps).str.rindex(sub), - cudf.Index(gs).str.rindex(sub), - exact=False, - ) - - try: - ps.str.rindex(sub) - except er: - pass - else: - assert not er - - try: - gs.str.rindex(sub) - except er: - pass - else: - assert not er - - -@pytest.mark.parametrize( - "data,sub,expect", - [ - ( - ["abc", "xyz", "a", "ab", "123", "097"], - ["b", "y", "a", "c", "4", "8"], - [True, True, True, False, False, False], - ), - ( - ["A B", "1.5", "3,000", "23", "³", "⅕"], - ["A B", ".", ",", "1", " ", " "], - [True, True, True, False, False, False], - ), - ( - [" ", "\t", "\r", "\f ", "\n", ""], - ["", "\t", "\r", "xx", "yy", "zz"], - [True, True, True, False, False, False], - ), - ( - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["$", "B", "ab", "*", "@", "dog"], - [True, True, True, False, False, False], - ), - ( - ["hello", "there", "world", "-1234", None, "accént"], - ["lo", "e", "o", "+1234", " ", "e"], - [True, True, True, False, None, False], - ), - ( - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", "", "x", None], - ["A", "B", "C", " ", "y", "e"], - [True, True, True, False, False, None], - ), - ], -) -def test_string_contains_multi(data, sub, expect): - gs = cudf.Series(data) - sub = cudf.Series(sub) - got = gs.str.contains(sub) - expect = cudf.Series(expect) - assert_eq(expect, got, check_dtype=False) - - -# Pandas does not allow 'case' or 'flags' if 'pat' is re.Pattern -# This covers contains, match, count, and replace -@pytest.mark.parametrize( - "pat", - [re.compile("[n-z]"), re.compile("[A-Z]"), re.compile("de"), "A"], -) -@pytest.mark.parametrize("repl", ["xyz", "", " "]) -def test_string_compiled_re(ps_gs, pat, repl): - ps, gs = ps_gs - - expect = ps.str.contains(pat, regex=True) - got = gs.str.contains(pat, regex=True) - assert_eq(expect, got) - - expect = ps.str.match(pat) - got = gs.str.match(pat) - assert_eq(expect, got) - - expect = ps.str.count(pat) - got = gs.str.count(pat) - assert_eq(expect, got, check_dtype=False) - - expect = ps.str.replace(pat, repl, regex=True) - got = gs.str.replace(pat, repl, regex=True) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["line to be wrapped", "another line to be wrapped"], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], - ], -) -@pytest.mark.parametrize("pat", ["", " ", "a", "abc", "cat", "$", "\n"]) -def test_string_str_match(data, pat): - ps = pd.Series(data) - gs = cudf.Series(data) - - assert_eq(ps.str.match(pat), gs.str.match(pat)) - assert_eq( - pd.Index(pd.Index(ps).str.match(pat)), cudf.Index(gs).str.match(pat) - ) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["line to be wrapped", "another line to be wrapped"], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -def test_string_str_translate(data): - ps = pd.Series(data) - gs = cudf.Series(data) - - assert_eq( - ps.str.translate(str.maketrans({"a": "z"})), - gs.str.translate(str.maketrans({"a": "z"})), - ) - assert_eq( - pd.Index(ps).str.translate(str.maketrans({"a": "z"})), - cudf.Index(gs).str.translate(str.maketrans({"a": "z"})), - ) - assert_eq( - ps.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), - gs.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), - ) - assert_eq( - pd.Index(ps).str.translate( - str.maketrans({"a": "z", "i": "$", "z": "1"}) - ), - cudf.Index(gs).str.translate( - str.maketrans({"a": "z", "i": "$", "z": "1"}) - ), - ) - assert_eq( - ps.str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), - gs.str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), - ) - assert_eq( - pd.Index(ps).str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), - cudf.Index(gs).str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), - ) - assert_eq( - ps.str.translate(str.maketrans({"é": "É"})), - gs.str.translate(str.maketrans({"é": "É"})), - ) - - -def test_string_str_filter_characters(): - data = [ - "hello world", - "A+B+C+D", - "?!@#$%^&*()", - "accént", - None, - "$1.50", - "", - ] - gs = cudf.Series(data) - expected = cudf.Series( - ["helloworld", "ABCD", "", "accnt", None, "150", ""] - ) - filter = {"a": "z", "A": "Z", "0": "9"} - assert_eq(expected, gs.str.filter_characters(filter)) - - expected = cudf.Series([" ", "+++", "?!@#$%^&*()", "é", None, "$.", ""]) - assert_eq(expected, gs.str.filter_characters(filter, False)) - - expected = cudf.Series( - ["hello world", "A B C D", " ", "acc nt", None, " 1 50", ""] - ) - assert_eq(expected, gs.str.filter_characters(filter, True, " ")) - - with pytest.raises(TypeError): - gs.str.filter_characters(filter, True, ["a"]) - - -def test_string_str_code_points(): - data = [ - "abc", - "Def", - None, - "jLl", - "dog and cat", - "accénted", - "", - " 1234 ", - "XYZ", - ] - gs = cudf.Series(data) - expected = [ - 97, - 98, - 99, - 68, - 101, - 102, - 106, - 76, - 108, - 100, - 111, - 103, - 32, - 97, - 110, - 100, - 32, - 99, - 97, - 116, - 97, - 99, - 99, - 50089, - 110, - 116, - 101, - 100, - 32, - 49, - 50, - 51, - 52, - 32, - 88, - 89, - 90, - ] - expected = cudf.Series(expected) - - assert_eq(expected, gs.str.code_points(), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - ["http://www.hellow.com", "/home/nvidia/nfs", "123.45 ~ABCDEF"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ], -) -def test_string_str_url_encode(data): - gs = cudf.Series(data) - - got = gs.str.url_encode() - expected = pd.Series([urllib.parse.quote(url, safe="~") for url in data]) - assert_eq(expected, 
got) - - -@pytest.mark.parametrize( - "data", - [ - [ - "http://www.hellow.com?k1=acc%C3%A9nted&k2=a%2F/b.c", - "%2Fhome%2fnfs", - "987%20ZYX", - ] - ], -) -def test_string_str_decode_url(data): - gs = cudf.Series(data) - - got = gs.str.url_decode() - expected = pd.Series([urllib.parse.unquote(url) for url in data]) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data,dtype", - [ - (["0.1", "10.2", "10.876"], "float"), - (["-0.1", "10.2", "+10.876"], "float"), - (["1", "10.2", "10.876"], "float32"), - (["+123", "6344556789", "0"], "int"), - (["+123", "6344556789", "0"], "uint64"), - (["+123", "6344556789", "0"], "float"), - (["0.1", "-10.2", "10.876", None], "float"), - ], -) -@pytest.mark.parametrize("obj_type", [None, "str", "category"]) -def test_string_typecast(data, obj_type, dtype): - psr = pd.Series(data, dtype=obj_type) - gsr = cudf.Series(data, dtype=obj_type) - - expect = psr.astype(dtype=dtype) - actual = gsr.astype(dtype=dtype) - assert_eq(expect, actual) - - -@pytest.mark.parametrize( - "data,dtype", + "data,dtype", [ (["0.1", "10.2", "10.876"], "int"), (["1", "10.2", "+10.876"], "int"), @@ -2636,93 +611,6 @@ def test_string_typecast_error(data, obj_type, dtype): ) -@pytest.mark.parametrize( - "data", - [ - ["f0:18:98:22:c2:e4", "00:00:00:00:00:00", "ff:ff:ff:ff:ff:ff"], - ["f0189822c2e4", "000000000000", "ffffffffffff"], - ["0xf0189822c2e4", "0x000000000000", "0xffffffffffff"], - ["0Xf0189822c2e4", "0X000000000000", "0Xffffffffffff"], - ], -) -def test_string_hex_to_int(data): - gsr = cudf.Series(data) - - expected = cudf.Series([263988422296292, 0, 281474976710655]) - - got = gsr.str.htoi() - assert_eq(expected, got) - - got = gsr.str.hex_to_int() # alias - assert_eq(expected, got) - - -def test_string_ishex(): - gsr = cudf.Series(["", None, "0x01a2b3c4d5e6f", "0789", "ABCDEF0"]) - got = gsr.str.ishex() - expected = cudf.Series([False, None, True, True, True]) - assert_eq(expected, got) - - -def test_string_istimestamp(): - gsr = cudf.Series( - [ - "", - None, - "20201009 123456.987654AM+0100", - "1920111 012345.000001", - "18201235 012345.1", - "20201009 250001.2", - "20201009 129901.3", - "20201009 123499.4", - "20201009 000000.500000PM-0130", - "20201009:000000.600000", - "20201009 010203.700000PM-2500", - "20201009 010203.800000AM+0590", - "20201009 010203.900000AP-0000", - ] - ) - got = gsr.str.istimestamp(r"%Y%m%d %H%M%S.%f%p%z") - expected = cudf.Series( - [ - False, - None, - True, - False, - False, - False, - False, - False, - True, - False, - False, - False, - False, - ] - ) - assert_eq(expected, got) - - -def test_istimestamp_empty(): - gsr = cudf.Series([], dtype="object") - result = gsr.str.istimestamp("%Y%m%d") - expected = cudf.Series([], dtype="bool") - assert_eq(result, expected) - - -def test_string_ip4_to_int(): - gsr = cudf.Series( - ["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"] - ) - expected = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) - - got = gsr.str.ip2int() - assert_eq(expected, got) - - got = gsr.str.ip_to_int() # alias - assert_eq(expected, got) - - def test_string_int_to_ipv4(): gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype( "uint32" @@ -2736,41 +624,6 @@ def test_string_int_to_ipv4(): assert_eq(expected, got) -def test_string_isipv4(): - gsr = cudf.Series( - [ - "", - None, - "1...1", - "141.168.0.1", - "127.0.0.1", - "1.255.0.1", - "256.27.28.26", - "25.257.28.26", - "25.27.258.26", - "25.27.28.256", - "-1.0.0.0", - ] - ) - got = gsr.str.isipv4() - expected = cudf.Series( 
- [ - False, - None, - False, - True, - True, - True, - False, - False, - False, - False, - False, - ] - ) - assert_eq(expected, got) - - @pytest.mark.parametrize( "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"uint32"})) ) @@ -2780,170 +633,6 @@ def test_string_int_to_ipv4_dtype_fail(dtype): gsr._column.int2ip() -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "pqr", "tuv"], - ["aaaaaaaaaaaa"], - ["aaaaaaaaaaaa", "bdfeqwert", "poiuytre"], - ], -) -@pytest.mark.parametrize( - "index", - [ - 0, - 1, - 2, - slice(0, 1, 2), - slice(0, 5, 2), - slice(-1, -2, 1), - slice(-1, -2, -1), - slice(-2, -1, -1), - slice(-2, -1, 1), - slice(0), - slice(None), - ], -) -def test_string_str_subscriptable(data, index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - assert_eq(psr.str[index], gsr.str[index]) - - psi = pd.Index(data) - gsi = cudf.Index(data) - - assert_eq(psi.str[index], gsi.str[index]) - - -@pytest.mark.parametrize( - "data,expected", - [ - (["abc", "xyz", "pqr", "tuv"], [3, 3, 3, 3]), - (["aaaaaaaaaaaa"], [12]), - (["aaaaaaaaaaaa", "bdfeqwert", "poiuytre"], [12, 9, 8]), - (["abc", "d", "ef"], [3, 1, 2]), - (["Hello", "Bye", "Thanks 😊"], [5, 3, 11]), - (["\n\t", "Bye", "Thanks 😊"], [2, 3, 11]), - ], -) -def test_string_str_byte_count(data, expected): - sr = cudf.Series(data) - expected = cudf.Series(expected, dtype="int32") - actual = sr.str.byte_count() - assert_eq(expected, actual) - - si = cudf.Index(data) - expected = cudf.Index(expected, dtype="int32") - actual = si.str.byte_count() - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,expected", - [ - (["1", "2", "3", "4", "5"], [True, True, True, True, True]), - ( - ["1.1", "2.0", "3.2", "4.3", "5."], - [False, False, False, False, False], - ), - ( - [".12312", "213123.", ".3223.", "323423.."], - [False, False, False, False], - ), - ([""], [False]), - ( - ["1..1", "+2", "++3", "4++", "-5"], - [False, True, False, False, True], - ), - ( - [ - "24313345435345 ", - "+2632726478", - "++367293674326", - "4382493264392746.237649274692++", - "-578239479238469264", - ], - [False, True, False, False, True], - ), - ( - ["2a2b", "a+b", "++a", "a.b++", "-b"], - [False, False, False, False, False], - ), - ( - ["2a2b", "1+3", "9.0++a", "+", "-"], - [False, False, False, False, False], - ), - ], -) -def test_str_isinteger(data, expected): - sr = cudf.Series(data, dtype="str") - expected = cudf.Series(expected) - actual = sr.str.isinteger() - assert_eq(expected, actual) - - sr = cudf.Index(data) - expected = cudf.Index(expected) - actual = sr.str.isinteger() - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,expected", - [ - (["1", "2", "3", "4", "5"], [True, True, True, True, True]), - (["1.1", "2.0", "3.2", "4.3", "5."], [True, True, True, True, True]), - ([""], [False]), - ( - [".12312", "213123.", ".3223.", "323423.."], - [True, True, False, False], - ), - ( - ["1.00.323.1", "+2.1", "++3.30", "4.9991++", "-5.3"], - [False, True, False, False, True], - ), - ( - [ - "24313345435345 ", - "+2632726478", - "++367293674326", - "4382493264392746.237649274692++", - "-578239479238469264", - ], - [False, True, False, False, True], - ), - ( - [ - "24313345435345.32732 ", - "+2632726478.3627638276", - "++0.326294632367293674326", - "4382493264392746.237649274692++", - "-57823947923.8469264", - ], - [False, True, False, False, True], - ), - ( - ["2a2b", "a+b", "++a", "a.b++", "-b"], - [False, False, False, False, False], - ), - ( - ["2a2b", "1+3", "9.0++a", "+", "-"], - [False, False, False, False, False], - 
), - ], -) -def test_str_isfloat(data, expected): - sr = cudf.Series(data, dtype="str") - expected = cudf.Series(expected) - actual = sr.str.isfloat() - assert_eq(expected, actual) - - sr = cudf.Index(data) - expected = cudf.Index(expected) - actual = sr.str.isfloat() - assert_eq(expected, actual) - - @pytest.mark.parametrize( "data", [ @@ -3061,473 +750,6 @@ def test_string_slice_with_mask(): assert_eq(actual, expected) -@pytest.mark.parametrize( - "data", - [ - [ - """ - { - "store":{ - "book":[ - { - "category":"reference", - "author":"Nigel Rees", - "title":"Sayings of the Century", - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ], - [ - """ - { - "store":{ - "book":[ - { - "category":"reference", - "author":"Nigel Rees", - "title":"Sayings of the Century", - "price":8.95 - } - ] - } - } - """, - """ - { - "store":{ - "book":[ - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """, - ], - ], -) -def test_string_get_json_object_n(data): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - json.loads(gs.str.get_json_object("$.store")[0]), - ps.apply(lambda x: json.loads(x)["store"])[0], - ) - assert_eq( - json.loads(gs.str.get_json_object("$.store.book")[0]), - ps.apply(lambda x: json.loads(x)["store"]["book"])[0], - ) - assert_eq( - gs.str.get_json_object("$.store.book[0].category"), - ps.apply(lambda x: json.loads(x)["store"]["book"][0]["category"]), - ) - - -@pytest.mark.parametrize( - "json_path", ["$.store", "$.store.book", "$.store.book[*].category", " "] -) -def test_string_get_json_object_empty_json_strings(json_path): - gs = cudf.Series( - [ - """ - { - "":{ - "":[ - { - "":"", - "":"", - "":"" - }, - { - "":"fiction", - "":"", - "title":"" - } - ] - } - } - """ - ] - ) - - got = gs.str.get_json_object(json_path) - expect = cudf.Series([None], dtype="object") - - assert_eq(got, expect) - - -@pytest.mark.parametrize("json_path", ["a", ".", "/.store"]) -def test_string_get_json_object_invalid_JSONPath(json_path): - gs = cudf.Series( - [ - """ - { - "store":{ - "book":[ - { - "category":"reference", - "author":"Nigel Rees", - "title":"Sayings of the Century", - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ] - ) - - with pytest.raises(ValueError): - gs.str.get_json_object(json_path) - - -def test_string_get_json_object_allow_single_quotes(): - gs = cudf.Series( - [ - """ - { - "store":{ - "book":[ - { - 'author':"Nigel Rees", - "title":'Sayings of the Century', - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - 'title':"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ] - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", allow_single_quotes=True - ), - cudf.Series(["Nigel Rees"]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", allow_single_quotes=True - ), - cudf.Series(["['Sayings of the Century',\"Sword of Honour\"]"]), - ) - - assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", allow_single_quotes=False - ), - cudf.Series([None]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", allow_single_quotes=False - ), - cudf.Series([None]), - ) - - -def test_string_get_json_object_strip_quotes_from_single_strings(): - gs = cudf.Series( - [ - """ - { - "store":{ - "book":[ - { - "author":"Nigel Rees", - 
"title":"Sayings of the Century", - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ] - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", strip_quotes_from_single_strings=True - ), - cudf.Series(["Nigel Rees"]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", strip_quotes_from_single_strings=True - ), - cudf.Series(['["Sayings of the Century","Sword of Honour"]']), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", strip_quotes_from_single_strings=False - ), - cudf.Series(['"Nigel Rees"']), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", strip_quotes_from_single_strings=False - ), - cudf.Series(['["Sayings of the Century","Sword of Honour"]']), - ) - - -def test_string_get_json_object_missing_fields_as_nulls(): - gs = cudf.Series( - [ - """ - { - "store":{ - "book":[ - { - "author":"Nigel Rees", - "title":"Sayings of the Century", - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ] - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].category", missing_fields_as_nulls=True - ), - cudf.Series(["null"]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].category", missing_fields_as_nulls=True - ), - cudf.Series(['[null,"fiction"]']), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].category", missing_fields_as_nulls=False - ), - cudf.Series([None]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].category", missing_fields_as_nulls=False - ), - cudf.Series(['["fiction"]']), - ) - - -def test_str_join_lists_error(): - sr = cudf.Series([["a", "a"], ["b"], ["c"]]) - - with pytest.raises( - ValueError, match="sep_na_rep cannot be defined when `sep` is scalar." 
-    ):
-        sr.str.join(sep="-", sep_na_rep="-")
-
-    with pytest.raises(
-        TypeError,
-        match=re.escape(
-            "string_na_rep should be a string scalar, got [10, 20] of type "
-            ": <class 'list'>"
-        ),
-    ):
-        sr.str.join(string_na_rep=[10, 20])
-
-    with pytest.raises(
-        ValueError,
-        match=re.escape(
-            "sep should be of similar size to the series, got: 2, expected: 3"
-        ),
-    ):
-        sr.str.join(sep=["=", "-"])
-
-    with pytest.raises(
-        TypeError,
-        match=re.escape(
-            "sep_na_rep should be a string scalar, got "
-            "['na'] of type: <class 'list'>"
-        ),
-    ):
-        sr.str.join(sep=["-", "+", "."], sep_na_rep=["na"])
-
-    with pytest.raises(
-        TypeError,
-        match=re.escape(
-            "sep should be an str, array-like or Series object, "
-            "found <class 'cudf.core.dataframe.DataFrame'>"
-        ),
-    ):
-        sr.str.join(sep=cudf.DataFrame())
-
-
-@pytest.mark.parametrize(
-    "sr,sep,string_na_rep,sep_na_rep,expected",
-    [
-        (
-            [["a", "a"], ["b"], ["c"]],
-            "-",
-            None,
-            None,
-            ["a-a", "b", "c"],
-        ),
-        (
-            [["a", "b"], [None], [None, "hello", None, "world"]],
-            "__",
-            "=",
-            None,
-            ["a__b", None, "=__hello__=__world"],
-        ),
-        (
-            [
-                ["a", None, "b"],
-                [None],
-                [None, "hello", None, "world"],
-                None,
-            ],
-            ["-", "_", "**", "!"],
-            None,
-            None,
-            ["a--b", None, "**hello****world", None],
-        ),
-        (
-            [
-                ["a", None, "b"],
-                [None],
-                [None, "hello", None, "world"],
-                None,
-            ],
-            ["-", "_", "**", None],
-            "rep_str",
-            "sep_str",
-            ["a-rep_str-b", None, "rep_str**hello**rep_str**world", None],
-        ),
-        (
-            [[None, "a"], [None], None],
-            ["-", "_", None],
-            "rep_str",
-            None,
-            ["rep_str-a", None, None],
-        ),
-        (
-            [[None, "a"], [None], None],
-            ["-", "_", None],
-            None,
-            "sep_str",
-            ["-a", None, None],
-        ),
-    ],
-)
-def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected):
-    sr = cudf.Series(sr)
-    actual = sr.str.join(
-        sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep
-    )
-    expected = cudf.Series(expected)
-    assert_eq(actual, expected)
-
-
-@pytest.mark.parametrize(
-    "patterns, expected",
-    [
-        (
-            lambda: ["a", "s", "g", "i", "o", "r"],
-            [
-                [-1, 0, 5, 3, -1, 2],
-                [-1, -1, -1, -1, 1, -1],
-                [2, 0, -1, -1, -1, 3],
-                [-1, -1, -1, 0, -1, -1],
-            ],
-        ),
-        (
-            lambda: cudf.Series(["a", "string", "g", "inn", "o", "r", "sea"]),
-            [
-                [-1, 0, 5, -1, -1, 2, -1],
-                [-1, -1, -1, -1, 1, -1, -1],
-                [2, -1, -1, -1, -1, 3, 0],
-                [-1, -1, -1, -1, -1, -1, -1],
-            ],
-        ),
-    ],
-)
-def test_str_find_multiple(patterns, expected):
-    s = cudf.Series(["strings", "to", "search", "in"])
-    t = patterns()
-
-    expected = cudf.Series(expected)
-
-    # We convert to pandas because find_multiple returns ListDtype(int32)
-    # and expected is ListDtype(int64).
-    # Currently there is no easy way to type-cast these to match.
-    assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas())
-
-    s = cudf.Index(s)
-    t = cudf.Index(t)
-
-    expected.index = s
-
-    assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas())
-
-
-def test_str_find_multiple_error():
-    s = cudf.Series(["strings", "to", "search", "in"])
-    with pytest.raises(
-        TypeError,
-        match=re.escape(
-            "patterns should be an array-like or a Series object, found "
-            "<class 'str'>"
-        ),
-    ):
-        s.str.find_multiple("a")
-
-    t = cudf.Series([1, 2, 3])
-    with pytest.raises(
-        TypeError,
-        match=re.escape("patterns can only be of 'string' dtype, got: int64"),
-    ):
-        s.str.find_multiple(t)
-
-
-def test_str_iterate_error():
-    s = cudf.Series(["abc", "xyz"])
-    with pytest.raises(TypeError):
-        iter(s.str)
-
-
 def test_string_reduction_error():
     s = cudf.Series([None, None], dtype="str")
     ps = s.to_pandas(nullable=True)
@@ -3544,51 +766,3 @@ def test_string_reduction_error():
         lfunc_args_and_kwargs=([], {"skipna": False}),
         rfunc_args_and_kwargs=([], {"skipna": False}),
     )
-
-
-def test_getitem_out_of_bounds():
-    data = ["123", "12", "1"]
-    pd_ser = pd.Series(data)
-    cudf_ser = cudf.Series(data)
-    expected = pd_ser.str[2]
-    result = cudf_ser.str[2]
-    assert_eq(result, expected)
-
-    expected = pd_ser.str[-2]
-    result = cudf_ser.str[-2]
-    assert_eq(result, expected)
-
-
-@pytest.mark.parametrize("method", ["startswith", "endswith"])
-@pytest.mark.parametrize("pat", [None, (1, 2), pd.Series([1])])
-def test_startsendwith_invalid_pat(method, pat):
-    ser = cudf.Series(["1"])
-    with pytest.raises(TypeError):
-        getattr(ser.str, method)(pat)
-
-
-@pytest.mark.parametrize("method", ["rindex", "index"])
-def test_index_int64_pandas_compat(method):
-    data = ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]
-    with cudf.option_context("mode.pandas_compatible", True):
-        result = getattr(cudf.Series(data).str, method)("E", 4, 8)
-    expected = getattr(pd.Series(data).str, method)("E", 4, 8)
-    assert_eq(result, expected)
-
-
-def test_replace_invalid_scalar_repl():
-    ser = cudf.Series(["1"])
-    with pytest.raises(TypeError):
-        ser.str.replace("1", 2)
-
-
-def test_string_methods_setattr():
-    ser = cudf.Series(["ab", "cd", "ef"])
-    pser = ser.to_pandas()
-
-    assert_exceptions_equal(
-        lfunc=ser.str.__setattr__,
-        rfunc=pser.str.__setattr__,
-        lfunc_args_and_kwargs=(("a", "b"),),
-        rfunc_args_and_kwargs=(("a", "b"),),
-    )

From af5fb305cb06d6e2218f1142165856b325c2cbbe Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 1 Aug 2025 09:54:12 -0700
Subject: [PATCH 042/366] Move (most of) test_timedelta.py and test_struct.py
 to new cudf classic test directory structure (#19551)

Towards https://github.com/rapidsai/cudf/issues/9999
Towards https://github.com/rapidsai/cudf/issues/15723

The rest of the `test_timedelta.py` tests are waiting for reduction and
binops tests to be cleaned up, as in e.g.
https://github.com/rapidsai/cudf/pull/19473

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/19551
---
 python/cudf/cudf/tests/conftest.py | 12 +-
 .../indexing}/__init__.py | 0
 .../tests/dataframe/indexing/test_getitem.py | 10 +
 .../tests/dataframe/methods/test_to_struct.py | 58 ++
 .../tests/{structs => dtypes}/__init__.py | 0
 .../cudf/tests/dtypes/test_structdtype.py | 27 +
 .../cudf/cudf/tests/indexes/index/__init__.py | 0
 .../tests/indexes/index/test_constructor.py | 35 ++
 .../indexes/timedelta/test_components.py | 1 -
.../indexes/timedelta/test_constructing.py | 1 - .../indexes/timedelta/test_conversion.py | 1 - .../tests/indexes/timedeltaindex/__init__.py | 0 .../timedeltaindex/methods/__init__.py | 0 .../methods/test_total_seconds.py | 37 ++ .../indexes/timedeltaindex/test_attributes.py | 100 ++++ .../timedeltaindex/test_constructor.py | 32 + .../cudf/tests/series/accessors/test_dt.py | 95 +++ .../tests/series/accessors/test_struct.py | 68 +++ .../tests/series/indexing/test_getitem.py | 150 +++++ .../cudf/tests/series/indexing/test_iloc.py | 23 + .../tests/series/indexing/test_setitem.py | 45 ++ .../cudf/tests/series/methods/test_astype.py | 213 +++++++ .../cudf/tests/series/methods/test_copy.py | 9 + .../cudf/tests/series/methods/test_fillna.py | 41 ++ .../tests/series/methods/test_memory_usage.py | 23 +- .../tests/series/methods/test_to_numpy.py | 34 ++ .../tests/series/methods/test_to_pandas.py | 43 ++ .../cudf/cudf/tests/series/test_attributes.py | 30 +- .../cudf/tests/series/test_constructors.py | 117 ++++ .../cudf/tests/structs/test_struct_methods.py | 1 - python/cudf/cudf/tests/test_struct.py | 403 ------------- python/cudf/cudf/tests/test_timedelta.py | 561 ------------------ 32 files changed, 1189 insertions(+), 981 deletions(-) rename python/cudf/cudf/tests/{indexes/timedelta => dataframe/indexing}/__init__.py (100%) create mode 100644 python/cudf/cudf/tests/dataframe/indexing/test_getitem.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_to_struct.py rename python/cudf/cudf/tests/{structs => dtypes}/__init__.py (100%) create mode 100644 python/cudf/cudf/tests/dtypes/test_structdtype.py create mode 100644 python/cudf/cudf/tests/indexes/index/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/index/test_constructor.py delete mode 100644 python/cudf/cudf/tests/indexes/timedelta/test_components.py delete mode 100644 python/cudf/cudf/tests/indexes/timedelta/test_constructing.py delete mode 100644 python/cudf/cudf/tests/indexes/timedelta/test_conversion.py create mode 100644 python/cudf/cudf/tests/indexes/timedeltaindex/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/timedeltaindex/methods/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/timedeltaindex/methods/test_total_seconds.py create mode 100644 python/cudf/cudf/tests/indexes/timedeltaindex/test_attributes.py create mode 100644 python/cudf/cudf/tests/indexes/timedeltaindex/test_constructor.py create mode 100644 python/cudf/cudf/tests/series/accessors/test_dt.py create mode 100644 python/cudf/cudf/tests/series/accessors/test_struct.py create mode 100644 python/cudf/cudf/tests/series/indexing/test_getitem.py create mode 100644 python/cudf/cudf/tests/series/indexing/test_iloc.py create mode 100644 python/cudf/cudf/tests/series/methods/test_copy.py create mode 100644 python/cudf/cudf/tests/series/methods/test_to_numpy.py delete mode 100644 python/cudf/cudf/tests/structs/test_struct_methods.py delete mode 100644 python/cudf/cudf/tests/test_struct.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 9d43aeff1fd..008867211f4 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -146,18 +146,14 @@ def pytest_sessionfinish(session, exitstatus): @pytest.fixture(params=[32, 64]) def default_integer_bitwidth(request): - old_default = cudf.get_option("default_integer_bitwidth") - cudf.set_option("default_integer_bitwidth", request.param) - yield request.param - cudf.set_option("default_integer_bitwidth", old_default) + 
with cudf.option_context("default_integer_bitwidth", request.param): + yield request.param @pytest.fixture(params=[32, 64]) def default_float_bitwidth(request): - old_default = cudf.get_option("default_float_bitwidth") - cudf.set_option("default_float_bitwidth", request.param) - yield request.param - cudf.set_option("default_float_bitwidth", old_default) + with cudf.option_context("default_float_bitwidth", request.param): + yield request.param @pytest.hookimpl(tryfirst=True, hookwrapper=True) diff --git a/python/cudf/cudf/tests/indexes/timedelta/__init__.py b/python/cudf/cudf/tests/dataframe/indexing/__init__.py similarity index 100% rename from python/cudf/cudf/tests/indexes/timedelta/__init__.py rename to python/cudf/cudf/tests/dataframe/indexing/__init__.py diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_getitem.py b/python/cudf/cudf/tests/dataframe/indexing/test_getitem.py new file mode 100644 index 00000000000..6453a4abca1 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/indexing/test_getitem.py @@ -0,0 +1,10 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import cudf +from cudf.testing import assert_eq + + +def test_struct_of_struct_loc(): + df = cudf.DataFrame({"col": [{"a": {"b": 1}}]}) + expect = cudf.Series([{"a": {"b": 1}}], name="col") + assert_eq(expect, df["col"]) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_struct.py b/python/cudf/cudf/tests/dataframe/methods/test_to_struct.py new file mode 100644 index 00000000000..e6a65ad5823 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_struct.py @@ -0,0 +1,58 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_struct_with_datetime_and_timedelta(temporal_types_as_str): + df = cudf.DataFrame( + { + "a": [12, 232, 2334], + "datetime": cudf.Series( + [23432, 3432423, 324324], dtype=temporal_types_as_str + ), + } + ) + series = df.to_struct() + a_array = np.array([12, 232, 2334]) + datetime_array = np.array([23432, 3432423, 324324]).astype( + temporal_types_as_str + ) + + actual = series.to_pandas() + values_list = [] + for i, val in enumerate(a_array): + values_list.append({"a": val, "datetime": datetime_array[i]}) + + expected = pd.Series(values_list) + assert_eq(expected, actual) + + +def test_dataframe_to_struct(): + df = cudf.DataFrame() + expect = cudf.Series(dtype=cudf.StructDtype({})) + got = df.to_struct() + assert_eq(expect, got) + + df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) + expect = cudf.Series( + [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 3, "b": "z"}] + ) + got = df.to_struct() + assert_eq(expect, got) + + # check that a copy was made: + df["a"][0] = 5 + assert_eq(got, expect) + + # check that a non-string (but convertible to string) named column can be + # converted to struct + df = cudf.DataFrame([[1, 2], [3, 4]], columns=[(1, "b"), 0]) + expect = cudf.Series([{"(1, 'b')": 1, "0": 2}, {"(1, 'b')": 3, "0": 4}]) + with pytest.warns(UserWarning, match="will be casted"): + got = df.to_struct() + assert_eq(got, expect) diff --git a/python/cudf/cudf/tests/structs/__init__.py b/python/cudf/cudf/tests/dtypes/__init__.py similarity index 100% rename from python/cudf/cudf/tests/structs/__init__.py rename to python/cudf/cudf/tests/dtypes/__init__.py diff --git a/python/cudf/cudf/tests/dtypes/test_structdtype.py b/python/cudf/cudf/tests/dtypes/test_structdtype.py new file mode 100644 index 00000000000..5ccb4640b76 --- /dev/null +++ 
b/python/cudf/cudf/tests/dtypes/test_structdtype.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf + + +@pytest.mark.parametrize( + "fields", + [ + {"a": np.dtype(np.int64)}, + {"a": np.dtype(np.int64), "b": None}, + { + "a": cudf.ListDtype(np.dtype(np.int64)), + "b": cudf.Decimal64Dtype(1, 0), + }, + { + "a": cudf.ListDtype(cudf.StructDtype({"b": np.dtype(np.int64)})), + "b": cudf.ListDtype(cudf.ListDtype(np.dtype(np.int64))), + }, + ], +) +def test_serialize_struct_dtype(fields): + dtype = cudf.StructDtype(fields) + recreated = dtype.__class__.device_deserialize(*dtype.device_serialize()) + assert recreated == dtype diff --git a/python/cudf/cudf/tests/indexes/index/__init__.py b/python/cudf/cudf/tests/indexes/index/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/index/test_constructor.py b/python/cudf/cudf/tests/indexes/index/test_constructor.py new file mode 100644 index 00000000000..7e9a577b28f --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/test_constructor.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import cupy as cp +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +def test_infer_timedelta_index(data, timedelta_types_as_str): + gdi = cudf.Index(data, dtype=timedelta_types_as_str) + pdi = gdi.to_pandas() + + assert_eq(pdi, gdi) diff --git a/python/cudf/cudf/tests/indexes/timedelta/test_components.py b/python/cudf/cudf/tests/indexes/timedelta/test_components.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/timedelta/test_components.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/timedelta/test_constructing.py b/python/cudf/cudf/tests/indexes/timedelta/test_constructing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/timedelta/test_constructing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/timedelta/test_conversion.py b/python/cudf/cudf/tests/indexes/timedelta/test_conversion.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/timedelta/test_conversion.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
diff --git a/python/cudf/cudf/tests/indexes/timedeltaindex/__init__.py b/python/cudf/cudf/tests/indexes/timedeltaindex/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/timedeltaindex/methods/__init__.py b/python/cudf/cudf/tests/indexes/timedeltaindex/methods/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/timedeltaindex/methods/test_total_seconds.py b/python/cudf/cudf/tests/indexes/timedeltaindex/methods/test_total_seconds.py new file mode 100644 index 00000000000..14d8f75d1a3 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/timedeltaindex/methods/test_total_seconds.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import cupy as cp +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +def test_timedelta_index_total_seconds(data, timedelta_types_as_str): + gi = cudf.TimedeltaIndex(data, dtype=timedelta_types_as_str) + pi = gi.to_pandas() + + expected = pi.total_seconds() + actual = gi.total_seconds() + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/timedeltaindex/test_attributes.py b/python/cudf/cudf/tests/indexes/timedeltaindex/test_attributes.py new file mode 100644 index 00000000000..b74e173bd4e --- /dev/null +++ b/python/cudf/cudf/tests/indexes/timedeltaindex/test_attributes.py @@ -0,0 +1,100 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +@pytest.mark.parametrize("name", ["abcd", None]) +def test_timedelta_index_properties(data, timedelta_types_as_str, name): + gdi = cudf.TimedeltaIndex(data, dtype=timedelta_types_as_str, name=name) + pdi = gdi.to_pandas() + + def local_assert(expected, actual): + if actual._column.null_count: + assert_eq(expected, actual.astype("float64")) + else: + assert_eq(expected, actual) + + expected_days = pdi.days + actual_days = gdi.days + + local_assert(expected_days, actual_days) + + expected_seconds = pdi.seconds + actual_seconds = gdi.seconds + + local_assert(expected_seconds, actual_seconds) + + expected_microseconds = pdi.microseconds + actual_microseconds = gdi.microseconds + + local_assert(expected_microseconds, actual_microseconds) + + expected_nanoseconds = pdi.nanoseconds + actual_nanoseconds = gdi.nanoseconds + + local_assert(expected_nanoseconds, actual_nanoseconds) + + expected_components = pdi.components + actual_components = gdi.components + + if actual_components.isnull().any().any(): + assert_eq(expected_components, actual_components.astype("float")) + else: + assert_eq( + expected_components, + actual_components, + check_index_type=not actual_components.empty, + ) + + +def test_tdi_unit(): + pd_tdi = pd.TimedeltaIndex( + ["1 day", "2 days", "3 days"], dtype="timedelta64[ns]" + ) + cudf_tdi = cudf.from_pandas(pd_tdi) + + result = pd_tdi.unit + expected = cudf_tdi.unit + assert result == expected + + +def test_tdi_asi8(): + pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) + cudf_tdi = cudf.from_pandas(pd_tdi) + + result = pd_tdi.asi8 + expected = cudf_tdi.asi8 + assert_eq(result, expected) + + +def test_error_values(): + s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") + with pytest.raises(NotImplementedError, match="cupy does not support"): + s.values diff --git a/python/cudf/cudf/tests/indexes/timedeltaindex/test_constructor.py b/python/cudf/cudf/tests/indexes/timedeltaindex/test_constructor.py new file mode 100644 index 00000000000..568a9a4edb5 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/timedeltaindex/test_constructor.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_timedelta_constructor(): + data = [43534, 43543, 37897, 2000] + dtype = "timedelta64[ns]" + expected = pd.TimedeltaIndex(data=data, dtype=dtype) + actual = cudf.TimedeltaIndex(data=data, dtype=dtype) + + assert_eq(expected, actual) + + expected = pd.TimedeltaIndex(data=pd.Series(data), dtype=dtype) + actual = cudf.TimedeltaIndex(data=cudf.Series(data), dtype=dtype) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("name", [None, "delta-index"]) +def test_create_TimedeltaIndex(timedelta_types_as_str, name): + gdi = cudf.TimedeltaIndex( + [1132223, 2023232, 342234324, 4234324], + dtype=timedelta_types_as_str, + name=name, + ) + pdi = gdi.to_pandas() + assert_eq(pdi, gdi) diff --git a/python/cudf/cudf/tests/series/accessors/test_dt.py b/python/cudf/cudf/tests/series/accessors/test_dt.py new file mode 100644 index 00000000000..40b604bc043 --- /dev/null +++ b/python/cudf/cudf/tests/series/accessors/test_dt.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import cupy as cp +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture( + params=[ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [ + 136457654736252, + 134736784364431, + 245345345545332, + 223432411, + 2343241, + 3634548734, + 23234, + ], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ] +) +def timedelta_data(request): + return request.param + + +def test_timedelta_dt_components(timedelta_data, timedelta_types_as_str): + gsr = cudf.Series(timedelta_data, dtype=timedelta_types_as_str) + psr = gsr.to_pandas() + + expected = psr.dt.components + actual = gsr.dt.components + + if gsr.isnull().any(): + assert_eq(expected, actual.astype("float")) + else: + assert_eq(expected, actual) + + +def test_timedelta_dt_properties(timedelta_data, timedelta_types_as_str): + gsr = cudf.Series(timedelta_data, dtype=timedelta_types_as_str) + psr = gsr.to_pandas() + + def local_assert(expected, actual, **kwargs): + if gsr.isnull().any(): + assert_eq(expected, actual.astype("float"), **kwargs) + else: + assert_eq(expected, actual, **kwargs) + + expected_days = psr.dt.days + actual_days = gsr.dt.days + + local_assert(expected_days, actual_days, check_dtype=False) + + expected_seconds = psr.dt.seconds + actual_seconds = gsr.dt.seconds + + local_assert(expected_seconds, actual_seconds, check_dtype=False) + + expected_microseconds = psr.dt.microseconds + actual_microseconds = gsr.dt.microseconds + + local_assert(expected_microseconds, actual_microseconds, check_dtype=False) + + expected_nanoseconds = psr.dt.nanoseconds + actual_nanoseconds = gsr.dt.nanoseconds + + local_assert(expected_nanoseconds, actual_nanoseconds, check_dtype=False) + + +def test_timedelta_series_total_seconds( + timedelta_data, timedelta_types_as_str +): + gsr = cudf.Series(timedelta_data, dtype=timedelta_types_as_str) + psr = gsr.to_pandas() + + expected = psr.dt.total_seconds() + actual = gsr.dt.total_seconds() + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/series/accessors/test_struct.py 
b/python/cudf/cudf/tests/series/accessors/test_struct.py new file mode 100644 index 00000000000..ce68e097302 --- /dev/null +++ b/python/cudf/cudf/tests/series/accessors/test_struct.py @@ -0,0 +1,68 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_struct_iterate_error(): + s = cudf.Series( + [{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}] + ) + with pytest.raises(TypeError): + iter(s.struct) + + +@pytest.mark.parametrize( + "data", + [ + [{}], + [{"a": None}], + [{"a": 1}], + [{"a": "one"}], + [{"a": 1}, {"a": 2}], + [{"a": 1, "b": "one"}, {"a": 2, "b": "two"}], + [{"b": "two", "a": None}, None, {"a": "one", "b": "two"}], + ], +) +def test_struct_field_errors(data): + got = cudf.Series(data) + + with pytest.raises(KeyError): + got.struct.field("notWithinFields") + + with pytest.raises(IndexError): + got.struct.field(100) + + +def test_struct_explode(): + s = cudf.Series([], dtype=cudf.StructDtype({})) + expect = cudf.DataFrame({}) + assert_eq(expect, s.struct.explode()) + + s = cudf.Series( + [ + {"a": 1, "b": "x"}, + {"a": 2, "b": "y"}, + {"a": 3, "b": "z"}, + {"a": 4, "b": "a"}, + ] + ) + expect = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "y", "z", "a"]}) + got = s.struct.explode() + assert_eq(expect, got) + + # check that a copy was made: + got["a"][0] = 5 + assert_eq(s.struct.explode(), expect) + + +@pytest.mark.parametrize( + "key, expect", [(0, [1, 3]), (1, [2, 4]), ("a", [1, 3]), ("b", [2, 4])] +) +def test_struct_for_field(key, expect): + sr = cudf.Series([{"a": 1, "b": 2}, {"a": 3, "b": 4}]) + expect = cudf.Series(expect) + got = sr.struct.field(key) + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/series/indexing/test_getitem.py b/python/cudf/cudf/tests/series/indexing/test_getitem.py new file mode 100644 index 00000000000..568d6761cdd --- /dev/null +++ b/python/cudf/cudf/tests/series/indexing/test_getitem.py @@ -0,0 +1,150 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data, idx, expected", + [ + ( + [{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}], + 0, + {"f1": "a", "f2": {"a": "sf21"}}, + ), + ( + [ + {"f2": {"a": "sf21"}}, + {"f1": "sf12", "f2": None}, + ], + 0, + {"f1": cudf.NA, "f2": {"a": "sf21"}}, + ), + ( + [{"a": "123"}, {"a": "sf12", "b": {"a": {"b": "c"}}}], + 1, + {"a": "sf12", "b": {"a": {"b": "c"}}}, + ), + ], +) +def test_nested_struct_extract_host_scalars(data, idx, expected): + series = cudf.Series(data) + + def _nested_na_replace(struct_scalar): + """ + Replace `cudf.NA` with `None` in the dict + """ + for key, value in struct_scalar.items(): + if value is cudf.NA: + struct_scalar[key] = None + return struct_scalar + + assert _nested_na_replace(series[idx]) == _nested_na_replace(expected) + + +def test_nested_struct_from_pandas_empty(): + # tests constructing nested structs columns that would result in + # libcudf EMPTY type child columns inheriting their parent's null + # mask. 
See GH PR: #10761 + pdf = pd.Series([[{"c": {"x": None}}], [{"c": None}]]) + gdf = cudf.from_pandas(pdf) + + assert_eq(pdf, gdf) + + +def test_struct_int_values(): + series = cudf.Series( + [{"a": 1, "b": 2}, {"a": 10, "b": None}, {"a": 5, "b": 6}] + ) + actual_series = series.to_pandas() + + assert isinstance(actual_series[0]["b"], int) + assert isinstance(actual_series[1]["b"], type(None)) + assert isinstance(actual_series[2]["b"], int) + + +def test_struct_slice_nested_struct(): + data = [ + {"a": {"b": 42, "c": "abc"}}, + {"a": {"b": 42, "c": "hello world"}}, + ] + + got = cudf.Series(data)[0:1] + expect = cudf.Series(data[0:1]) + assert got.to_arrow() == expect.to_arrow() + + +@pytest.mark.parametrize( + "series, slce", + [ + ( + [ + {"a": "Hello world", "b": []}, + {"a": "CUDF", "b": [1, 2, 3], "c": 1}, + {}, + None, + ], + slice(1, None), + ), + ( + [ + {"a": "Hello world", "b": []}, + {"a": "CUDF", "b": [1, 2, 3], "c": 1}, + {}, + None, + {"d": ["Hello", "rapids"]}, + None, + cudf.NA, + ], + slice(1, 5), + ), + ( + [ + {"a": "Hello world", "b": []}, + {"a": "CUDF", "b": [1, 2, 3], "c": 1}, + {}, + None, + {"c": 5}, + None, + cudf.NA, + ], + slice(None, 4), + ), + ([{"a": {"b": 42, "c": -1}}, {"a": {"b": 0, "c": None}}], slice(0, 1)), + ], +) +def test_struct_slice(series, slce): + got = cudf.Series(series)[slce] + expected = cudf.Series(series[slce]) + assert got.to_arrow() == expected.to_arrow() + + +@pytest.mark.parametrize( + "series, expected", + [ + ( + [ + {"a": "Hello world", "b": []}, + {"a": "CUDF", "b": [1, 2, 3], "c": 1}, + {}, + ], + {"a": "Hello world", "b": [], "c": cudf.NA}, + ), + ([{}], {}), + ( + [{"b": True}, {"a": 1, "c": [1, 2, 3], "d": "1", "b": False}], + {"a": cudf.NA, "c": cudf.NA, "d": cudf.NA, "b": True}, + ), + ], +) +def test_struct_getitem(series, expected): + sr = cudf.Series(series) + assert sr[0] == expected + + +def test_timedelta_getitem_na(): + s = cudf.Series([1, 2, None, 3], dtype="timedelta64[ns]") + assert s[2] is cudf.NaT diff --git a/python/cudf/cudf/tests/series/indexing/test_iloc.py b/python/cudf/cudf/tests/series/indexing/test_iloc.py new file mode 100644 index 00000000000..0d460c11dc8 --- /dev/null +++ b/python/cudf/cudf/tests/series/indexing/test_iloc.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "indices", + [slice(0, 3), slice(1, 4), slice(None, None, 2), slice(1, None, 2)], + ids=[":3", "1:4", "0::2", "1::2"], +) +@pytest.mark.parametrize( + "values", + [[None, {}, {}, None], [{}, {}, {}, {}]], + ids=["nulls", "no_nulls"], +) +def test_struct_empty_children_slice(indices, values): + s = cudf.Series(values) + actual = s.iloc[indices] + expect = cudf.Series(values[indices], index=range(len(values))[indices]) + assert_eq(actual, expect) diff --git a/python/cudf/cudf/tests/series/indexing/test_setitem.py b/python/cudf/cudf/tests/series/indexing/test_setitem.py index 88a014191bc..f6ef3d4ddb6 100644 --- a/python/cudf/cudf/tests/series/indexing/test_setitem.py +++ b/python/cudf/cudf/tests/series/indexing/test_setitem.py @@ -71,3 +71,48 @@ def test_series_setitem_mixed_bool_dtype(): s = cudf.Series([True, False, True]) with pytest.raises(TypeError): s[0] = 10 + + +@pytest.mark.parametrize( + "data, item", + [ + ( + [ + {"a": "Hello world", "b": []}, + {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, + {"a": "abcde", "b": [4, 5, 6], "c": 9}, + ], + {"a": "Hello world", "b": [], "c": cudf.NA}, + ), + ( + [ + {"a": "Hello world", "b": []}, + {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, + {"a": "abcde", "b": [4, 5, 6], "c": 9}, + ], + {}, + ), + ( + [ + {"a": "Hello world", "b": []}, + {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, + {"a": "abcde", "b": [4, 5, 6], "c": 9}, + ], + cudf.NA, + ), + ( + [ + {"a": "Hello world", "b": []}, + {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, + {"a": "abcde", "b": [4, 5, 6], "c": 9}, + ], + {"a": "Second element", "b": [1, 2], "c": 1000}, + ), + ], +) +def test_struct_setitem(data, item): + sr = cudf.Series(data) + sr[1] = item + data[1] = item + expected = cudf.Series(data) + assert sr.to_arrow() == expected.to_arrow() diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py b/python/cudf/cudf/tests/series/methods/test_astype.py index 6825af8442f..8373b173815 100644 --- a/python/cudf/cudf/tests/series/methods/test_astype.py +++ b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -1,5 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
+import cupy as cp import numpy as np import pandas as pd import pyarrow as pa @@ -7,6 +8,7 @@ import cudf from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal def test_series_typecast_to_object_error(): @@ -101,3 +103,214 @@ def test_empty_astype_always_castable(type1, type2, as_dtype, copy): assert ser._column is result._column else: assert ser._column is not result._column + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [12, 12, 22, 343, 4353534, 435342], + [0.3534, 12, 22, 343, 43.53534, 4353.42], + cp.asarray([10, 20, 30, 100]), + ], +) +@pytest.mark.parametrize("cast_dtype", ["int64", "category"]) +def test_timedelta_from_typecast(data, timedelta_types_as_str, cast_dtype): + if timedelta_types_as_str != "timedelta64[ns]": + pytest.skip( + "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" + ) + psr = pd.Series( + cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, + dtype=timedelta_types_as_str, + ) + gsr = cudf.Series(data, dtype=timedelta_types_as_str) + + if cast_dtype == "int64": + assert_eq(psr.values.view(cast_dtype), gsr.astype(cast_dtype).values) + else: + assert_eq(psr.astype(cast_dtype), gsr.astype(cast_dtype)) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [12, 12, 22, 343, 4353534, 435342], + [0.3534, 12, 22, 343, 43.53534, 4353.42], + cp.asarray([10, 20, 30, 100]), + ], +) +def test_timedelta_to_typecast(data, timedelta_types_as_str): + psr = pd.Series(cp.asnumpy(data) if isinstance(data, cp.ndarray) else data) + gsr = cudf.Series(data) + + assert_eq( + psr.astype(timedelta_types_as_str), gsr.astype(timedelta_types_as_str) + ) + + +@pytest.mark.parametrize("data", [[], [1, 2, 3, 4, 5]]) +def test_numeric_to_timedelta( + data, numeric_types_as_str, timedelta_types_as_str +): + sr = cudf.Series(data, dtype=numeric_types_as_str) + psr = sr.to_pandas() + + actual = sr.astype(timedelta_types_as_str) + expected = psr.astype(timedelta_types_as_str) + + assert_eq(expected, actual) + + +def test_timedelta_datetime_cast_invalid(): + sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") + psr = sr.to_pandas() + + assert_exceptions_equal( + psr.astype, + sr.astype, + (["datetime64[ns]"],), + (["datetime64[ns]"],), + ) + + sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + psr = sr.to_pandas() + + assert_exceptions_equal( + psr.astype, + sr.astype, + (["timedelta64[ns]"],), + (["timedelta64[ns]"],), + ) + + +@pytest.mark.parametrize( + "sr_data, sr_dtype, exp_data, exp_dtype", + [ + [ + [1, 2, 3], + "timedelta64[ns]", + [ + "0 days 00:00:00.000000001", + "0 days 00:00:00.000000002", + "0 days 00:00:00.000000003", + ], + None, + ], + [ + [1000000, 200000, 3000000], + "timedelta64[ms]", + ["0 days 00:16:40", "0 days 00:03:20", "0 days 00:50:00"], + None, + ], + [ + [1000000, 200000, 3000000], + "timedelta64[s]", + ["11 days 13:46:40", "2 days 07:33:20", "34 days 17:20:00"], + None, + ], + [ + [None, None, None, None, None], + "timedelta64[us]", + [None, None, None, None, None], + "str", + ], + [ + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + "timedelta64[us]", + [ + "0 days 00:02:16.457654", + None, + "0 days 00:04:05.345345", + "0 days 00:03:43.432411", + None, + "0 days 01:00:34.548734", + "0 days 00:00:00.023234", + ], + None, + ], + [ + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + "timedelta64[ms]", + [ + "1 days 13:54:17.654", + None, + "2 days 20:09:05.345", + "2 days 
14:03:52.411", + None, + "42 days 01:35:48.734", + "0 days 00:00:23.234", + ], + None, + ], + [ + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + "timedelta64[s]", + [ + "1579 days 08:54:14", + None, + "2839 days 15:29:05", + "2586 days 00:33:31", + None, + "42066 days 12:52:14", + "0 days 06:27:14", + ], + None, + ], + [ + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + "timedelta64[ns]", + [ + "0 days 00:00:00.136457654", + None, + "0 days 00:00:00.245345345", + "0 days 00:00:00.223432411", + None, + "0 days 00:00:03.634548734", + "0 days 00:00:00.000023234", + ], + None, + ], + ], +) +def test_timedelta_str_roundtrip(sr_data, sr_dtype, exp_data, exp_dtype): + gsr = cudf.Series(sr_data, dtype=sr_dtype) + actual_series = gsr.astype("str") + + expected_series = cudf.Series(exp_data, dtype=exp_dtype) + assert_eq(expected_series, actual_series) + + assert_eq(gsr, actual_series.astype(gsr.dtype)) diff --git a/python/cudf/cudf/tests/series/methods/test_copy.py b/python/cudf/cudf/tests/series/methods/test_copy.py new file mode 100644 index 00000000000..4637e821642 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_copy.py @@ -0,0 +1,9 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import cudf +from cudf.testing import assert_eq + + +def test_struct_of_struct_copy(): + sr = cudf.Series([{"a": {"b": 1}}]) + assert_eq(sr, sr.copy()) diff --git a/python/cudf/cudf/tests/series/methods/test_fillna.py b/python/cudf/cudf/tests/series/methods/test_fillna.py index d317ff85596..e64fb209519 100644 --- a/python/cudf/cudf/tests/series/methods/test_fillna.py +++ b/python/cudf/cudf/tests/series/methods/test_fillna.py @@ -38,3 +38,44 @@ def test_fillna_categorical_with_different_categories_raises(): ser = cudf.Series([1, None], dtype="category") with pytest.raises(TypeError): ser.fillna(cudf.Series([1, 2]), dtype="category") + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [12, 12, 22, 343, 4353534, 435342], + [0.3534, 12, 22, 343, 43.53534, 4353.42], + np.array([10, 20, 30, 100]), + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + np.timedelta64(4, "s"), + np.timedelta64(456, "D"), + np.timedelta64("nat"), + np.timedelta64(1, "s"), + np.timedelta64(1, "ms"), + np.timedelta64(1, "us"), + np.timedelta64(1, "ns"), + "NaT", + ], +) +def test_timedelta_fillna(data, timedelta_types_as_str, fill_value): + sr = cudf.Series(data, dtype=timedelta_types_as_str) + psr = sr.to_pandas() + + expected = psr.dropna() + actual = sr.dropna() + + assert_eq(expected, actual) + + expected = psr.fillna(fill_value) + actual = sr.fillna(fill_value) + assert_eq(expected, actual) + + expected = expected.dropna() + actual = actual.dropna() + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/series/methods/test_memory_usage.py b/python/cudf/cudf/tests/series/methods/test_memory_usage.py index f9c81aaaef9..003e2f61960 100644 --- a/python/cudf/cudf/tests/series/methods/test_memory_usage.py +++ b/python/cudf/cudf/tests/series/methods/test_memory_usage.py @@ -1,7 +1,7 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
- import cudf +from cudf.testing import assert_eq def test_series_memory_usage(): @@ -19,3 +19,24 @@ def test_series_memory_usage(): assert sr[3:].memory_usage() == 9 # z assert sr[:1].memory_usage() == 19 # hello world + + +def test_struct_with_null_memory_usage(): + df = cudf.DataFrame( + { + "a": cudf.Series([1, 2, -1, -1, 3], dtype="int64"), + "b": cudf.Series([10, 20, -1, -1, 30], dtype="int64"), + } + ) + s = df.to_struct() + assert s.memory_usage() == 80 + + s[2:4] = None + assert s.memory_usage() == 272 + + +def test_struct_memory_usage(): + s = cudf.Series([{"a": 1, "b": 10}, {"a": 2, "b": 20}, {"a": 3, "b": 30}]) + df = s.struct.explode() + + assert_eq(s.memory_usage(), df.memory_usage().sum()) diff --git a/python/cudf/cudf/tests/series/methods/test_to_numpy.py b/python/cudf/cudf/tests/series/methods/test_to_numpy.py new file mode 100644 index 00000000000..6e94e355d61 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_to_numpy.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023-2025, NVIDIA CORPORATION. + +import cupy as cp +import numpy as np +import pytest + +import cudf + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + ], +) +def test_timedelta_series_to_numpy(data, timedelta_types_as_str): + gsr = cudf.Series(data, dtype=timedelta_types_as_str) + + expected = np.array( + cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, + dtype=timedelta_types_as_str, + ) + expected = expected[~np.isnan(expected)] + + actual = gsr.dropna().to_numpy() + + np.testing.assert_array_equal(expected, actual) diff --git a/python/cudf/cudf/tests/series/methods/test_to_pandas.py b/python/cudf/cudf/tests/series/methods/test_to_pandas.py index 8e7ced2ea30..bc78a8c7871 100644 --- a/python/cudf/cudf/tests/series/methods/test_to_pandas.py +++ b/python/cudf/cudf/tests/series/methods/test_to_pandas.py @@ -3,6 +3,8 @@ import datetime import decimal +import cupy as cp +import numpy as np import pandas as pd import pyarrow as pa import pytest @@ -152,3 +154,44 @@ def test_series_to_pandas_arrow_type(scalar): result = ser.to_pandas(arrow_type=True) expected = pd.Series(pd.arrays.ArrowExtensionArray(pa_array)) pd.testing.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + ], +) +def test_timedelta_series_to_pandas(data, timedelta_types_as_str): + gsr = cudf.Series(data, dtype=timedelta_types_as_str) + + expected = np.array( + cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, + dtype=timedelta_types_as_str, + ) + + expected = pd.Series(expected) + actual = gsr.to_pandas() + + assert_eq(expected, actual) + + +def test_writable_numpy_array_timedelta(): + gi = cudf.Index([1, 2, 3], dtype="timedelta64[ns]") + expected_flags = pd.Index( + [1, 2, 3], dtype="timedelta64[ns]" + )._data._ndarray.flags + + actual_flags = gi.to_pandas()._data._ndarray.flags + assert expected_flags.c_contiguous == actual_flags.c_contiguous + assert expected_flags.f_contiguous == actual_flags.f_contiguous + assert expected_flags.writeable == actual_flags.writeable + assert expected_flags.aligned == actual_flags.aligned + assert expected_flags.writebackifcopy == 
actual_flags.writebackifcopy diff --git a/python/cudf/cudf/tests/series/test_attributes.py b/python/cudf/cudf/tests/series/test_attributes.py index 7ae81a6c257..6cc5999ab4f 100644 --- a/python/cudf/cudf/tests/series/test_attributes.py +++ b/python/cudf/cudf/tests/series/test_attributes.py @@ -137,7 +137,29 @@ def test_dtype_dtypes_equal(): assert ser.dtypes is ser.to_pandas().dtypes -def test_roundtrip_series_plc_column(ps): - expect = cudf.Series(ps) - actual = cudf.Series.from_pylibcudf(*expect.to_pylibcudf()) - assert_eq(expect, actual) +@pytest.mark.parametrize("data", [[], [1, 2, 3, 4, 5]]) +@pytest.mark.parametrize( + "scalar", + [ + 1, + 2, + 3, + "a", + np.timedelta64(1, "s"), + np.timedelta64(2, "s"), + np.timedelta64(2, "D"), + np.timedelta64(3, "ms"), + np.timedelta64(4, "us"), + np.timedelta64(5, "ns"), + np.timedelta64(6, "ns"), + np.datetime64(6, "s"), + ], +) +def test_timedelta_contains(data, timedelta_types_as_str, scalar): + sr = cudf.Series(data, dtype=timedelta_types_as_str) + psr = sr.to_pandas() + + expected = scalar in sr + actual = scalar in psr + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index b8bca586361..6357680b368 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -14,6 +14,66 @@ from cudf.testing._utils import assert_exceptions_equal +@pytest.fixture( + params=[ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [ + 136457654736252, + 134736784364431, + 245345345545332, + 223432411, + 2343241, + 3634548734, + 23234, + ], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ] +) +def timedelta_data(request): + return request.param + + +def test_timedelta_series_create(timedelta_data, timedelta_types_as_str): + if timedelta_types_as_str != "timedelta64[ns]": + pytest.skip( + "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" + ) + psr = pd.Series( + cp.asnumpy(timedelta_data) + if isinstance(timedelta_data, cp.ndarray) + else timedelta_data, + dtype=timedelta_types_as_str, + ) + gsr = cudf.Series(timedelta_data, dtype=timedelta_types_as_str) + + assert_eq(psr, gsr) + + +def test_timedelta_from_pandas(timedelta_data, timedelta_types_as_str): + psr = pd.Series( + cp.asnumpy(timedelta_data) + if isinstance(timedelta_data, cp.ndarray) + else timedelta_data, + dtype=timedelta_types_as_str, + ) + gsr = cudf.from_pandas(psr) + + assert_eq(psr, gsr) + + def test_construct_int_series_with_nulls_compat_mode(): # in compatibility mode, constructing a Series # with nulls should result in a floating Series: @@ -635,3 +695,60 @@ def test_to_dense_array(): dense = sr.dropna().to_numpy() assert dense.size < filled.size assert filled.size == len(sr) + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series([0, 1, 2, np.nan, 4, None, 6]), + pd.Series( + [0, 1, 2, np.nan, 4, None, 6], + index=["q", "w", "e", "r", "t", "y", "u"], + name="a", + ), + pd.Series([0, 1, 2, 3, 4]), + pd.Series(["a", "b", "u", "h", "d"]), + pd.Series([None, None, np.nan, None, np.inf, -np.inf]), + pd.Series([], 
dtype="float64"), + pd.Series( + [pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")] + ), + pd.Series([np.nan]), + pd.Series([None]), + pd.Series(["a", "b", "", "c", None, "e"]), + ], +) +def test_roundtrip_series_plc_column(ps): + expect = cudf.Series(ps) + actual = cudf.Series.from_pylibcudf(*expect.to_pylibcudf()) + assert_eq(expect, actual) + + +def test_series_construction_with_nulls(): + fields = [ + pa.array([1], type=pa.int64()), + pa.array([None], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + expect = pa.StructArray.from_arrays(fields, ["a", "b", "c"]) + got = cudf.Series(expect).to_arrow() + + assert expect == got + + +@pytest.mark.parametrize( + "data", + [ + [{}], + [{"a": None}], + [{"a": 1}], + [{"a": "one"}], + [{"a": 1}, {"a": 2}], + [{"a": 1, "b": "one"}, {"a": 2, "b": "two"}], + [{"b": "two", "a": None}, None, {"a": "one", "b": "two"}], + ], +) +def test_create_struct_series(data): + expect = pd.Series(data) + got = cudf.Series(data) + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/structs/test_struct_methods.py b/python/cudf/cudf/tests/structs/test_struct_methods.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/structs/test_struct_methods.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py deleted file mode 100644 index 13eed9d0d77..00000000000 --- a/python/cudf/cudf/tests/test_struct.py +++ /dev/null @@ -1,403 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.testing import assert_eq -from cudf.testing._utils import DATETIME_TYPES, TIMEDELTA_TYPES - - -@pytest.mark.parametrize( - "data", - [ - [{}], - [{"a": None}], - [{"a": 1}], - [{"a": "one"}], - [{"a": 1}, {"a": 2}], - [{"a": 1, "b": "one"}, {"a": 2, "b": "two"}], - [{"b": "two", "a": None}, None, {"a": "one", "b": "two"}], - ], -) -def test_create_struct_series(data): - expect = pd.Series(data) - got = cudf.Series(data) - assert_eq(expect, got, check_dtype=False) - - -def test_struct_of_struct_copy(): - sr = cudf.Series([{"a": {"b": 1}}]) - assert_eq(sr, sr.copy()) - - -def test_struct_of_struct_loc(): - df = cudf.DataFrame({"col": [{"a": {"b": 1}}]}) - expect = cudf.Series([{"a": {"b": 1}}], name="col") - assert_eq(expect, df["col"]) - - -@pytest.mark.parametrize( - "key, expect", [(0, [1, 3]), (1, [2, 4]), ("a", [1, 3]), ("b", [2, 4])] -) -def test_struct_for_field(key, expect): - sr = cudf.Series([{"a": 1, "b": 2}, {"a": 3, "b": 4}]) - expect = cudf.Series(expect) - got = sr.struct.field(key) - assert_eq(expect, got) - - -def test_series_construction_with_nulls(): - fields = [ - pa.array([1], type=pa.int64()), - pa.array([None], type=pa.int64()), - pa.array([3], type=pa.int64()), - ] - expect = pa.StructArray.from_arrays(fields, ["a", "b", "c"]) - got = cudf.Series(expect).to_arrow() - - assert expect == got - - -@pytest.mark.parametrize( - "fields", - [ - {"a": np.dtype(np.int64)}, - {"a": np.dtype(np.int64), "b": None}, - { - "a": cudf.ListDtype(np.dtype(np.int64)), - "b": cudf.Decimal64Dtype(1, 0), - }, - { - "a": cudf.ListDtype(cudf.StructDtype({"b": np.dtype(np.int64)})), - "b": cudf.ListDtype(cudf.ListDtype(np.dtype(np.int64))), - }, - ], -) -def test_serialize_struct_dtype(fields): - dtype = cudf.StructDtype(fields) - recreated = dtype.__class__.device_deserialize(*dtype.device_serialize()) 
- assert recreated == dtype - - -@pytest.mark.parametrize( - "series, expected", - [ - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": 1}, - {}, - ], - {"a": "Hello world", "b": [], "c": cudf.NA}, - ), - ([{}], {}), - ( - [{"b": True}, {"a": 1, "c": [1, 2, 3], "d": "1", "b": False}], - {"a": cudf.NA, "c": cudf.NA, "d": cudf.NA, "b": True}, - ), - ], -) -def test_struct_getitem(series, expected): - sr = cudf.Series(series) - assert sr[0] == expected - - -@pytest.mark.parametrize( - "data, item", - [ - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, - {"a": "abcde", "b": [4, 5, 6], "c": 9}, - ], - {"a": "Hello world", "b": [], "c": cudf.NA}, - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, - {"a": "abcde", "b": [4, 5, 6], "c": 9}, - ], - {}, - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, - {"a": "abcde", "b": [4, 5, 6], "c": 9}, - ], - cudf.NA, - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, - {"a": "abcde", "b": [4, 5, 6], "c": 9}, - ], - {"a": "Second element", "b": [1, 2], "c": 1000}, - ), - ], -) -def test_struct_setitem(data, item): - sr = cudf.Series(data) - sr[1] = item - data[1] = item - expected = cudf.Series(data) - assert sr.to_arrow() == expected.to_arrow() - - -def test_struct_explode(): - s = cudf.Series([], dtype=cudf.StructDtype({})) - expect = cudf.DataFrame({}) - assert_eq(expect, s.struct.explode()) - - s = cudf.Series( - [ - {"a": 1, "b": "x"}, - {"a": 2, "b": "y"}, - {"a": 3, "b": "z"}, - {"a": 4, "b": "a"}, - ] - ) - expect = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "y", "z", "a"]}) - got = s.struct.explode() - assert_eq(expect, got) - - # check that a copy was made: - got["a"][0] = 5 - assert_eq(s.struct.explode(), expect) - - -def test_dataframe_to_struct(): - df = cudf.DataFrame() - expect = cudf.Series(dtype=cudf.StructDtype({})) - got = df.to_struct() - assert_eq(expect, got) - - df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) - expect = cudf.Series( - [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 3, "b": "z"}] - ) - got = df.to_struct() - assert_eq(expect, got) - - # check that a copy was made: - df["a"][0] = 5 - assert_eq(got, expect) - - # check that a non-string (but convertible to string) named column can be - # converted to struct - df = cudf.DataFrame([[1, 2], [3, 4]], columns=[(1, "b"), 0]) - expect = cudf.Series([{"(1, 'b')": 1, "0": 2}, {"(1, 'b')": 3, "0": 4}]) - with pytest.warns(UserWarning, match="will be casted"): - got = df.to_struct() - assert_eq(got, expect) - - -@pytest.mark.parametrize( - "series, slce", - [ - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": 1}, - {}, - None, - ], - slice(1, None), - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": 1}, - {}, - None, - {"d": ["Hello", "rapids"]}, - None, - cudf.NA, - ], - slice(1, 5), - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": 1}, - {}, - None, - {"c": 5}, - None, - cudf.NA, - ], - slice(None, 4), - ), - ([{"a": {"b": 42, "c": -1}}, {"a": {"b": 0, "c": None}}], slice(0, 1)), - ], -) -def test_struct_slice(series, slce): - got = cudf.Series(series)[slce] - expected = cudf.Series(series[slce]) - assert got.to_arrow() == expected.to_arrow() - - -def test_struct_slice_nested_struct(): - data = [ - {"a": {"b": 42, "c": "abc"}}, - {"a": {"b": 42, "c": "hello world"}}, - ] - - got = 
cudf.Series(data)[0:1] - expect = cudf.Series(data[0:1]) - assert got.to_arrow() == expect.to_arrow() - - -@pytest.mark.parametrize( - "data", - [ - [{}], - [{"a": None}], - [{"a": 1}], - [{"a": "one"}], - [{"a": 1}, {"a": 2}], - [{"a": 1, "b": "one"}, {"a": 2, "b": "two"}], - [{"b": "two", "a": None}, None, {"a": "one", "b": "two"}], - ], -) -def test_struct_field_errors(data): - got = cudf.Series(data) - - with pytest.raises(KeyError): - got.struct.field("notWithinFields") - - with pytest.raises(IndexError): - got.struct.field(100) - - -@pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES) -def test_struct_with_datetime_and_timedelta(dtype): - df = cudf.DataFrame( - { - "a": [12, 232, 2334], - "datetime": cudf.Series([23432, 3432423, 324324], dtype=dtype), - } - ) - series = df.to_struct() - a_array = np.array([12, 232, 2334]) - datetime_array = np.array([23432, 3432423, 324324]).astype(dtype) - - actual = series.to_pandas() - values_list = [] - for i, val in enumerate(a_array): - values_list.append({"a": val, "datetime": datetime_array[i]}) - - expected = pd.Series(values_list) - assert_eq(expected, actual) - - -def test_struct_int_values(): - series = cudf.Series( - [{"a": 1, "b": 2}, {"a": 10, "b": None}, {"a": 5, "b": 6}] - ) - actual_series = series.to_pandas() - - assert isinstance(actual_series[0]["b"], int) - assert isinstance(actual_series[1]["b"], type(None)) - assert isinstance(actual_series[2]["b"], int) - - -def test_nested_struct_from_pandas_empty(): - # tests constructing nested structs columns that would result in - # libcudf EMPTY type child columns inheriting their parent's null - # mask. See GH PR: #10761 - pdf = pd.Series([[{"c": {"x": None}}], [{"c": None}]]) - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf, gdf) - - -def _nested_na_replace(struct_scalar): - """ - Replace `cudf.NA` with `None` in the dict - """ - for key, value in struct_scalar.items(): - if value is cudf.NA: - struct_scalar[key] = None - return struct_scalar - - -@pytest.mark.parametrize( - "data, idx, expected", - [ - ( - [{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}], - 0, - {"f1": "a", "f2": {"a": "sf21"}}, - ), - ( - [ - {"f2": {"a": "sf21"}}, - {"f1": "sf12", "f2": None}, - ], - 0, - {"f1": cudf.NA, "f2": {"a": "sf21"}}, - ), - ( - [{"a": "123"}, {"a": "sf12", "b": {"a": {"b": "c"}}}], - 1, - {"a": "sf12", "b": {"a": {"b": "c"}}}, - ), - ], -) -def test_nested_struct_extract_host_scalars(data, idx, expected): - series = cudf.Series(data) - - assert _nested_na_replace(series[idx]) == _nested_na_replace(expected) - - -def test_struct_memory_usage(): - s = cudf.Series([{"a": 1, "b": 10}, {"a": 2, "b": 20}, {"a": 3, "b": 30}]) - df = s.struct.explode() - - assert_eq(s.memory_usage(), df.memory_usage().sum()) - - -def test_struct_with_null_memory_usage(): - df = cudf.DataFrame( - { - "a": cudf.Series([1, 2, -1, -1, 3], dtype="int64"), - "b": cudf.Series([10, 20, -1, -1, 30], dtype="int64"), - } - ) - s = df.to_struct() - assert s.memory_usage() == 80 - - s[2:4] = None - assert s.memory_usage() == 272 - - -@pytest.mark.parametrize( - "indices", - [slice(0, 3), slice(1, 4), slice(None, None, 2), slice(1, None, 2)], - ids=[":3", "1:4", "0::2", "1::2"], -) -@pytest.mark.parametrize( - "values", - [[None, {}, {}, None], [{}, {}, {}, {}]], - ids=["nulls", "no_nulls"], -) -def test_struct_empty_children_slice(indices, values): - s = cudf.Series(values) - actual = s.iloc[indices] - expect = cudf.Series(values[indices], index=range(len(values))[indices]) - assert_eq(actual, 
expect) - - -def test_struct_iterate_error(): - s = cudf.Series( - [{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}] - ) - with pytest.raises(TypeError): - iter(s.struct) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index f0e6503c55a..28741b9f592 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -14,38 +14,6 @@ from cudf.testing._utils import assert_exceptions_equal, expect_warning_if -@pytest.fixture( - params=[ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [ - 136457654736252, - 134736784364431, - 245345345545332, - 223432411, - 2343241, - 3634548734, - 23234, - ], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ] -) -def data(request): - return request.param - - @pytest.fixture( params=[ [1000000, 200000, 3000000], @@ -74,127 +42,6 @@ def timedelta_dtype(request): return request.param -def test_timedelta_series_create(data, timedelta_dtype): - if timedelta_dtype != "timedelta64[ns]": - pytest.skip( - "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" - ) - psr = pd.Series( - cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, - dtype=timedelta_dtype, - ) - gsr = cudf.Series(data, dtype=timedelta_dtype) - - assert_eq(psr, gsr) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [12, 12, 22, 343, 4353534, 435342], - [0.3534, 12, 22, 343, 43.53534, 4353.42], - cp.asarray([10, 20, 30, 100]), - ], -) -@pytest.mark.parametrize("cast_dtype", ["int64", "category"]) -def test_timedelta_from_typecast(data, timedelta_dtype, cast_dtype): - if timedelta_dtype != "timedelta64[ns]": - pytest.skip( - "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" - ) - psr = pd.Series( - cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, - dtype=timedelta_dtype, - ) - gsr = cudf.Series(data, dtype=timedelta_dtype) - - if cast_dtype == "int64": - assert_eq(psr.values.view(cast_dtype), gsr.astype(cast_dtype).values) - else: - assert_eq(psr.astype(cast_dtype), gsr.astype(cast_dtype)) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [12, 12, 22, 343, 4353534, 435342], - [0.3534, 12, 22, 343, 43.53534, 4353.42], - cp.asarray([10, 20, 30, 100]), - ], -) -def test_timedelta_to_typecast(data, timedelta_dtype): - psr = pd.Series(cp.asnumpy(data) if isinstance(data, cp.ndarray) else data) - gsr = cudf.Series(data) - - assert_eq(psr.astype(timedelta_dtype), gsr.astype(timedelta_dtype)) - - -def test_timedelta_from_pandas(data, timedelta_dtype): - psr = pd.Series( - cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, - dtype=timedelta_dtype, - ) - gsr = cudf.from_pandas(psr) - - assert_eq(psr, gsr) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - ], -) -def test_timedelta_series_to_numpy(data, timedelta_dtype): - gsr = cudf.Series(data, dtype=timedelta_dtype) - - expected = np.array( - 
cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, - dtype=timedelta_dtype, - ) - expected = expected[~np.isnan(expected)] - - actual = gsr.dropna().to_numpy() - - np.testing.assert_array_equal(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - ], -) -def test_timedelta_series_to_pandas(data, timedelta_dtype): - gsr = cudf.Series(data, dtype=timedelta_dtype) - - expected = np.array( - cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, - dtype=timedelta_dtype, - ) - - expected = pd.Series(expected) - actual = gsr.to_pandas() - - assert_eq(expected, actual) - - @pytest.mark.parametrize( "data,other", [ @@ -520,57 +367,6 @@ def test_timedelta_reduction_ops( assert_eq(expected, actual) -def test_timedelta_dt_components(data, timedelta_dtype): - gsr = cudf.Series(data, dtype=timedelta_dtype) - psr = gsr.to_pandas() - - expected = psr.dt.components - actual = gsr.dt.components - - if gsr.isnull().any(): - assert_eq(expected, actual.astype("float")) - else: - assert_eq(expected, actual) - - -def test_timedelta_dt_properties(data, timedelta_dtype): - gsr = cudf.Series(data, dtype=timedelta_dtype) - psr = gsr.to_pandas() - - def local_assert(expected, actual, **kwargs): - if gsr.isnull().any(): - assert_eq(expected, actual.astype("float"), **kwargs) - else: - assert_eq(expected, actual, **kwargs) - - expected_days = psr.dt.days - actual_days = gsr.dt.days - - local_assert(expected_days, actual_days, check_dtype=False) - - expected_seconds = psr.dt.seconds - actual_seconds = gsr.dt.seconds - - local_assert(expected_seconds, actual_seconds, check_dtype=False) - - expected_microseconds = psr.dt.microseconds - actual_microseconds = gsr.dt.microseconds - - local_assert(expected_microseconds, actual_microseconds, check_dtype=False) - - expected_nanoseconds = psr.dt.nanoseconds - actual_nanoseconds = gsr.dt.nanoseconds - - local_assert(expected_nanoseconds, actual_nanoseconds, check_dtype=False) - - -def test_timedelta_index(data, timedelta_dtype): - gdi = cudf.Index(data, dtype=timedelta_dtype) - pdi = gdi.to_pandas() - - assert_eq(pdi, gdi) - - @pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) def test_timedelta_index_datetime_index_ops( data_non_overflow, datetime_dtype, timedelta_dtype @@ -721,213 +517,6 @@ def test_timedelta_index_ops_with_scalars( assert_eq(expected, actual) -@pytest.mark.parametrize("name", ["abcd", None]) -def test_timedelta_index_properties(data, timedelta_dtype, name): - gdi = cudf.Index(data, dtype=timedelta_dtype, name=name) - pdi = gdi.to_pandas() - - def local_assert(expected, actual): - if actual._column.null_count: - assert_eq(expected, actual.astype("float64")) - else: - assert_eq(expected, actual) - - expected_days = pdi.days - actual_days = gdi.days - - local_assert(expected_days, actual_days) - - expected_seconds = pdi.seconds - actual_seconds = gdi.seconds - - local_assert(expected_seconds, actual_seconds) - - expected_microseconds = pdi.microseconds - actual_microseconds = gdi.microseconds - - local_assert(expected_microseconds, actual_microseconds) - - expected_nanoseconds = pdi.nanoseconds - actual_nanoseconds = gdi.nanoseconds - - local_assert(expected_nanoseconds, actual_nanoseconds) - - expected_components = pdi.components - actual_components = gdi.components - - if actual_components.isnull().any().any(): 
- assert_eq(expected_components, actual_components.astype("float")) - else: - assert_eq( - expected_components, - actual_components, - check_index_type=not actual_components.empty, - ) - - -@pytest.mark.parametrize( - "fill_value", - [ - np.timedelta64(4, "s"), - np.timedelta64(456, "D"), - np.timedelta64("nat"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - "NaT", - ], -) -def test_timedelta_fillna(data, timedelta_dtype, fill_value): - sr = cudf.Series(data, dtype=timedelta_dtype) - psr = sr.to_pandas() - - expected = psr.dropna() - actual = sr.dropna() - - assert_eq(expected, actual) - - expected = psr.fillna(fill_value) - actual = sr.fillna(fill_value) - assert_eq(expected, actual) - - expected = expected.dropna() - actual = actual.dropna() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "sr_data, sr_dtype, exp_data, exp_dtype", - [ - [ - [1, 2, 3], - "timedelta64[ns]", - [ - "0 days 00:00:00.000000001", - "0 days 00:00:00.000000002", - "0 days 00:00:00.000000003", - ], - None, - ], - [ - [1000000, 200000, 3000000], - "timedelta64[ms]", - ["0 days 00:16:40", "0 days 00:03:20", "0 days 00:50:00"], - None, - ], - [ - [1000000, 200000, 3000000], - "timedelta64[s]", - ["11 days 13:46:40", "2 days 07:33:20", "34 days 17:20:00"], - None, - ], - [ - [None, None, None, None, None], - "timedelta64[us]", - [None, None, None, None, None], - "str", - ], - [ - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - "timedelta64[us]", - [ - "0 days 00:02:16.457654", - None, - "0 days 00:04:05.345345", - "0 days 00:03:43.432411", - None, - "0 days 01:00:34.548734", - "0 days 00:00:00.023234", - ], - None, - ], - [ - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - "timedelta64[ms]", - [ - "1 days 13:54:17.654", - None, - "2 days 20:09:05.345", - "2 days 14:03:52.411", - None, - "42 days 01:35:48.734", - "0 days 00:00:23.234", - ], - None, - ], - [ - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - "timedelta64[s]", - [ - "1579 days 08:54:14", - None, - "2839 days 15:29:05", - "2586 days 00:33:31", - None, - "42066 days 12:52:14", - "0 days 06:27:14", - ], - None, - ], - [ - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - "timedelta64[ns]", - [ - "0 days 00:00:00.136457654", - None, - "0 days 00:00:00.245345345", - "0 days 00:00:00.223432411", - None, - "0 days 00:00:03.634548734", - "0 days 00:00:00.000023234", - ], - None, - ], - ], -) -def test_timedelta_str_roundtrip(sr_data, sr_dtype, exp_data, exp_dtype): - gsr = cudf.Series(sr_data, dtype=sr_dtype) - actual_series = gsr.astype("str") - - expected_series = cudf.Series(exp_data, dtype=exp_dtype) - assert_eq(expected_series, actual_series) - - assert_eq(gsr, actual_series.astype(gsr.dtype)) - - def test_timedelta_invalid_ops(): sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") psr = sr.to_pandas() @@ -1029,68 +618,6 @@ def test_timedelta_invalid_ops(): ) -def test_timedelta_datetime_cast_invalid(): - sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") - psr = sr.to_pandas() - - assert_exceptions_equal( - psr.astype, - sr.astype, - (["datetime64[ns]"],), - (["datetime64[ns]"],), - ) - - sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - psr = sr.to_pandas() - - assert_exceptions_equal( - psr.astype, - sr.astype, - (["timedelta64[ns]"],), - (["timedelta64[ns]"],), - ) - - -@pytest.mark.parametrize("data", [[], [1, 2, 3, 
4, 5]]) -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) -def test_numeric_to_timedelta(data, dtype, timedelta_dtype): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - actual = sr.astype(timedelta_dtype) - expected = psr.astype(timedelta_dtype) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [[], [1, 2, 3, 4, 5]]) -@pytest.mark.parametrize( - "scalar", - [ - 1, - 2, - 3, - "a", - np.timedelta64(1, "s"), - np.timedelta64(2, "s"), - np.timedelta64(2, "D"), - np.timedelta64(3, "ms"), - np.timedelta64(4, "us"), - np.timedelta64(5, "ns"), - np.timedelta64(6, "ns"), - np.datetime64(6, "s"), - ], -) -def test_timedelta_contains(data, timedelta_dtype, scalar): - sr = cudf.Series(data, dtype=timedelta_dtype) - psr = sr.to_pandas() - - expected = scalar in sr - actual = scalar in psr - - assert_eq(expected, actual) - - @pytest.mark.parametrize("data", [[1, 2, 3], [], [1, 20, 1000, None]]) @pytest.mark.parametrize("ddof", [1, 2, 3]) def test_timedelta_std(data, timedelta_dtype, ddof): @@ -1135,37 +662,6 @@ def test_timedelta_reductions(data, op, timedelta_dtype): assert_eq(expected.to_numpy(), actual) -def test_error_values(): - s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") - with pytest.raises(NotImplementedError, match="cupy does not support"): - s.values - - -@pytest.mark.parametrize("name", [None, "delta-index"]) -def test_create_TimedeltaIndex(timedelta_dtype, name): - gdi = cudf.TimedeltaIndex( - [1132223, 2023232, 342234324, 4234324], - dtype=timedelta_dtype, - name=name, - ) - pdi = gdi.to_pandas() - assert_eq(pdi, gdi) - - -def test_timedelta_constructor(): - data = [43534, 43543, 37897, 2000] - dtype = "timedelta64[ns]" - expected = pd.TimedeltaIndex(data=data, dtype=dtype) - actual = cudf.TimedeltaIndex(data=data, dtype=dtype) - - assert_eq(expected, actual) - - expected = pd.TimedeltaIndex(data=pd.Series(data), dtype=dtype) - actual = cudf.TimedeltaIndex(data=cudf.Series(data), dtype=dtype) - - assert_eq(expected, actual) - - @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_timdelta_binop_tz_timestamp(op): s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") @@ -1177,11 +673,6 @@ def test_timdelta_binop_tz_timestamp(op): op(s, date_tz_scalar) -def test_timedelta_getitem_na(): - s = cudf.Series([1, 2, None, 3], dtype="timedelta64[ns]") - assert s[2] is cudf.NaT - - @pytest.mark.parametrize( "op", [ @@ -1228,55 +719,3 @@ def test_tdi_reductions(method, kwargs): result = getattr(pd_tdi, method)(**kwargs) expected = getattr(cudf_tdi, method)(**kwargs) assert result == expected - - -def test_tdi_asi8(): - pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) - cudf_tdi = cudf.from_pandas(pd_tdi) - - result = pd_tdi.asi8 - expected = cudf_tdi.asi8 - assert_eq(result, expected) - - -def test_tdi_unit(): - pd_tdi = pd.TimedeltaIndex( - ["1 day", "2 days", "3 days"], dtype="timedelta64[ns]" - ) - cudf_tdi = cudf.from_pandas(pd_tdi) - - result = pd_tdi.unit - expected = cudf_tdi.unit - assert result == expected - - -def test_timedelta_series_total_seconds(data, timedelta_dtype): - gsr = cudf.Series(data, dtype=timedelta_dtype) - psr = gsr.to_pandas() - - expected = psr.dt.total_seconds() - actual = gsr.dt.total_seconds() - assert_eq(expected, actual) - - -def test_timedelta_index_total_seconds(data, timedelta_dtype): - gi = cudf.Index(data, dtype=timedelta_dtype) - pi = gi.to_pandas() - - expected = pi.total_seconds() - actual = gi.total_seconds() - assert_eq(expected, actual) - - -def test_writable_numpy_array(): - gi = 
cudf.Index([1, 2, 3], dtype="timedelta64[ns]")
-    expected_flags = pd.Index(
-        [1, 2, 3], dtype="timedelta64[ns]"
-    )._data._ndarray.flags
-
-    actual_flags = gi.to_pandas()._data._ndarray.flags
-    assert expected_flags.c_contiguous == actual_flags.c_contiguous
-    assert expected_flags.f_contiguous == actual_flags.f_contiguous
-    assert expected_flags.writeable == actual_flags.writeable
-    assert expected_flags.aligned == actual_flags.aligned
-    assert expected_flags.writebackifcopy == actual_flags.writebackifcopy

From 2553df0a3c63fd49991db0f37ac8a968ee358e98 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 1 Aug 2025 14:34:33 -0400
Subject: [PATCH 043/366] Use no_validity() instead of null_probability(0) in
 benchmarks profile (#19554)

Using `no_validity()` on a `data_profile` for generating benchmark data is more reliable than `null_probability(0.0)` since no mask is created.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/19554
---
 cpp/benchmarks/common/generate_input.cu  | 6 ++----
 cpp/benchmarks/common/generate_input.hpp | 6 +++---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index 0488a59099d..d8e868a4ae8 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -1063,12 +1063,10 @@ std::unique_ptr create_string_column(cudf::size_type num_rows,
   auto const num_matches = (static_cast(num_rows) * hit_rate) / 100;
 
   // Create a randomized gather-map to build a column out of the strings in data.
-  data_profile gather_profile =
-    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
-      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
+  data_profile gather_profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
   auto gather_table =
     create_random_table({cudf::type_id::INT32}, row_count{num_rows}, gather_profile);
-  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
 
   // Create scatter map by placing 0-index values throughout the gather-map
   auto scatter_data = cudf::sequence(num_matches,
diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp
index b900acc2f20..a89fb2429bf 100644
--- a/cpp/benchmarks/common/generate_input.hpp
+++ b/cpp/benchmarks/common/generate_input.hpp
@@ -449,14 +449,14 @@ class data_profile {
  * For example, `data_profile` initialization
  * @code{.pseudo}
  * data_profile profile;
- * profile.set_null_probability(0.0);
+ * profile.set_null_probability(0.01);
  * profile.set_cardinality(0);
  * profile.set_distribution_params(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 100);
  * @endcode
  * becomes
  * @code{.pseudo}
  * data_profile const profile =
- *   data_profile_builder().cardinality(0).null_probability(0.0).distribution(
+ *   data_profile_builder().cardinality(0).null_probability(0.01).distribution(
  *     cudf::type_id::INT32, distribution_id::UNIFORM, 0, 100);
  * @endcode
 * The builder makes it easier to have immutable `data_profile` objects even with the complex
 *
 * The builder API also includes a few additional convenience setters:
 * Overload of `distribution` that only takes the distribution type (not the
range). - * `no_validity`, which is a simpler equivalent of `null_probability(std::nullopr)`. + * `no_validity`, which is a simpler equivalent of `null_probability(std::nullopt)`. */ class data_profile_builder { data_profile profile; From 79f24164cbd7b433cb0e0ee17d65ec3aa8d25278 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 1 Aug 2025 12:14:45 -0700 Subject: [PATCH 044/366] Add streams to all single-function modules (#19559) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19559 --- python/pylibcudf/pylibcudf/concatenate.pxd | 6 +++-- python/pylibcudf/pylibcudf/concatenate.pyi | 3 +++ python/pylibcudf/pylibcudf/concatenate.pyx | 18 +++++++++----- python/pylibcudf/pylibcudf/interop.pxd | 5 ++-- python/pylibcudf/pylibcudf/interop.pyi | 8 +++++-- python/pylibcudf/pylibcudf/interop.pyx | 19 +++++++++++---- python/pylibcudf/pylibcudf/json.pxd | 7 ++++-- python/pylibcudf/pylibcudf/json.pyi | 7 +++++- python/pylibcudf/pylibcudf/json.pyx | 18 ++++++++++---- python/pylibcudf/pylibcudf/labeling.pxd | 7 ++++-- python/pylibcudf/pylibcudf/labeling.pyi | 3 +++ python/pylibcudf/pylibcudf/labeling.pyx | 13 ++++++++-- .../pylibcudf/libcudf/concatenate.pxd | 9 ++++--- .../pylibcudf/pylibcudf/libcudf/interop.pxd | 8 +++++-- python/pylibcudf/pylibcudf/libcudf/json.pxd | 5 +++- .../pylibcudf/pylibcudf/libcudf/labeling.pxd | 7 ++++-- python/pylibcudf/pylibcudf/libcudf/merge.pxd | 5 +++- .../libcudf/nvtext/edit_distance.pxd | 10 +++++--- python/pylibcudf/pylibcudf/libcudf/round.pxd | 5 +++- .../pylibcudf/libcudf/strings/repeat.pxd | 12 +++++++--- .../pylibcudf/libcudf/strings/substring.pxd | 12 +++++++--- .../pylibcudf/pylibcudf/libcudf/transpose.pxd | 7 ++++-- python/pylibcudf/pylibcudf/merge.pxd | 5 +++- python/pylibcudf/pylibcudf/merge.pyi | 3 +++ python/pylibcudf/pylibcudf/merge.pyx | 13 ++++++++-- .../pylibcudf/nvtext/edit_distance.pxd | 7 +++--- .../pylibcudf/nvtext/edit_distance.pyi | 10 ++++++-- .../pylibcudf/nvtext/edit_distance.pyx | 24 +++++++++++++------ python/pylibcudf/pylibcudf/round.pxd | 7 ++++-- python/pylibcudf/pylibcudf/round.pyi | 3 +++ python/pylibcudf/pylibcudf/round.pyx | 15 +++++++++--- python/pylibcudf/pylibcudf/strings/repeat.pxd | 7 ++++-- python/pylibcudf/pylibcudf/strings/repeat.pyi | 8 ++++++- python/pylibcudf/pylibcudf/strings/repeat.pyx | 21 ++++++++++++---- python/pylibcudf/pylibcudf/strings/slice.pxd | 6 +++-- python/pylibcudf/pylibcudf/strings/slice.pyi | 3 +++ python/pylibcudf/pylibcudf/strings/slice.pyx | 19 +++++++++++---- python/pylibcudf/pylibcudf/transpose.pxd | 6 +++-- python/pylibcudf/pylibcudf/transpose.pyi | 5 +++- python/pylibcudf/pylibcudf/transpose.pyx | 15 ++++++++---- 40 files changed, 280 insertions(+), 91 deletions(-) diff --git a/python/pylibcudf/pylibcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/concatenate.pxd index c506ffb93c9..629c88161ae 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/concatenate.pxd @@ -1,10 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from .table cimport Table +from rmm.pylibrmm.stream cimport Stream + # There is no way to define a fused type that is a list of other objects, so we cannot # unify the column and table paths without using runtime dispatch instead. 
In this case # we choose to prioritize API consistency over performance, so we use the same function # with a bit of runtime dispatch overhead. -cpdef concatenate(list objects) +cpdef concatenate(list objects, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/concatenate.pyi b/python/pylibcudf/pylibcudf/concatenate.pyi index 79076f509e0..020ba1eb997 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyi +++ b/python/pylibcudf/pylibcudf/concatenate.pyi @@ -1,8 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.table import Table def concatenate[ColumnOrTable: (Column, Table)]( objects: list[ColumnOrTable], + stream: Stream | None = None, ) -> ColumnOrTable: ... diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx index 42c5f34cf3e..2937732e2cd 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -9,18 +9,23 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view +from rmm.pylibrmm.stream cimport Stream + from .column cimport Column from .table cimport Table +from .utils cimport _get_stream __all__ = ["concatenate"] -cpdef concatenate(list objects): +cpdef concatenate(list objects, Stream stream=None): """Concatenate columns or tables. Parameters ---------- objects : Union[List[Column], List[Table]] The list of Columns or Tables to concatenate. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -32,6 +37,7 @@ cpdef concatenate(list objects): cdef vector[column_view] c_columns cdef vector[table_view] c_tables + stream = _get_stream(stream) cdef unique_ptr[column] c_col_result cdef unique_ptr[table] c_tbl_result @@ -41,14 +47,14 @@ cpdef concatenate(list objects): c_tables.push_back((tbl).view()) with nogil: - c_tbl_result = cpp_concatenate.concatenate(c_tables) - return Table.from_libcudf(move(c_tbl_result)) + c_tbl_result = cpp_concatenate.concatenate(c_tables, stream.view()) + return Table.from_libcudf(move(c_tbl_result), stream) elif isinstance(objects[0], Column): for column in objects: c_columns.push_back((column).view()) with nogil: - c_col_result = cpp_concatenate.concatenate(c_columns) - return Column.from_libcudf(move(c_col_result)) + c_col_result = cpp_concatenate.concatenate(c_columns, stream.view()) + return Column.from_libcudf(move(c_col_result), stream) else: raise ValueError("input must be a list of Columns or Tables") diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd index 7cf3be08e09..a02261db74c 100644 --- a/python/pylibcudf/pylibcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/interop.pxd @@ -1,7 +1,8 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.table cimport Table +from rmm.pylibrmm.stream cimport Stream -cpdef Table from_dlpack(object managed_tensor) +cpdef Table from_dlpack(object managed_tensor, Stream stream=*) -cpdef object to_dlpack(Table input) +cpdef object to_dlpack(Table input, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/interop.pyi b/python/pylibcudf/pylibcudf/interop.pyi index 63de816010b..decd0e412c4 100644 --- a/python/pylibcudf/pylibcudf/interop.pyi +++ b/python/pylibcudf/pylibcudf/interop.pyi @@ -6,6 +6,8 @@ from typing import Any, overload import pyarrow as pa +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.table import Table @@ -48,5 +50,7 @@ def to_arrow( def to_arrow( obj: Scalar, metadata: ColumnMetadata | str | None = None ) -> pa.Scalar[Any]: ... -def from_dlpack(managed_tensor: Any) -> Table: ... -def to_dlpack(input: Table) -> Any: ... +def from_dlpack( + managed_tensor: Any, stream: Stream | None = None +) -> Table: ... +def to_dlpack(input: Table, stream: Stream | None = None) -> Any: ... diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index f31b82153c1..0595bb4a777 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -18,11 +18,14 @@ from pylibcudf.libcudf.interop cimport ( ) from pylibcudf.libcudf.table.table cimport table +from rmm.pylibrmm.stream cimport Stream + from .column cimport Column from .scalar cimport Scalar from .table cimport Table from .types cimport DataType, type_id from .types import LIBCUDF_TO_ARROW_TYPES +from .utils cimport _get_stream from ._interop_helpers import ColumnMetadata try: @@ -177,7 +180,7 @@ if pa is not None: return to_arrow(Column.from_scalar(plc_object, 1), metadata=metadata)[0] -cpdef Table from_dlpack(object managed_tensor): +cpdef Table from_dlpack(object managed_tensor, Stream stream=None): """ Convert a DLPack DLTensor into a cudf table. @@ -187,6 +190,8 @@ cpdef Table from_dlpack(object managed_tensor): ---------- managed_tensor : PyCapsule A 1D or 2D column-major (Fortran order) tensor. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -202,6 +207,7 @@ cpdef Table from_dlpack(object managed_tensor): if dlpack_tensor is NULL: raise ValueError("PyCapsule object contained a NULL pointer") PyCapsule_SetName(managed_tensor, "used_dltensor") + stream = _get_stream(stream) # Note: A copy is always performed when converting the dlpack # data to a libcudf table. We also delete the dlpack_tensor pointer @@ -209,14 +215,14 @@ cpdef Table from_dlpack(object managed_tensor): # TODO: https://github.com/rapidsai/cudf/issues/10874 # TODO: https://github.com/rapidsai/cudf/issues/10849 with nogil: - c_result = cpp_from_dlpack(dlpack_tensor) + c_result = cpp_from_dlpack(dlpack_tensor, stream.view()) - cdef Table result = Table.from_libcudf(move(c_result)) + cdef Table result = Table.from_libcudf(move(c_result), stream) dlpack_tensor.deleter(dlpack_tensor) return result -cpdef object to_dlpack(Table input): +cpdef object to_dlpack(Table input, Stream stream=None): """ Convert a cudf table into a DLPack DLTensor. @@ -226,6 +232,8 @@ cpdef object to_dlpack(Table input): ---------- input : Table A 1D or 2D column-major (Fortran order) tensor. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -239,9 +247,10 @@ cpdef object to_dlpack(Table input): "Input is required to have null count as zero." 
) cdef DLManagedTensor *dlpack_tensor + stream = _get_stream(stream) with nogil: - dlpack_tensor = cpp_to_dlpack(input.view()) + dlpack_tensor = cpp_to_dlpack(input.view(), stream.view()) return PyCapsule_New( dlpack_tensor, diff --git a/python/pylibcudf/pylibcudf/json.pxd b/python/pylibcudf/pylibcudf/json.pxd index 87a87349b8a..81062980608 100644 --- a/python/pylibcudf/pylibcudf/json.pxd +++ b/python/pylibcudf/pylibcudf/json.pxd @@ -1,9 +1,11 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.column cimport Column from pylibcudf.libcudf.json cimport get_json_object_options from pylibcudf.scalar cimport Scalar +from rmm.pylibrmm.stream cimport Stream + cdef class GetJsonObjectOptions: cdef get_json_object_options options @@ -12,5 +14,6 @@ cdef class GetJsonObjectOptions: cpdef Column get_json_object( Column col, Scalar json_path, - GetJsonObjectOptions options=* + GetJsonObjectOptions options=*, + Stream stream=* ) diff --git a/python/pylibcudf/pylibcudf/json.pyi b/python/pylibcudf/pylibcudf/json.pyi index b93d4876dab..e7d00013103 100644 --- a/python/pylibcudf/pylibcudf/json.pyi +++ b/python/pylibcudf/pylibcudf/json.pyi @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.scalar import Scalar @@ -19,5 +21,8 @@ class GetJsonObjectOptions: def set_missing_fields_as_nulls(self, val: bool) -> None: ... def get_json_object( - col: Column, json_path: Scalar, options: GetJsonObjectOptions | None = None + col: Column, + json_path: Scalar, + options: GetJsonObjectOptions | None = None, + stream: Stream | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx index 5ec1e1be971..2836eedc5ff 100644 --- a/python/pylibcudf/pylibcudf/json.pyx +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp cimport bool @@ -10,6 +10,10 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.scalar cimport Scalar +from rmm.pylibrmm.stream cimport Stream + +from .utils cimport _get_stream + __all__ = ["GetJsonObjectOptions", "get_json_object"] cdef class GetJsonObjectOptions: @@ -113,7 +117,8 @@ cdef class GetJsonObjectOptions: cpdef Column get_json_object( Column col, Scalar json_path, - GetJsonObjectOptions options=None + GetJsonObjectOptions options=None, + Stream stream=None ): """ Apply a JSONPath string to all rows in an input strings column. @@ -131,6 +136,9 @@ cpdef Column get_json_object( options : GetJsonObjectOptions Options for controlling the behavior of the function. + stream : Stream | None + CUDA stream on which to perform the operation. 
+ Returns ------- Column @@ -144,12 +152,14 @@ cpdef Column get_json_object( options = GetJsonObjectOptions() cdef cpp_json.get_json_object_options c_options = options.options + stream = _get_stream(stream) with nogil: c_result = cpp_json.get_json_object( col.view(), dereference(c_json_path), - c_options + c_options, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd index b1f9f2e806d..13205ee19ba 100644 --- a/python/pylibcudf/pylibcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/labeling.pxd @@ -1,14 +1,17 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool from pylibcudf.libcudf.labeling cimport inclusive from .column cimport Column +from rmm.pylibrmm.stream cimport Stream + cpdef Column label_bins( Column input, Column left_edges, inclusive left_inclusive, Column right_edges, - inclusive right_inclusive + inclusive right_inclusive, + Stream stream=* ) diff --git a/python/pylibcudf/pylibcudf/labeling.pyi b/python/pylibcudf/pylibcudf/labeling.pyi index c3a75d10baf..7f0a42cad9c 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyi +++ b/python/pylibcudf/pylibcudf/labeling.pyi @@ -2,6 +2,8 @@ from enum import IntEnum +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column class Inclusive(IntEnum): @@ -14,4 +16,5 @@ def label_bins( left_inclusive: Inclusive, right_edges: Column, right_inclusive: Inclusive, + stream: Stream | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index 0d93463cc7e..d5be40f2558 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -8,7 +8,10 @@ from pylibcudf.libcudf.labeling cimport inclusive from pylibcudf.libcudf.labeling import inclusive as Inclusive # no-cython-lint +from rmm.pylibrmm.stream cimport Stream + from .column cimport Column +from .utils cimport _get_stream __all__ = ["Inclusive", "label_bins"] @@ -17,7 +20,8 @@ cpdef Column label_bins( Column left_edges, inclusive left_inclusive, Column right_edges, - inclusive right_inclusive + inclusive right_inclusive, + Stream stream=None ): """Labels elements based on membership in the specified bins. @@ -35,6 +39,8 @@ cpdef Column label_bins( Column of the right edge of each bin. right_inclusive : Inclusive Whether or not the right edge is inclusive. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -43,6 +49,8 @@ cpdef Column label_bins( according to the specified bins. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) + with nogil: c_result = cpp_labeling.label_bins( input.view(), @@ -50,8 +58,9 @@ cpdef Column label_bins( left_inclusive, right_edges.view(), right_inclusive, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) Inclusive.__str__ = Inclusive.__repr__ diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 0a827b21cda..3711105401d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -7,6 +7,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view from pylibcudf.libcudf.utilities.span cimport host_span from rmm.librmm.device_buffer cimport device_buffer +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: @@ -20,8 +21,10 @@ cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: # ) except +libcudf_exception_handler cdef unique_ptr[column] concatenate( - const vector[column_view] columns + const vector[column_view] columns, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] concatenate( - const vector[table_view] tables + const vector[table_view] tables, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd index e626c2380d1..257bdcea739 100644 --- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -9,6 +9,8 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "dlpack/dlpack.h" nogil: ctypedef struct DLManagedTensor: @@ -33,11 +35,13 @@ cdef extern from "cudf/interop.hpp" nogil: cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: cdef unique_ptr[table] from_dlpack( - const DLManagedTensor* managed_tensor + const DLManagedTensor* managed_tensor, + cuda_stream_view stream ) except +libcudf_exception_handler DLManagedTensor* to_dlpack( - const table_view& input + const table_view& input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef cppclass column_metadata: diff --git a/python/pylibcudf/pylibcudf/libcudf/json.pxd b/python/pylibcudf/pylibcudf/libcudf/json.pxd index d5bdd6d299a..06fb0e6ef99 100644 --- a/python/pylibcudf/pylibcudf/libcudf/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/json.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -7,6 +7,8 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil: cdef cppclass get_json_object_options: @@ -26,4 +28,5 @@ cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil: column_view col, string_scalar json_path, get_json_object_options options, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd index e5dbec879ce..d2681c46b15 100644 --- a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd @@ -1,10 +1,12 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
from libcpp cimport int from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil: cpdef enum class inclusive(int): @@ -16,5 +18,6 @@ cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil: const column_view &left_edges, inclusive left_inclusive, const column_view &right_edges, - inclusive right_inclusive + inclusive right_inclusive, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/merge.pxd b/python/pylibcudf/pylibcudf/libcudf/merge.pxd index f546ae3bbdd..fa17e1000f9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/merge.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/merge.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. cimport pylibcudf.libcudf.types as libcudf_types from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -6,6 +6,8 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/merge.hpp" namespace "cudf" nogil: cdef unique_ptr[table] merge ( @@ -13,4 +15,5 @@ cdef extern from "cudf/merge.hpp" namespace "cudf" nogil: vector[libcudf_types.size_type] key_cols, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd index fbb1c0b2f4c..0d54a3cdc11 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd @@ -1,18 +1,22 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] edit_distance( const column_view & strings, - const column_view & targets + const column_view & targets, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] edit_distance_matrix( - const column_view & strings + const column_view & strings, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/round.pxd b/python/pylibcudf/pylibcudf/libcudf/round.pxd index efd9e3de25d..58bc9b5a4dc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/round.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/round.pxd @@ -1,10 +1,12 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/round.hpp" namespace "cudf" nogil: @@ -16,4 +18,5 @@ cdef extern from "cudf/round.hpp" namespace "cudf" nogil: const column_view& input, int32_t decimal_places, rounding_method method, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd index de65b554eba..91ec6a53d50 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd @@ -1,18 +1,24 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] repeat_strings( column_view input, - size_type repeat_times) except +libcudf_exception_handler + size_type repeat_times, + cuda_stream_view stream + ) except +libcudf_exception_handler cdef unique_ptr[column] repeat_strings( column_view input, - column_view repeat_times) except +libcudf_exception_handler + column_view repeat_times, + cuda_stream_view stream + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd index f573870583d..e1857fde33d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column @@ -6,15 +6,21 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/slice.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] slice_strings( column_view source_strings, numeric_scalar[size_type] start, numeric_scalar[size_type] end, - numeric_scalar[size_type] step) except +libcudf_exception_handler + numeric_scalar[size_type] step, + cuda_stream_view stream + ) except +libcudf_exception_handler cdef unique_ptr[column] slice_strings( column_view source_strings, column_view starts, - column_view stops) except +libcudf_exception_handler + column_view stops, + cuda_stream_view stream + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/transpose.pxd b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd index fde49afd99c..ab0944914f2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transpose.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd @@ -1,15 +1,18 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table_view cimport table_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/transpose.hpp" namespace "cudf" nogil: cdef pair[ unique_ptr[column], table_view ] transpose( - table_view input_table + table_view input_table, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/merge.pxd b/python/pylibcudf/pylibcudf/merge.pxd index 4b598aa8f4f..0cfe40dbe2c 100644 --- a/python/pylibcudf/pylibcudf/merge.pxd +++ b/python/pylibcudf/pylibcudf/merge.pxd @@ -1,11 +1,14 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from .table cimport Table +from rmm.pylibrmm.stream cimport Stream + cpdef Table merge ( list tables_to_merge, list key_cols, list column_order, list null_precedence, + Stream stream=* ) diff --git a/python/pylibcudf/pylibcudf/merge.pyi b/python/pylibcudf/pylibcudf/merge.pyi index b18eb01f8a2..26cfdc6ea0c 100644 --- a/python/pylibcudf/pylibcudf/merge.pyi +++ b/python/pylibcudf/pylibcudf/merge.pyi @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.table import Table from pylibcudf.types import NullOrder, Order @@ -8,4 +10,5 @@ def merge( key_cols: list[int], column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index c051cdc0c66..cc585b11cc5 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -8,7 +8,10 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_order, order, size_type +from rmm.pylibrmm.stream cimport Stream + from .table cimport Table +from .utils cimport _get_stream __all__ = ["merge"] @@ -17,6 +20,7 @@ cpdef Table merge ( list key_cols, list column_order, list null_precedence, + Stream stream=None ): """Merge a set of sorted tables. @@ -32,6 +36,8 @@ cpdef Table merge ( Whether each column should be sorted in ascending or descending order. null_precedence : List[NullOrder] Whether nulls should come before or after non-nulls. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -47,11 +53,14 @@ cpdef Table merge ( c_tables_to_merge.push_back(( tbl).view()) cdef unique_ptr[table] c_result + stream = _get_stream(stream) + with nogil: c_result = cpp_merge.merge( c_tables_to_merge, c_key_cols, c_column_order, c_null_precedence, + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd index 446b95afabb..b915f8753dd 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -1,8 +1,9 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.column cimport Column +from rmm.pylibrmm.stream cimport Stream -cpdef Column edit_distance(Column input, Column targets) +cpdef Column edit_distance(Column input, Column targets, Stream stream=*) -cpdef Column edit_distance_matrix(Column input) +cpdef Column edit_distance_matrix(Column input, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi index 85bbbb880ee..50b3495f625 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi @@ -1,6 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column -def edit_distance(input: Column, targets: Column) -> Column: ... -def edit_distance_matrix(input: Column) -> Column: ... +def edit_distance( + input: Column, targets: Column, stream: Stream | None = None +) -> Column: ... +def edit_distance_matrix( + input: Column, stream: Stream | None = None +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx index eceeaff24e3..00d0ea26c0d 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -9,9 +9,13 @@ from pylibcudf.libcudf.nvtext.edit_distance cimport ( edit_distance_matrix as cpp_edit_distance_matrix, ) +from rmm.pylibrmm.stream cimport Stream + +from ..utils cimport _get_stream + __all__ = ["edit_distance", "edit_distance_matrix"] -cpdef Column edit_distance(Column input, Column targets): +cpdef Column edit_distance(Column input, Column targets, Stream stream=None): """ Returns the edit distance between individual strings in two strings columns @@ -23,6 +27,8 @@ cpdef Column edit_distance(Column input, Column targets): Input strings targets : Column Strings to compute edit distance against + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -32,14 +38,15 @@ cpdef Column edit_distance(Column input, Column targets): cdef column_view c_strings = input.view() cdef column_view c_targets = targets.view() cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_edit_distance(c_strings, c_targets) + c_result = cpp_edit_distance(c_strings, c_targets, stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column edit_distance_matrix(Column input): +cpdef Column edit_distance_matrix(Column input, Stream stream=None): """ Returns the edit distance between all strings in the input strings column @@ -49,6 +56,8 @@ cpdef Column edit_distance_matrix(Column input): ---------- input : Column Input strings + stream : Stream | None + CUDA stream on which to perform the operation. 
Returns ------- @@ -57,8 +66,9 @@ cpdef Column edit_distance_matrix(Column input): """ cdef column_view c_strings = input.view() cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_edit_distance_matrix(c_strings) + c_result = cpp_edit_distance_matrix(c_strings, stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/round.pxd b/python/pylibcudf/pylibcudf/round.pxd index c8501b03fad..4ab17203d31 100644 --- a/python/pylibcudf/pylibcudf/round.pxd +++ b/python/pylibcudf/pylibcudf/round.pxd @@ -1,12 +1,15 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport int32_t from pylibcudf.libcudf.round cimport rounding_method from .column cimport Column +from rmm.pylibrmm.stream cimport Stream + cpdef Column round( Column source, int32_t decimal_places = *, - rounding_method round_method = * + rounding_method round_method = *, + Stream stream = * ) diff --git a/python/pylibcudf/pylibcudf/round.pyi b/python/pylibcudf/pylibcudf/round.pyi index 410cf5de586..676c6d609aa 100644 --- a/python/pylibcudf/pylibcudf/round.pyi +++ b/python/pylibcudf/pylibcudf/round.pyi @@ -2,6 +2,8 @@ from enum import IntEnum +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column class RoundingMethod(IntEnum): @@ -12,4 +14,5 @@ def round( source: Column, decimal_places: int = 0, round_method: RoundingMethod = RoundingMethod.HALF_UP, + stream: Stream | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index 024cf47a224..0bead59fc3b 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -9,14 +9,18 @@ from pylibcudf.libcudf.round import \ from pylibcudf.libcudf.column.column cimport column +from rmm.pylibrmm.stream cimport Stream + from .column cimport Column +from .utils cimport _get_stream __all__ = ["RoundingMethod", "round"] cpdef Column round( Column source, int32_t decimal_places = 0, - rounding_method round_method = rounding_method.HALF_UP + rounding_method round_method = rounding_method.HALF_UP, + Stream stream=None ): """Rounds all the values in a column to the specified number of decimal places. @@ -32,6 +36,8 @@ cpdef Column round( The method by which to round each value. Can be one of { RoundingMethod.HALF_UP, RoundingMethod.HALF_EVEN } (default rounding_method.HALF_UP) + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -39,13 +45,16 @@ cpdef Column round( A Column with values rounded """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) + with nogil: c_result = cpp_round( source.view(), decimal_places, - round_method + round_method, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) RoundingMethod.__str__ = RoundingMethod.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/strings/repeat.pxd index bc70926b6fa..fd97a1671f8 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/strings/repeat.pxd @@ -1,10 +1,13 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type +from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnorSizeType: Column size_type -cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times) +cpdef Column repeat_strings( + Column input, ColumnorSizeType repeat_times, Stream stream=* +) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyi b/python/pylibcudf/pylibcudf/strings/repeat.pyi index 93a46b71caa..246ac9874c7 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyi +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyi @@ -1,5 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column -def repeat_strings(input: Column, repeat_times: Column | int) -> Column: ... +def repeat_strings( + input: Column, + repeat_times: Column | int, + stream: Stream | None = None, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index a497b1f438e..409a551c60d 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column @@ -6,9 +6,15 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type +from rmm.pylibrmm.stream cimport Stream + +from ..utils cimport _get_stream + __all__ = ["repeat_strings"] -cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): +cpdef Column repeat_strings( + Column input, ColumnorSizeType repeat_times, Stream stream=None +): """ Repeat each string in the given strings column by the numbers of times given in another numeric column. @@ -22,6 +28,8 @@ cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): repeat_times : Column or int Number(s) of times that the corresponding input strings for each row are repeated. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -29,20 +37,23 @@ cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): New column containing the repeated strings. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) if ColumnorSizeType is Column: with nogil: c_result = cpp_repeat.repeat_strings( input.view(), - repeat_times.view() + repeat_times.view(), + stream.view() ) elif ColumnorSizeType is size_type: with nogil: c_result = cpp_repeat.repeat_strings( input.view(), - repeat_times + repeat_times, + stream.view() ) else: raise ValueError("repeat_times must be size_type or integer") - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/strings/slice.pxd b/python/pylibcudf/pylibcudf/strings/slice.pxd index 01e9f2b3c88..d33e9dd2828 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pxd +++ b/python/pylibcudf/pylibcudf/strings/slice.pxd @@ -1,7 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.column cimport Column from pylibcudf.scalar cimport Scalar +from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnOrScalar: Column @@ -11,5 +12,6 @@ cpdef Column slice_strings( Column input, ColumnOrScalar start=*, ColumnOrScalar stop=*, - Scalar step=* + Scalar step=*, + Stream stream=* ) diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyi b/python/pylibcudf/pylibcudf/strings/slice.pyi index 7bf9a7cb8c6..668524fc714 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyi +++ b/python/pylibcudf/pylibcudf/strings/slice.pyi @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.scalar import Scalar @@ -8,4 +10,5 @@ def slice_strings( start: Column | Scalar | None = None, stop: Column | Scalar | None = None, step: Scalar | None = None, + stream: Stream | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx index d32de7c50e0..bf09d3963ff 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyx +++ b/python/pylibcudf/pylibcudf/strings/slice.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,6 +13,9 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from cython.operator import dereference +from rmm.pylibrmm.stream cimport Stream + +from ..utils cimport _get_stream __all__ = ["slice_strings"] @@ -20,7 +23,8 @@ cpdef Column slice_strings( Column input, ColumnOrScalar start=None, ColumnOrScalar stop=None, - Scalar step=None + Scalar step=None, + Stream stream=None ): """Perform a slice operation on a strings column. @@ -41,6 +45,8 @@ cpdef Column slice_strings( The end character position or positions step : Scalar Distance between input characters retrieved + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -51,6 +57,7 @@ cpdef Column slice_strings( cdef numeric_scalar[size_type]* cpp_start cdef numeric_scalar[size_type]* cpp_stop cdef numeric_scalar[size_type]* cpp_step + stream = _get_stream(stream) if input is None: raise ValueError("input cannot be None") @@ -68,7 +75,8 @@ cpdef Column slice_strings( c_result = cpp_slice.slice_strings( input.view(), start.view(), - stop.view() + stop.view(), + stream.view() ) elif ColumnOrScalar is Scalar: @@ -94,9 +102,10 @@ cpdef Column slice_strings( input.view(), dereference(cpp_start), dereference(cpp_stop), - dereference(cpp_step) + dereference(cpp_step), + stream.view() ) else: raise ValueError("start, stop, and step must be either Column or Scalar") - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/transpose.pxd b/python/pylibcudf/pylibcudf/transpose.pxd index 7b5a7676b49..28db765e48f 100644 --- a/python/pylibcudf/pylibcudf/transpose.pxd +++ b/python/pylibcudf/pylibcudf/transpose.pxd @@ -1,5 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from .table cimport Table +from rmm.pylibrmm.stream cimport Stream -cpdef Table transpose(Table input_table) + +cpdef Table transpose(Table input_table, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/transpose.pyi b/python/pylibcudf/pylibcudf/transpose.pyi index a84ab8a60ea..4acbac7ea52 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyi +++ b/python/pylibcudf/pylibcudf/transpose.pyi @@ -1,4 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream + from pylibcudf.table import Table -def transpose(input_table: Table) -> Table: ... +def transpose(input_table: Table, stream: Stream | None = None) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx index 5eb3e58cebc..cbb23e2358e 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyx +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -6,12 +6,15 @@ from pylibcudf.libcudf cimport transpose as cpp_transpose from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table_view cimport table_view +from rmm.pylibrmm.stream cimport Stream + from .column cimport Column from .table cimport Table +from .utils cimport _get_stream __all__ = ["transpose"] -cpdef Table transpose(Table input_table): +cpdef Table transpose(Table input_table, Stream stream=None): """Transpose a Table. For details, see :cpp:func:`transpose`. @@ -20,6 +23,8 @@ cpdef Table transpose(Table input_table): ---------- input_table : Table Table to transpose + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -28,12 +33,14 @@ cpdef Table transpose(Table input_table): """ cdef pair[unique_ptr[column], table_view] c_result cdef Table owner_table + stream = _get_stream(stream) with nogil: - c_result = cpp_transpose.transpose(input_table.view()) + c_result = cpp_transpose.transpose(input_table.view(), stream.view()) owner_table = Table( - [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() + [Column.from_libcudf(move(c_result.first), stream)] * + c_result.second.num_columns() ) return Table.from_table_view(c_result.second, owner_table) From bf14b22a3ed88776fba15c604029829e21ec861e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 1 Aug 2025 12:35:06 -0700 Subject: [PATCH 045/366] Add hash-based SUM_WITH_OVERFLOW aggregation for INT64 values (#19403) Contributes to #19243 This PR introduces a new aggregation kind, `SUM_WITH_OVERFLOW`, which returns a `STRUCT` containing the sum and a boolean indicating overflow. If an overflow occurs, the corresponding row in the overflow boolean column will be set to `true`. 
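A minimal usage sketch of the new aggregation, assuming hypothetical `keys` and `vals` columns (the `sum_with_overflow` wrapper below is illustrative, not part of this change). Per the tests in this PR, the value column must be INT64 and the aggregation is dispatched through the hash-based groupby only:

```cpp
#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <utility>
#include <vector>

std::unique_ptr<cudf::column> sum_with_overflow(cudf::column_view keys,
                                                cudf::column_view vals)  // vals must be INT64
{
  // Group by a single hypothetical key column.
  cudf::groupby::groupby gb{cudf::table_view{{keys}}};

  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = vals;
  requests[0].aggregations.push_back(
    cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>());

  auto [group_keys, results] = gb.aggregate(requests);

  // Each result row is a STRUCT: child(0) holds the (wrapped) INT64 sum,
  // child(1) holds the BOOL8 overflow flag for that group.
  return std::move(results[0].results[0]);
}
```

Because the wrapped sum and the per-group overflow flag come back together in one struct column, callers can detect overflow without a separate validation pass over the input.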
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - David Wendt (https://github.com/davidwendt) - Matthew Roeschke (https://github.com/mroeschke) - Nghia Truong (https://github.com/ttnghia) - Lawrence Mitchell (https://github.com/wence-) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/19403 --- cpp/include/cudf/aggregation.hpp | 84 +++---- .../cudf/detail/aggregation/aggregation.cuh | 4 + .../cudf/detail/aggregation/aggregation.hpp | 51 ++++- .../detail/aggregation/device_aggregators.cuh | 40 ++++ cpp/src/aggregation/aggregation.cpp | 23 ++ cpp/src/aggregation/aggregation.cu | 42 +++- cpp/src/groupby/groupby.cu | 8 + cpp/src/groupby/hash/compute_aggregations.cuh | 30 ++- .../hash/create_sparse_results_table.cu | 90 ++++++-- cpp/src/groupby/hash/groupby.cu | 5 +- .../sort/group_single_pass_reduction_util.cuh | 1 - cpp/tests/groupby/sum_tests.cpp | 206 +++++++++++++++++- python/pylibcudf/tests/test_aggregation.py | 2 +- 13 files changed, 497 insertions(+), 89 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index bb086b611c7..d379674fad6 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -93,45 +93,46 @@ class aggregation { * @brief Possible aggregation operations */ enum Kind { - SUM, ///< sum reduction - PRODUCT, ///< product reduction - MIN, ///< min reduction - MAX, ///< max reduction - COUNT_VALID, ///< count number of valid elements - COUNT_ALL, ///< count number of elements - ANY, ///< any reduction - ALL, ///< all reduction - SUM_OF_SQUARES, ///< sum of squares reduction - MEAN, ///< arithmetic mean reduction - M2, ///< sum of squares of differences from the mean - VARIANCE, ///< variance - STD, ///< standard deviation - MEDIAN, ///< median reduction - QUANTILE, ///< compute specified quantile(s) - ARGMAX, ///< Index of max element - ARGMIN, ///< Index of min element - NUNIQUE, ///< count number of unique elements - NTH_ELEMENT, ///< get the nth element - ROW_NUMBER, ///< get row-number of current index (relative to rolling window) - EWMA, ///< get exponential weighted moving average at current index - RANK, ///< get rank of current index - COLLECT_LIST, ///< collect values into a list - COLLECT_SET, ///< collect values into a list without duplicate entries - LEAD, ///< window function, accesses row at specified offset following current row - LAG, ///< window function, accesses row at specified offset preceding current row - PTX, ///< PTX based UDF aggregation - CUDA, ///< CUDA based UDF aggregation - HOST_UDF, ///< host based UDF aggregation - MERGE_LISTS, ///< merge multiple lists values into one list - MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries - MERGE_M2, ///< merge partial values of M2 aggregation, - COVARIANCE, ///< covariance between two sets of elements - CORRELATION, ///< correlation between two sets of elements - TDIGEST, ///< create a tdigest from a set of input values - MERGE_TDIGEST, ///< create a tdigest by merging multiple tdigests together - HISTOGRAM, ///< compute frequency of each element - MERGE_HISTOGRAM, ///< merge partial values of HISTOGRAM aggregation - BITWISE_AGG ///< bitwise aggregation on numeric columns + SUM, ///< sum reduction + SUM_WITH_OVERFLOW, ///< sum reduction with overflow detection + PRODUCT, ///< product reduction + MIN, ///< min reduction + MAX, ///< max reduction + COUNT_VALID, ///< count number of valid 
elements + COUNT_ALL, ///< count number of elements + ANY, ///< any reduction + ALL, ///< all reduction + SUM_OF_SQUARES, ///< sum of squares reduction + MEAN, ///< arithmetic mean reduction + M2, ///< sum of squares of differences from the mean + VARIANCE, ///< variance + STD, ///< standard deviation + MEDIAN, ///< median reduction + QUANTILE, ///< compute specified quantile(s) + ARGMAX, ///< Index of max element + ARGMIN, ///< Index of min element + NUNIQUE, ///< count number of unique elements + NTH_ELEMENT, ///< get the nth element + ROW_NUMBER, ///< get row-number of current index (relative to rolling window) + EWMA, ///< get exponential weighted moving average at current index + RANK, ///< get rank of current index + COLLECT_LIST, ///< collect values into a list + COLLECT_SET, ///< collect values into a list without duplicate entries + LEAD, ///< window function, accesses row at specified offset following current row + LAG, ///< window function, accesses row at specified offset preceding current row + PTX, ///< PTX based UDF aggregation + CUDA, ///< CUDA based UDF aggregation + HOST_UDF, ///< host based UDF aggregation + MERGE_LISTS, ///< merge multiple lists values into one list + MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries + MERGE_M2, ///< merge partial values of M2 aggregation, + COVARIANCE, ///< covariance between two sets of elements + CORRELATION, ///< correlation between two sets of elements + TDIGEST, ///< create a tdigest from a set of input values + MERGE_TDIGEST, ///< create a tdigest by merging multiple tdigests together + HISTOGRAM, ///< compute frequency of each element + MERGE_HISTOGRAM, ///< merge partial values of HISTOGRAM aggregation + BITWISE_AGG ///< bitwise aggregation on numeric columns }; aggregation() = delete; @@ -271,6 +272,11 @@ enum class ewm_history : int32_t { INFINITE, FINITE }; template std::unique_ptr make_sum_aggregation(); +/// Factory to create a SUM_WITH_OVERFLOW aggregation +/// @return A SUM_WITH_OVERFLOW aggregation object +template +std::unique_ptr make_sum_with_overflow_aggregation(); + /// Factory to create a PRODUCT aggregation /// @return A PRODUCT aggregation object template diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 00bdb229391..2124a131c19 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -78,6 +78,10 @@ struct corresponding_operator { using type = DeviceSum; }; template <> +struct corresponding_operator { + using type = DeviceSum; +}; +template <> struct corresponding_operator { using type = DeviceProduct; }; diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 49ad841cc33..81084f8bdfb 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -39,6 +40,8 @@ class simple_aggregations_collector { // Declares the interface for the simple aggregation const& agg); virtual std::vector> visit(data_type col_type, class sum_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class sum_with_overflow_aggregation const& agg); virtual std::vector> visit(data_type col_type, class product_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -116,6 +119,7 @@ class aggregation_finalizer { // 
Declares the interface for the finalizer // Declare overloads for each kind of a agg to dispatch virtual void visit(aggregation const& agg); virtual void visit(class sum_aggregation const& agg); + virtual void visit(class sum_with_overflow_aggregation const& agg); virtual void visit(class product_aggregation const& agg); virtual void visit(class min_aggregation const& agg); virtual void visit(class max_aggregation const& agg); @@ -177,6 +181,26 @@ class sum_aggregation final : public rolling_aggregation, void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived class for specifying a sum_with_overflow aggregation + */ +class sum_with_overflow_aggregation final : public groupby_aggregation, + public groupby_scan_aggregation { + public: + sum_with_overflow_aggregation() : aggregation(SUM_WITH_OVERFLOW) {} + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived class for specifying a product aggregation */ @@ -1352,11 +1376,12 @@ constexpr bool is_sum_product_agg(aggregation::Kind k) (k == aggregation::SUM_OF_SQUARES); } -// Summing/Multiplying integers of any type, always use int64_t accumulator +// Summing/Multiplying integers of any type, always use int64_t accumulator (except +// SUM_WITH_OVERFLOW which has its own template) template -struct target_type_impl && is_sum_product_agg(k)>> { + requires(std::is_integral_v && is_sum_product_agg(k) && + k != aggregation::SUM_WITH_OVERFLOW) +struct target_type_impl { using type = int64_t; }; @@ -1369,12 +1394,12 @@ struct target_type_impl< using type = Source; }; -// Summing/Multiplying float/doubles, use same type accumulator +// Summing/Multiplying float/doubles, use same type accumulator (except SUM_WITH_OVERFLOW which has +// its own template) template -struct target_type_impl< - Source, - k, - std::enable_if_t && is_sum_product_agg(k)>> { + requires(std::is_floating_point_v && is_sum_product_agg(k) && + k != aggregation::SUM_WITH_OVERFLOW) +struct target_type_impl { using type = Source; }; @@ -1386,6 +1411,12 @@ struct target_type_impl +struct target_type_impl { + using type = struct_view; // SUM_WITH_OVERFLOW outputs a struct with sum and overflow fields +}; + // Always use `double` for M2 template struct target_type_impl { @@ -1599,6 +1630,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind switch (k) { case aggregation::SUM: return f.template operator()(std::forward(args)...); + case aggregation::SUM_WITH_OVERFLOW: + return f.template operator()(std::forward(args)...); case aggregation::PRODUCT: return f.template operator()(std::forward(args)...); case aggregation::MIN: diff --git a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh index 3af5afd20cd..0c5b57c51a7 100644 --- a/cpp/include/cudf/detail/aggregation/device_aggregators.cuh +++ b/cpp/include/cudf/detail/aggregation/device_aggregators.cuh @@ -25,6 +25,7 @@ #include #include +#include #include namespace cudf::detail { @@ -154,6 +155,45 @@ struct update_target_element { } }; +template + requires(cuda::std::is_same_v) +struct update_target_element { + __device__ void 
operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + // For SUM_WITH_OVERFLOW, target is a struct with sum value at child(0) and overflow flag at + // child(1) + auto sum_column = target.child(0); + auto overflow_column = target.child(1); + + auto const source_value = source.element(source_index); + auto const old_sum = + cudf::detail::atomic_add(&sum_column.element(target_index), source_value); + + // Early exit if overflow is already set to avoid unnecessary overflow checking + auto bool_ref = cuda::atomic_ref{ + *(overflow_column.data() + target_index)}; + if (bool_ref.load(cuda::memory_order_relaxed)) { return; } + + // Check for overflow before performing the addition to avoid UB + // For positive overflow: old_sum > 0, source_value > 0, and old_sum > max - source_value + // For negative overflow: old_sum < 0, source_value < 0, and old_sum < min - source_value + // TODO: to be replaced by CCCL equivalents once https://github.com/NVIDIA/cccl/pull/3755 is + // ready + auto constexpr int64_max = cuda::std::numeric_limits::max(); + auto constexpr int64_min = cuda::std::numeric_limits::min(); + auto const overflow = + ((old_sum > 0 && source_value > 0 && old_sum > int64_max - source_value) || + (old_sum < 0 && source_value < 0 && old_sum < int64_min - source_value)); + if (overflow) { + // Atomically set overflow flag to true (use atomic_max since true > false) + cudf::detail::atomic_max(&overflow_column.element(target_index), true); + } + } +}; + /** * @brief Function object to update a single element in a target column using * the dictionary key addressed by the specific index. diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 7cf45e48cc3..cb3a2b80ae4 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -41,6 +41,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, sum_with_overflow_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, product_aggregation const& agg) { @@ -258,6 +264,11 @@ void aggregation_finalizer::visit(sum_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(sum_with_overflow_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(product_aggregation const& agg) { visit(static_cast(agg)); @@ -458,6 +469,18 @@ template CUDF_EXPORT std::unique_ptr make_sum_aggregation make_sum_aggregation(); +/// Factory to create a SUM_WITH_OVERFLOW aggregation +template +std::unique_ptr make_sum_with_overflow_aggregation() +{ + return std::make_unique(); +} +template CUDF_EXPORT std::unique_ptr make_sum_with_overflow_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_sum_with_overflow_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_sum_with_overflow_aggregation(); + /// Factory to create a PRODUCT aggregation template std::unique_ptr make_product_aggregation() diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index d4e112da777..c58d1f7af7c 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -49,13 +49,14 @@ struct identity_initializer { template static constexpr bool is_supported() { - return cudf::is_fixed_width() and - (k == 
aggregation::SUM or k == aggregation::MIN or k == aggregation::MAX or - k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL or - k == aggregation::ARGMAX or k == aggregation::ARGMIN or - k == aggregation::SUM_OF_SQUARES or k == aggregation::STD or - k == aggregation::VARIANCE or - (k == aggregation::PRODUCT and is_product_supported())); + return (cudf::is_fixed_width() and + (k == aggregation::SUM or k == aggregation::MIN or k == aggregation::MAX or + k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL or + k == aggregation::ARGMAX or k == aggregation::ARGMIN or + k == aggregation::SUM_OF_SQUARES or k == aggregation::STD or + k == aggregation::VARIANCE or + (k == aggregation::PRODUCT and is_product_supported()))) or + (k == aggregation::SUM_WITH_OVERFLOW and std::is_same_v); } template @@ -94,11 +95,28 @@ struct identity_initializer { void operator()(mutable_column_view const& col, rmm::cuda_stream_view stream) requires(is_supported()) { - using DeviceType = device_storage_type_t; - thrust::fill(rmm::exec_policy(stream), - col.begin(), - col.end(), - get_identity()); + if constexpr (k == aggregation::SUM_WITH_OVERFLOW) { + // SUM_WITH_OVERFLOW uses a struct with sum (int64_t) and overflow (bool) children + // Initialize sum child to 0 and overflow child to false + auto sum_col = col.child(0); + auto overflow_col = col.child(1); + + auto zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(sum_col.begin(), overflow_col.begin())); + thrust::fill(rmm::exec_policy_nosync(stream), + zip_begin, + zip_begin + col.size(), + thrust::make_tuple(int64_t{0}, false)); + } else if constexpr (std::is_same_v) { + // This should only happen for SUM_WITH_OVERFLOW, but handle it just in case + CUDF_FAIL("Struct columns are only supported for SUM_WITH_OVERFLOW aggregation"); + } else { + using DeviceType = device_storage_type_t; + thrust::fill(rmm::exec_policy_nosync(stream), + col.begin(), + col.end(), + get_identity()); + } } template diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 6f7db4938bb..bf3ee2a49aa 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -124,6 +124,14 @@ struct empty_column_constructor { } if constexpr (k == aggregation::Kind::MERGE_HISTOGRAM) { return empty_like(values); } + if constexpr (k == aggregation::Kind::SUM_WITH_OVERFLOW) { + // SUM_WITH_OVERFLOW returns a struct with sum (int64_t) and overflow (bool) children + std::vector> children; + children.push_back(make_empty_column(cudf::data_type{cudf::type_id::INT64})); + children.push_back(make_empty_column(cudf::data_type{cudf::type_id::BOOL8})); + return make_structs_column(0, std::move(children), 0, {}, stream, mr); + } + if constexpr (k == aggregation::Kind::RANK) { auto const& rank_agg = dynamic_cast(agg); if (rank_agg._method == cudf::rank_method::AVERAGE or diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index b97c8ddf88d..60a8b3c2f38 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -69,19 +69,29 @@ rmm::device_uvector compute_aggregations( auto const available_shmem_size = get_available_shared_memory_size(grid_size); auto const offsets_buffer_size = compute_shmem_offsets_size(flattened_values.num_columns()) * 2; auto const data_buffer_size = available_shmem_size - offsets_buffer_size; - auto const is_shared_memory_compatible = std::all_of( - requests.begin(), requests.end(), [&](cudf::groupby::aggregation_request const& request) 
{ - if (cudf::is_dictionary(request.values.type())) { return false; } - // Ensure there is enough buffer space to store local aggregations up to the max cardinality - // for shared memory aggregations - auto const size = cudf::type_dispatcher(request.values.type(), - size_of_functor{}); - return data_buffer_size >= (size * GROUPBY_CARDINALITY_THRESHOLD); + + // Check if any aggregation is SUM_WITH_OVERFLOW, which should always use global memory + auto const has_sum_with_overflow = + std::any_of(agg_kinds.begin(), agg_kinds.end(), [](aggregation::Kind k) { + return k == aggregation::SUM_WITH_OVERFLOW; }); + auto const is_shared_memory_compatible = + !has_sum_with_overflow && + std::all_of( + requests.begin(), requests.end(), [&](cudf::groupby::aggregation_request const& request) { + if (cudf::is_dictionary(request.values.type())) { return false; } + // Ensure there is enough buffer space to store local aggregations up to the max cardinality + // for shared memory aggregations + auto const size = cudf::type_dispatcher(request.values.type(), + size_of_functor{}); + return data_buffer_size >= (size * GROUPBY_CARDINALITY_THRESHOLD); + }); + // Performs naive global memory aggregations when the workload is not compatible with shared - // memory, such as when aggregating dictionary columns or when there is insufficient dynamic - // shared memory for shared memory aggregations. + // memory, such as when aggregating dictionary columns, when there is insufficient dynamic + // shared memory for shared memory aggregations, or when SUM_WITH_OVERFLOW aggregations are + // present. if (!is_shared_memory_compatible) { return compute_global_memory_aggs(num_rows, skip_rows_with_nulls, diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu index 562cd9142ae..dd622bbcc8f 100644 --- a/cpp/src/groupby/hash/create_sparse_results_table.cu +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,80 @@ #include namespace cudf::groupby::detail::hash { +namespace { +/** + * @brief Functor to create sparse result columns for hash-based groupby aggregations + * + * This functor handles the creation of appropriately typed and sized columns for each + * aggregation, including special handling for SUM_WITH_OVERFLOW which requires a struct column. + */ +struct sparse_column_creator { + rmm::cuda_stream_view stream; + + explicit sparse_column_creator(rmm::cuda_stream_view stream) : stream(stream) {} + + std::unique_ptr operator()(cudf::column_view const& col, + cudf::aggregation::Kind const& agg) const + { + auto const nullable = + (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or agg == cudf::aggregation::STD); + auto const mask_flag = (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; + auto const col_type = cudf::is_dictionary(col.type()) + ? 
cudf::dictionary_column_view(col).keys().type() + : col.type(); + + // Special handling for SUM_WITH_OVERFLOW which needs a struct column + if (agg == cudf::aggregation::SUM_WITH_OVERFLOW) { + // Lambda to create empty columns for better readability + auto make_empty_column = [&stream = this->stream](cudf::type_id type_id, + cudf::size_type size, + cudf::mask_state mask_state) { + return make_fixed_width_column(cudf::data_type{type_id}, size, mask_state, stream); + }; + + // Lambda to create children for SUM_WITH_OVERFLOW struct column + auto make_children = [&make_empty_column](cudf::size_type size, cudf::mask_state mask_state) { + std::vector> children; + // Create sum child column (int64_t) - no null mask needed, struct-level mask handles + // nullability + children.push_back( + make_empty_column(cudf::type_id::INT64, size, cudf::mask_state::UNALLOCATED)); + // Create overflow child column (bool) - no null mask needed, only value matters + children.push_back( + make_empty_column(cudf::type_id::BOOL8, size, cudf::mask_state::UNALLOCATED)); + return children; + }; + + if (col.size() == 0) { + // For empty columns, create empty struct column manually + auto children = make_children(0, cudf::mask_state::UNALLOCATED); + return create_structs_hierarchy(0, std::move(children), 0, {}, stream); + } else { + auto children = make_children(col.size(), mask_flag); + + // Create struct column with the children + // For SUM_WITH_OVERFLOW, make struct nullable if input has nulls (same as other + // aggregations) + if (nullable) { + // Start with ALL_NULL, results will be marked valid during aggregation + auto null_mask = cudf::create_null_mask(col.size(), cudf::mask_state::ALL_NULL, stream); + auto null_count = col.size(); // All null initially + return create_structs_hierarchy( + col.size(), std::move(children), null_count, std::move(null_mask), stream); + } else { + return create_structs_hierarchy(col.size(), std::move(children), 0, {}, stream); + } + } + } else { + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + } + } +}; +} // anonymous namespace + template void extract_populated_keys(SetType const& key_set, rmm::device_uvector& populated_keys, @@ -61,20 +136,7 @@ cudf::table create_sparse_results_table(cudf::table_view const& flattened_values flattened_values.end(), agg_kinds.begin(), std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - auto const nullable = - (agg == cudf::aggregation::COUNT_VALID or agg == cudf::aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == cudf::aggregation::VARIANCE or - agg == cudf::aggregation::STD); - auto const mask_flag = - (nullable) ? cudf::mask_state::ALL_NULL : cudf::mask_state::UNALLOCATED; - auto const col_type = cudf::is_dictionary(col.type()) - ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); + sparse_column_creator{stream}); cudf::table sparse_table(std::move(sparse_columns)); // If no direct aggregations, initialize the sparse table // only for the keys inserted in global hash set diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 30e1d52fdbf..b9f08c2c505 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,8 @@ namespace { * @brief List of aggregation operations that can be computed with a hash-based * implementation. */ -constexpr std::array hash_aggregations{aggregation::SUM, +constexpr std::array hash_aggregations{aggregation::SUM, + aggregation::SUM_WITH_OVERFLOW, aggregation::PRODUCT, aggregation::MIN, aggregation::MAX, diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 9dba468bf14..5afdc82892e 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -152,7 +152,6 @@ struct group_reduction_functor< cudf::device_span group_labels, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) - { using SourceDType = device_storage_type_t; using ResultType = cudf::detail::target_type_t; diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp index 5f5329e5d7a..f0ab60faab2 100644 --- a/cpp/tests/groupby/sum_tests.cpp +++ b/cpp/tests/groupby/sum_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -230,3 +230,207 @@ TYPED_TEST(GroupBySumFixedPointTest, GroupByHashSumDecimalAsValue) EXPECT_THROW(test_single_agg(keys, vals, expect_keys, {}, std::move(agg8)), cudf::logic_error); } } + +// SUM_WITH_OVERFLOW tests - only supports int64_t input values and outputs int64_t +template +struct groupby_sum_with_overflow_test : public cudf::test::BaseFixture {}; + +using sum_with_overflow_supported_types = cudf::test::Types; + +TYPED_TEST_SUITE(groupby_sum_with_overflow_test, sum_with_overflow_supported_types); + +TYPED_TEST(groupby_sum_with_overflow_test, basic) +{ + using V = TypeParam; + + cudf::test::fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + cudf::test::fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + cudf::test::fixed_width_column_wrapper expect_keys{1, 2, 3}; + + // Create expected struct column with sum and overflow children + auto sum_col = cudf::test::fixed_width_column_wrapper{9, 19, 17}; + auto overflow_col = cudf::test::fixed_width_column_wrapper{false, false, false}; + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + auto expect_vals = cudf::create_structs_hierarchy(3, std::move(children), 0, {}); + + auto agg = cudf::make_sum_with_overflow_aggregation(); + test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg)); + + // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby +} + +TYPED_TEST(groupby_sum_with_overflow_test, empty_cols) +{ + using V = TypeParam; + + cudf::test::fixed_width_column_wrapper keys{}; + cudf::test::fixed_width_column_wrapper vals{}; + + cudf::test::fixed_width_column_wrapper expect_keys{}; + + // Create expected empty struct column with sum and overflow children + auto sum_col = cudf::test::fixed_width_column_wrapper{}; + auto overflow_col = cudf::test::fixed_width_column_wrapper{}; + std::vector> children; + children.push_back(sum_col.release()); + children.push_back(overflow_col.release()); + auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {}); + + auto agg = 
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
+
+TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_keys)
+{
+  using V = TypeParam;
+
+  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3}, cudf::test::iterators::all_nulls());
+  cudf::test::fixed_width_column_wrapper<V> vals{3, 4, 5};
+
+  cudf::test::fixed_width_column_wrapper<K> expect_keys{};
+
+  // Create expected empty struct column with sum and overflow children
+  auto sum_col      = cudf::test::fixed_width_column_wrapper<int64_t>{};
+  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{};
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(sum_col.release());
+  children.push_back(overflow_col.release());
+  auto expect_vals = cudf::create_structs_hierarchy(0, std::move(children), 0, {});
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
+
+TYPED_TEST(groupby_sum_with_overflow_test, zero_valid_values)
+{
+  using V = TypeParam;
+
+  cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<V> vals({3, 4, 5}, cudf::test::iterators::all_nulls());
+
+  cudf::test::fixed_width_column_wrapper<K> expect_keys{1};
+
+  // Create expected struct column with sum and overflow children (null result)
+  // Child columns have no null masks, only struct-level null mask matters
+  auto sum_col      = cudf::test::fixed_width_column_wrapper<int64_t>({0});
+  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>({false});
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(sum_col.release());
+  children.push_back(overflow_col.release());
+  std::vector<bool> validity{0};  // null struct
+  auto [validity_mask, null_count] =
+    cudf::test::detail::make_null_mask(validity.begin(), validity.end());
+  auto expect_vals =
+    cudf::create_structs_hierarchy(1, std::move(children), null_count, std::move(validity_mask));
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
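The expected values above carry nullability only at the struct level. For reference, a minimal sketch (not from the patch) of building such a column with the public factories, using cudf::make_structs_column in place of the create_structs_hierarchy helper the tests use:

    // Illustrative only: build a one-row STRUCT<INT64, BOOL8> column whose single
    // row is null at the struct level while the children carry no null masks,
    // mirroring the expected result in zero_valid_values above.
    #include <cudf/column/column_factories.hpp>
    #include <cudf/null_mask.hpp>

    #include <memory>
    #include <vector>

    std::unique_ptr<cudf::column> make_null_struct_row()
    {
      std::vector<std::unique_ptr<cudf::column>> children;
      children.push_back(cudf::make_numeric_column(
        cudf::data_type{cudf::type_id::INT64}, 1, cudf::mask_state::UNALLOCATED));
      children.push_back(cudf::make_numeric_column(
        cudf::data_type{cudf::type_id::BOOL8}, 1, cudf::mask_state::UNALLOCATED));
      // ALL_NULL struct-level mask: the row is null even though the children are unmasked
      auto null_mask = cudf::create_null_mask(1, cudf::mask_state::ALL_NULL);
      return cudf::make_structs_column(1, std::move(children), 1, std::move(null_mask));
    }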
+
+TYPED_TEST(groupby_sum_with_overflow_test, null_keys_and_values)
+{
+  using V = TypeParam;
+
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
+  cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
+                                                 {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
+
+  //  { 1, 1,  2, 2, 2,  3, 3,  4}
+  cudf::test::fixed_width_column_wrapper<K> expect_keys({1, 2, 3, 4},
+                                                        cudf::test::iterators::no_nulls());
+
+  // Create expected struct column with sum and overflow children
+  //  { 3, 6,  1, 4, 9,  2, 8,  -}
+  // Child columns have no null masks, only struct-level null mask matters
+  auto sum_col      = cudf::test::fixed_width_column_wrapper<int64_t>({9, 14, 10, 0});
+  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>({false, false, false, false});
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(sum_col.release());
+  children.push_back(overflow_col.release());
+  std::vector<bool> validity{1, 1, 1, 0};
+  auto [validity_mask, null_count] =
+    cudf::test::detail::make_null_mask(validity.begin(), validity.end());
+  auto expect_vals =
+    cudf::create_structs_hierarchy(4, std::move(children), null_count, std::move(validity_mask));
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
+
+TYPED_TEST(groupby_sum_with_overflow_test, overflow_detection)
+{
+  using V = TypeParam;
+
+  cudf::test::fixed_width_column_wrapper<K> keys{1, 2, 3, 4, 1, 2, 2, 1, 3, 3, 2, 4, 4};
+  // Mix of values that will cause positive and negative overflow for some groups but not others
+  cudf::test::fixed_width_column_wrapper<V> vals{
+    9223372036854775800L,    // Close to INT64_MAX
+    100L,                    // Small value
+    200L,                    // Small value
+    -9223372036854775800L,   // Close to INT64_MIN
+    20L,                     // Small value that will cause positive overflow when added to first
+    200L,                    // Small value
+    300L,                    // Small value
+    9223372036854775800L,    // Close to INT64_MAX
+    9223372036854775800L,    // Close to INT64_MAX
+    1L,                      // Small value
+    400L,                    // Small value
+    -20L,                    // Small value that will cause negative overflow when added to fourth
+    -9223372036854775800L};  // Close to INT64_MIN
+
+  cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3, 4};
+
+  // Create expected struct column with sum and overflow children
+  // Group 1: 9223372036854775800 + 20 + 9223372036854775800 = positive overflow
+  // Group 2: 100 + 200 + 300 + 400 = 1000 (no overflow)
+  // Group 3: 200 + 9223372036854775800 + 1 = positive overflow
+  // Group 4: -9223372036854775800 + (-20) + (-9223372036854775800) = negative overflow
+  auto sum_col = cudf::test::fixed_width_column_wrapper<int64_t>{
+    4L,                     // Positive overflow result for group 1
+    1000L,                  // Normal sum for group 2 (no overflow)
+    -9223372036854775615L,  // Positive overflow result for group 3
+    -4L                     // Negative overflow result for group 4
+  };
+  auto overflow_col = cudf::test::fixed_width_column_wrapper<bool>{true, false, true, true};
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(sum_col.release());
+  children.push_back(overflow_col.release());
+  auto expect_vals = cudf::create_structs_hierarchy(4, std::move(children), 0, {});
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+  test_single_agg(keys, vals, expect_keys, *expect_vals, std::move(agg));
+
+  // Note: SUM_WITH_OVERFLOW only works with hash groupby, not sort groupby
+}
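The wrapped sums above follow two's-complement arithmetic. A host-side sketch (not from the patch) that reproduces the group-1 expectation; __builtin_add_overflow is a GCC/Clang intrinsic used here only for illustration, not necessarily what the device kernel uses:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
      int64_t sum   = 0;
      bool overflow = false;
      // Group 1 values: two near-INT64_MAX inputs plus 20
      for (int64_t v : {INT64_C(9223372036854775800), INT64_C(20), INT64_C(9223372036854775800)}) {
        overflow |= __builtin_add_overflow(sum, v, &sum);
      }
      // Prints "sum=4 overflow=1": the wrapped sum and flag expected for group 1
      std::printf("sum=%lld overflow=%d\n", static_cast<long long>(sum), static_cast<int>(overflow));
      return 0;
    }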
+
+// Test that SUM_WITH_OVERFLOW throws an error for invalid value types
+TEST(groupby_sum_with_overflow_error_test, invalid_value_type)
+{
+  using K = int32_t;
+  using V = int32_t;  // Invalid type for SUM_WITH_OVERFLOW, should only support int64_t
+
+  cudf::test::fixed_width_column_wrapper<K> keys{1, 1, 1, 2, 2, 2, 3, 3, 3};
+  cudf::test::fixed_width_column_wrapper<V> vals{1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+  cudf::test::fixed_width_column_wrapper<K> expect_keys{1, 2, 3};
+
+  auto agg = cudf::make_sum_with_overflow_aggregation<cudf::groupby_aggregation>();
+
+  // SUM_WITH_OVERFLOW should throw a logic_error when used with non-int64_t value types
+  EXPECT_THROW(
+    test_single_agg(keys, vals, expect_keys, {}, std::move(agg), force_use_sort_impl::NO),
+    cudf::logic_error);
+}
diff --git a/python/pylibcudf/tests/test_aggregation.py b/python/pylibcudf/tests/test_aggregation.py
index f1a4a38a83f..8250ab16acf 100644
--- a/python/pylibcudf/tests/test_aggregation.py
+++ b/python/pylibcudf/tests/test_aggregation.py
@@ -4,4 +4,4 @@

 def test_repr_name():
-    assert repr(plc.aggregation.any()) == ")>"
+    assert repr(plc.aggregation.any()) == ")>"

From d33498b001c5f07148091b6458d73c22053d650a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 4 Aug 2025 08:56:45 -0400
Subject: [PATCH 046/366] Rework fill/repeat benchmark to use nvbench (#19556)

Converts the googlebench FILL_BENCH to use nvbench. Currently this covers only
the `cudf::repeat` API. Also adds an axis to measure int32 as well as double.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/19556
---
 cpp/benchmarks/CMakeLists.txt     |  2 +-
 cpp/benchmarks/filling/repeat.cpp | 68 ++++++++++++++-----------
 2 files changed, 31 insertions(+), 39 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index c8d2cae9fd2..21f56e1331c 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -250,7 +250,7 @@ ConfigureNVBench(REPLACE_NVBENCH replace/nulls.cpp)

 # ##################################################################################################
 # * filling benchmark -----------------------------------------------------------------------------
-ConfigureBench(FILL_BENCH filling/repeat.cpp)
+ConfigureNVBench(FILL_NVBENCH filling/repeat.cpp)
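The rewritten benchmark source follows. As a primer on the registration pattern it uses, a minimal self-contained nvbench skeleton (an illustrative sketch; names here are not from the patch):

    #include <nvbench/nvbench.cuh>

    // Typed benchmark: nvbench instantiates one variant per type in the type axis
    // and crosses it with every value of each int64 axis.
    template <typename T>
    void example_bench(nvbench::state& state, nvbench::type_list<T>)
    {
      auto const num_rows = state.get_int64("num_rows");
      state.exec([num_rows](nvbench::launch& launch) {
        // work under test goes here, e.g. a kernel over num_rows elements of T
      });
    }

    NVBENCH_BENCH_TYPES(example_bench, NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, double>))
      .set_name("example")
      .set_type_axes_names({"DataType"})
      .add_int64_power_of_two_axis("num_rows", {10, 14, 18});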
diff --git a/cpp/benchmarks/filling/repeat.cpp b/cpp/benchmarks/filling/repeat.cpp
index 0abef46acac..92559b50bec 100644
--- a/cpp/benchmarks/filling/repeat.cpp
+++ b/cpp/benchmarks/filling/repeat.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,57 +15,49 @@
 */

 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <cudf/utilities/default_stream.hpp>

 #include <cudf/filling.hpp>

-class Repeat : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>

-template <class TypeParam, bool nulls>
-void BM_repeat(benchmark::State& state)
+namespace {
+template <typename DataType>
+void nvbench_repeat(nvbench::state& state, nvbench::type_list<DataType>)
 {
-  auto const n_rows = static_cast<cudf::size_type>(state.range(0));
-  auto const n_cols = static_cast<cudf::size_type>(state.range(1));
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_cols = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const nulls    = state.get_int64("nulls");

   auto const input_table =
-    create_sequence_table(cycle_dtypes({cudf::type_to_id<TypeParam>()}, n_cols),
-                          row_count{n_rows},
-                          nulls ? std::optional<double>{1.0} : std::nullopt);
+    create_sequence_table(cycle_dtypes({cudf::type_to_id<DataType>()}, num_cols),
+                          row_count{num_rows},
+                          nulls ? std::optional<double>{0.1} : std::nullopt);

   // Create table view
-  auto input = cudf::table_view(*input_table);
+  auto const input = input_table->view();

   // repeat counts
-  using sizeT = cudf::size_type;
   data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
-    cudf::type_to_id<sizeT>(), distribution_id::UNIFORM, 0, 3);
-  auto repeat_count = create_random_column(cudf::type_to_id<sizeT>(), row_count{n_rows}, profile);
+    cudf::type_to_id<cudf::size_type>(), distribution_id::UNIFORM, 0, 3);
+  auto counts =
+    create_random_column(cudf::type_to_id<cudf::size_type>(), row_count{num_rows}, profile);

-  // warm up
-  auto output = cudf::repeat(input, *repeat_count);
+  auto output = cudf::repeat(input, counts->view());

-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    cudf::repeat(input, *repeat_count);
-  }
+  state.add_global_memory_reads<nvbench::int8_t>(input_table->alloc_size());
+  state.add_global_memory_writes<nvbench::int8_t>(output->alloc_size());
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));

-  auto data_bytes =
-    (input.num_columns() * input.num_rows() + output->num_columns() * output->num_rows()) *
-    sizeof(TypeParam);
-  auto null_bytes =
-    nulls ? input.num_columns() * cudf::bitmask_allocation_size_bytes(input.num_rows()) +
-              output->num_columns() * cudf::bitmask_allocation_size_bytes(output->num_rows())
-          : 0;
-  state.SetBytesProcessed(state.iterations() * (data_bytes + null_bytes));
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto result = cudf::repeat(input, counts->view()); });
 }
+}  // namespace

-#define REPEAT_BENCHMARK_DEFINE(name, type, nulls)                                                \
-  BENCHMARK_DEFINE_F(Repeat, name)(::benchmark::State & state) { BM_repeat<type, nulls>(state); } \
-  BENCHMARK_REGISTER_F(Repeat, name)                                                              \
-    ->RangeMultiplier(8)                                                                          \
-    ->Ranges({{1 << 10, 1 << 26}, {1, 8}})                                                        \
-    ->UseManualTime()                                                                             \
-    ->Unit(benchmark::kMillisecond);
+using Types = nvbench::type_list<int32_t, double>;

-REPEAT_BENCHMARK_DEFINE(double_nulls, double, true);
-REPEAT_BENCHMARK_DEFINE(double_no_nulls, double, false);
+NVBENCH_BENCH_TYPES(nvbench_repeat, NVBENCH_TYPE_AXES(Types))
+  .set_name("repeat")
+  .set_type_axes_names({"DataType"})
+  .add_int64_power_of_two_axis("num_rows", {10, 14, 18, 22, 26})
+  .add_int64_axis("num_cols", {1, 2, 4, 8})
+  .add_int64_axis("nulls", {0, 1});

From 64c016de4f737b2dca543feeca16ae1c36e33354 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 4 Aug 2025 08:57:39 -0400
Subject: [PATCH 047/366] Remove deprecated subword-tokenizer APIs (#19498)

Removes the subword-tokenizer APIs from cudf, which have been deprecated and
replaced with the wordpiece-tokenizer APIs.
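For callers migrating off the removed APIs, a hedged sketch (not part of the patch): the replacements live in nvtext/wordpiece_tokenize.hpp, and the signatures below reflect that header as of this release; vocab_strings is an assumed strings column holding one vocabulary entry per row:

    #include <cudf/column/column.hpp>
    #include <cudf/strings/strings_column_view.hpp>
    #include <nvtext/wordpiece_tokenize.hpp>

    #include <memory>

    // Replaces: nvtext::load_vocabulary_file(...) + nvtext::subword_tokenize(...)
    std::unique_ptr<cudf::column> tokenize_ids(cudf::strings_column_view const& input,
                                               cudf::strings_column_view const& vocab_strings)
    {
      // The vocabulary is now loaded from a strings column rather than a hashed vocab.txt file
      auto vocab = nvtext::load_wordpiece_vocabulary(vocab_strings);
      // Returns a LIST column of INT32 token ids per row; max_words_per_row = 0
      // applies no per-row word limit
      return nvtext::wordpiece_tokenize(input, *vocab, 0);
    }

Unlike subword_tokenize, the wordpiece tokenizer does not emit fixed-shape tensors or attention masks, so callers that need padded tensors build them from the returned list column.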
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19498 --- cpp/CMakeLists.txt | 4 - cpp/include/nvtext/detail/load_hash_file.hpp | 50 -- cpp/include/nvtext/subword_tokenize.hpp | 172 ------ .../detail/codepoint_metadata.ah | 0 cpp/src/text/{subword => }/detail/cp_data.h | 0 cpp/src/text/normalize.cu | 71 ++- cpp/src/text/normalize.cuh | 21 +- cpp/src/text/subword/data_normalizer.cu | 280 --------- .../text/subword/detail/data_normalizer.hpp | 97 --- cpp/src/text/subword/detail/hash_utils.cuh | 172 ------ .../text/subword/detail/tokenizer_utils.cuh | 76 --- .../subword/detail/wordpiece_tokenizer.hpp | 106 ---- cpp/src/text/subword/load_hash_file.cu | 301 ---------- cpp/src/text/subword/subword_tokenize.cu | 300 ---------- cpp/src/text/subword/wordpiece_tokenizer.cu | 562 ------------------ .../pylibcudf/api_docs/nvtext/index.rst | 1 - .../api_docs/nvtext/subword_tokenize.rst | 6 - .../api_docs/wordpiece_tokenizer.rst | 7 - python/cudf/cudf/core/column/string.py | 26 - python/cudf/cudf/core/subword_tokenizer.py | 299 ---------- .../libcudf/nvtext/subword_tokenize.pxd | 55 -- .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 1 - .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 2 - python/pylibcudf/pylibcudf/nvtext/__init__.py | 2 - .../pylibcudf/nvtext/subword_tokenize.pxd | 20 - .../pylibcudf/nvtext/subword_tokenize.pyi | 15 - .../pylibcudf/nvtext/subword_tokenize.pyx | 87 --- 27 files changed, 89 insertions(+), 2644 deletions(-) delete mode 100644 cpp/include/nvtext/detail/load_hash_file.hpp delete mode 100644 cpp/include/nvtext/subword_tokenize.hpp rename cpp/src/text/{subword => }/detail/codepoint_metadata.ah (100%) rename cpp/src/text/{subword => }/detail/cp_data.h (100%) delete mode 100644 cpp/src/text/subword/data_normalizer.cu delete mode 100644 cpp/src/text/subword/detail/data_normalizer.hpp delete mode 100644 cpp/src/text/subword/detail/hash_utils.cuh delete mode 100644 cpp/src/text/subword/detail/tokenizer_utils.cuh delete mode 100644 cpp/src/text/subword/detail/wordpiece_tokenizer.hpp delete mode 100644 cpp/src/text/subword/load_hash_file.cu delete mode 100644 cpp/src/text/subword/subword_tokenize.cu delete mode 100644 cpp/src/text/subword/wordpiece_tokenizer.cu delete mode 100644 docs/cudf/source/pylibcudf/api_docs/nvtext/subword_tokenize.rst delete mode 100644 python/cudf/cudf/core/subword_tokenizer.py delete mode 100644 python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd delete mode 100644 python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd delete mode 100644 python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi delete mode 100644 python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 653c61fcb96..50b96a9baf4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -798,10 +798,6 @@ add_library( src/text/stemmer.cu src/text/bpe/byte_pair_encoding.cu src/text/bpe/load_merge_pairs.cu - src/text/subword/data_normalizer.cu - src/text/subword/load_hash_file.cu - src/text/subword/subword_tokenize.cu - src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu src/text/vocabulary_tokenize.cu src/text/wordpiece_tokenize.cu diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp deleted file mode 100644 index 1334cbf47ea..00000000000 --- 
a/cpp/include/nvtext/detail/load_hash_file.hpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -#include - -#include -#include - -namespace CUDF_EXPORT nvtext { -namespace detail { - -/** - * @brief Load the hashed vocabulary file into device memory. - * - * The object here can be used to call the subword_tokenize without - * incurring the cost of loading the same file each time. - * - * @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file. - * Note that this is the file AFTER python/perfect_hash.py has been used - * for preprocessing. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Memory resource to allocate any returned objects. - * @return vocabulary hash-table elements - */ -std::unique_ptr load_vocabulary_file( - std::string const& filename_hashed_vocabulary, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace detail -} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp deleted file mode 100644 index 6e04ec6a5a3..00000000000 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include - -namespace CUDF_EXPORT nvtext { - -/** - * @addtogroup nvtext_tokenize - * @{ - * @file - */ - -/** - * @brief The vocabulary data for use with the subword_tokenize function. 
- */ -struct hashed_vocabulary { - uint16_t first_token_id{}; ///< The first token id in the vocabulary - uint16_t separator_token_id{}; ///< The separator token id in the vocabulary - uint16_t unknown_token_id{}; ///< The unknown token id in the vocabulary - uint32_t outer_hash_a{}; ///< The a parameter for the outer hash - uint32_t outer_hash_b{}; ///< The b parameter for the outer hash - uint16_t num_bins{}; ///< Number of bins - std::unique_ptr table; ///< uint64 column, the flattened hash table with key, value - ///< pairs packed in 64-bits - std::unique_ptr bin_coefficients; ///< uint64 column, containing the hashing - ///< parameters for each hash bin on the GPU - std::unique_ptr bin_offsets; ///< uint16 column, containing the start index of each - ///< bin in the flattened hash table - std::unique_ptr - cp_metadata; ///< uint32 column, The code point metadata table to use for normalization - std::unique_ptr - aux_cp_table; ///< uint64 column, The auxiliary code point table to use for normalization -}; - -/** - * @brief Load the hashed vocabulary file into device memory. - * - * The object here can be used to call the subword_tokenize without - * incurring the cost of loading the same file each time. - * - * @deprecated in 25.06 and to be removed in a future release - * - * @throw cudf::logic_error if the `filename_hashed_vocabulary` could not be opened. - * - * @param filename_hashed_vocabulary A path to the preprocessed vocab.txt file. - * Note that this is the file AFTER python/perfect_hash.py has been used - * for preprocessing. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Memory resource to allocate any returned objects. - * @return vocabulary hash-table elements - */ -[[deprecated]] std::unique_ptr load_vocabulary_file( - std::string const& filename_hashed_vocabulary, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Result object for the subword_tokenize functions. - */ -struct tokenizer_result { - /** - * @brief The number of rows for the output token-ids. - */ - uint32_t nrows_tensor{}; - /** - * @brief The number of token-ids in each row. - */ - uint32_t sequence_length{}; - /** - * @brief A vector of token-ids for each row. - * - * The data is a flat matrix (nrows_tensor x sequence_length) of token-ids. - * This column is of type UINT32 with no null entries. - */ - std::unique_ptr tensor_token_ids; - /** - * @brief This mask identifies which tensor-token-ids are valid. - * - * This column is of type UINT32 with no null entries. - */ - std::unique_ptr tensor_attention_mask; - /** - * @brief The metadata for each tensor row. - * - * There are three elements per tensor row [row-id, start_pos, stop_pos]) - * This column is of type UINT32 with no null entries. - */ - std::unique_ptr tensor_metadata; -}; - -/** - * @brief Creates a tokenizer that cleans the text, splits it into tokens and - * returns token-ids from an input vocabulary. - * - * @deprecated in 25.06 and to be removed in a future release - * Use nvtext::wordpiece_tokenize instead - * - * The strings are first normalized by converting to lower-case, removing - * punctuation, replacing a select set of multi-byte characters and - * whitespace characters. - * - * The strings are then tokenized by using whitespace as a delimiter. - * Consecutive delimiters are ignored. Each token is then assigned - * a 4-byte token-id mapped from the provided vocabulary table. 
- * - * Essentially each string is converted into one or more vectors of token-ids - * in the output column. The total number of these vectors times `max_sequence_length` - * is the size of the `tensor_token_ids` output column. For `do_truncate==true`: - * ``` - * size of tensor_token_ids = max_sequence_length * strings.size() - * size of tensor_attention_mask = max_sequence_length * strings.size() - * size of tensor_metadata = 3 * strings.size() - * ``` - * - * For `do_truncate==false` the number of rows per output string depends on the - * number of tokens resolved and the `stride` value which may repeat tokens - * in subsequent overflow rows. - * - * This function requires about 21x the number of character bytes in the input - * strings column as working memory. - * - * @throw cudf::logic_error if `stride > max_sequence_length` - * @throw std::overflow_error if `max_sequence_length * max_rows_tensor` - * exceeds the column size limit - * - * @param strings The input strings to tokenize. - * @param vocabulary_table The vocabulary table pre-loaded into this object. - * @param max_sequence_length Limit of the number of token-ids per row in final tensor - * for each string. - * @param stride Each row in the output token-ids will replicate `max_sequence_length - stride` - * the token-ids from the previous row, unless it is the first string. - * @param do_lower_case If true, the tokenizer will convert uppercase characters in the - * input stream to lower-case and strip accents from those characters. - * If false, accented and uppercase characters are not transformed. - * @param do_truncate If true, the tokenizer will discard all the token-ids after - * `max_sequence_length` for each input string. If false, it will use a new row - * in the output token-ids to continue generating the output. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Memory resource to allocate any returned objects. - * @return token-ids, attention-mask, and metadata - */ -[[deprecated]] tokenizer_result subword_tokenize( - cudf::strings_column_view const& strings, - hashed_vocabulary const& vocabulary_table, - uint32_t max_sequence_length, - uint32_t stride, - bool do_lower_case, - bool do_truncate, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** @} */ // end of group -} // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/subword/detail/codepoint_metadata.ah b/cpp/src/text/detail/codepoint_metadata.ah similarity index 100% rename from cpp/src/text/subword/detail/codepoint_metadata.ah rename to cpp/src/text/detail/codepoint_metadata.ah diff --git a/cpp/src/text/subword/detail/cp_data.h b/cpp/src/text/detail/cp_data.h similarity index 100% rename from cpp/src/text/subword/detail/cp_data.h rename to cpp/src/text/detail/cp_data.h diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 701056e44f2..a0ddf2c2b6e 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -14,9 +14,8 @@ * limitations under the License. 
*/ +#include "text/detail/codepoint_metadata.ah" #include "text/normalize.cuh" -#include "text/subword/detail/data_normalizer.hpp" -#include "text/subword/detail/tokenizer_utils.cuh" #include "text/utilities/tokenize_ops.cuh" #include @@ -136,6 +135,74 @@ std::unique_ptr normalize_spaces(cudf::strings_column_view const& cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } +/** + * @brief Retrieve the code point metadata table. + * + * Build the code point metadata table in device memory + * using the vector pieces from codepoint_metadata.ah + */ +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream) +{ + auto table_vector = rmm::device_uvector(codepoint_metadata_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + cp_section1_end, + table + codepoint_metadata_size, + codepoint_metadata_default_value); + CUDF_CUDA_TRY(cudaMemcpyAsync(table, + codepoint_metadata, + cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section + cudaMemcpyDefault, + stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync( + table + cp_section2_begin, + cp_metadata_917505_917999, + (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section + cudaMemcpyDefault, + stream.value())); + return table_vector; +} + +/** + * @brief Retrieve the aux code point data table. + * + * Build the aux code point data table in device memory + * using the vector pieces from codepoint_metadata.ah + */ +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream) +{ + auto table_vector = rmm::device_uvector(aux_codepoint_data_size, stream); + auto table = table_vector.data(); + thrust::fill(rmm::exec_policy(stream), + table + aux_section1_end, + table + aux_codepoint_data_size, + aux_codepoint_default_value); + CUDF_CUDA_TRY(cudaMemcpyAsync(table, + aux_codepoint_data, + aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section + cudaMemcpyDefault, + stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync( + table + aux_section2_begin, + aux_cp_data_44032_55203, + (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section + cudaMemcpyDefault, + stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync( + table + aux_section3_begin, + aux_cp_data_70475_71099, + (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section + cudaMemcpyDefault, + stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync( + table + aux_section4_begin, + aux_cp_data_119134_119232, + (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section + cudaMemcpyDefault, + stream.value())); + return table_vector; +} + } // namespace detail // external APIs diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh index 3972726d536..c2a18e8a137 100644 --- a/cpp/src/text/normalize.cuh +++ b/cpp/src/text/normalize.cuh @@ -16,7 +16,12 @@ #pragma once -#include "text/subword/detail/cp_data.h" +#include "text/detail/cp_data.h" + +#include +#include + +#include namespace nvtext { namespace detail { @@ -96,5 +101,19 @@ __device__ constexpr bool is_multi_char_transform(uint32_t metadata) */ __device__ constexpr bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } +/** + * @brief Retrieve the code point metadata table. + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream); + +/** + * @brief Retrieve the auxiliary code point metadata table. + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream); + } // namespace detail } // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu deleted file mode 100644 index d9943fb781b..00000000000 --- a/cpp/src/text/subword/data_normalizer.cu +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "text/normalize.cuh" -#include "text/subword/detail/data_normalizer.hpp" -#include "text/subword/detail/tokenizer_utils.cuh" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace nvtext { -namespace detail { -namespace { - -/** - * @brief Converts a UTF-8 character into a unicode code point value. - * - * If the byte at start_byte_for_thread is the first byte of a UTF-8 character (head byte), - * the UTF-8 character is converted to a unicode code point and returned. - * - * If the byte at start_byte_for_thread is not a head byte, 0 is returned. - * - * All threads start reading bytes from the pointer denoted by strings. - * - * @param strings A pointer to the start of the sequence of characters to be analyzed. - * @param start_byte_for_thread Which byte to start analyzing - * @return New code point value for this byte. - */ -__device__ uint32_t -extract_code_points_from_utf8(unsigned char const* strings, - size_t const total_bytes, - cudf::thread_index_type const start_byte_for_thread) -{ - constexpr uint8_t max_utf8_blocks_for_char = 4; - uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0}; - - for (int i = 0; i < cuda::std::min(static_cast(max_utf8_blocks_for_char), - total_bytes - start_byte_for_thread); - ++i) { - utf8_blocks[i] = strings[start_byte_for_thread + i]; - } - - uint8_t const length_encoding_bits = utf8_blocks[0] >> 3; - // UTF-8 format is variable-width character encoding using up to 4 bytes. - // If the first byte is: - // - [x00-x7F] -- beginning of a 1-byte character (ASCII) - // - [xC0-xDF] -- beginning of a 2-byte character - // - [xE0-xEF] -- beginning of a 3-byte character - // - [xF0-xF7] -- beginning of a 3-byte character - // Anything else is an intermediate byte [x80-xBF]. - // So shifted by 3 bits this becomes - // - [x00-x0F] or leb < 16 - // - [x18-x1B] or 24 <= leb <= 27 - // - [x1C-x1D] or 28 <= leb <= 29 - // - [x1E-x1F] or leb >= 30 - // The remaining bits are part of the value as specified by the mask - // specified by x's below. 
- // - b0xxxxxxx = x7F - // - b110xxxxx = x1F - // - b1110xxxx = x0F - // - b11110xxx = x07 - using encoding_length_pair = thrust::pair; - // Set the number of characters and the top masks based on the length encoding bits. - encoding_length_pair const char_encoding_length = [length_encoding_bits] { - if (length_encoding_bits < 16) return encoding_length_pair{1, 0x7F}; - if (length_encoding_bits >= 24 && length_encoding_bits <= 27) - return encoding_length_pair{2, 0x1F}; - if (length_encoding_bits == 28 || length_encoding_bits == 29) - return encoding_length_pair{3, 0x0F}; - if (length_encoding_bits == 30) return encoding_length_pair{4, 0x07}; - return encoding_length_pair{0, 0}; - }(); - - // Now pack up the bits into a uint32_t. - // Move the first set of values into bits 19-24 in the 32-bit value. - uint32_t code_point = (utf8_blocks[0] & char_encoding_length.second) << 18; - // Move the remaining values which are 6 bits (mask b10xxxxxx = x3F) - // from the remaining bytes into successive positions in the 32-bit result. - code_point |= ((utf8_blocks[1] & 0x3F) << 12); - code_point |= ((utf8_blocks[2] & 0x3F) << 6); - code_point |= utf8_blocks[3] & 0x3F; - - // Adjust the final result by shifting by the character length. - uint8_t const shift_amt = 24 - 6 * char_encoding_length.first; - code_point >>= shift_amt; - return code_point; -} - -/** - * @brief Normalize the characters for the strings input. - * - * Characters are replaced, padded, or removed depending on the `do_lower_case` input - * as well as the metadata values for each code point found in `cp_metadata`. - * - * First, each character is converted from UTF-8 to a unicode code point value. - * This value is then looked up in the `cp_metadata` table to determine its fate. - * The end result is a set of code point values for each character. - * The normalized set of characters make it easier for the tokenizer to identify - * tokens and match up token ids. - * - * @param[in] strings The input strings with characters to normalize to code point values. - * @param[in] total_bytes Total number of bytes in the input `strings` vector. - * @param[in] cp_metadata The metadata lookup table for every unicode code point value. - * @param[in] aux_table Aux table for mapping some multi-byte code point values. - * @param[in] do_lower_case True if normalization should include lower-casing. - * @param[out] code_points The resulting code point values from normalization. - * @param[out] chars_per_thread Output number of code point values per string. - */ -CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings, - size_t const total_bytes, - uint32_t const* cp_metadata, - uint64_t const* aux_table, - bool const do_lower_case, - uint32_t* code_points, - uint32_t* chars_per_thread) -{ - constexpr uint32_t init_val = (1 << FILTER_BIT); - uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val}; - - auto const char_for_thread = cudf::detail::grid_1d::global_thread_id(); - uint32_t num_new_chars = 0; - - if (char_for_thread < total_bytes) { - auto const code_point = extract_code_points_from_utf8(strings, total_bytes, char_for_thread); - auto const metadata = cp_metadata[code_point]; - - if (is_head_byte(strings[char_for_thread]) && !should_remove_cp(metadata, do_lower_case)) { - num_new_chars = 1; - // Apply lower cases and accent stripping if necessary - auto const new_cp = - do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : code_point; - replacement_code_points[0] = new_cp == 0 ? 
code_point : new_cp; - - if (do_lower_case && is_multi_char_transform(metadata)) { - auto const next_cps = aux_table[code_point]; - replacement_code_points[1] = static_cast(next_cps >> 32); - auto const potential_next_cp = static_cast(next_cps); - replacement_code_points[2] = - potential_next_cp != 0 ? potential_next_cp : replacement_code_points[2]; - num_new_chars = 2 + (potential_next_cp != 0); - } - - if (should_add_spaces(metadata, do_lower_case)) { - // Need to shift all existing code-points up one - // This is a rotate right. There is no thrust equivalent at this time. - for (int loc = num_new_chars; loc > 0; --loc) { - replacement_code_points[loc] = replacement_code_points[loc - 1]; - } - - // Write the required spaces at the end - replacement_code_points[0] = SPACE_CODE_POINT; - replacement_code_points[num_new_chars + 1] = SPACE_CODE_POINT; - num_new_chars += 2; - } - } - } - - chars_per_thread[char_for_thread] = num_new_chars; - - using BlockStore = - cub::BlockStore; - __shared__ typename BlockStore::TempStorage temp_storage; - - // Now we perform coalesced writes back to global memory using cub. - uint32_t* block_base = code_points + blockIdx.x * blockDim.x * MAX_NEW_CHARS; - BlockStore(temp_storage).Store(block_base, replacement_code_points); -} - -} // namespace - -data_normalizer::data_normalizer(codepoint_metadata_type const* cp_metadata, - aux_codepoint_data_type const* aux_table, - bool do_lower_case) - : d_cp_metadata{cp_metadata}, d_aux_table{aux_table}, do_lower_case{do_lower_case} -{ -} - -uvector_pair data_normalizer::normalize(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream) const -{ - if (input.is_empty()) { - return uvector_pair{std::make_unique>(0, stream), - std::make_unique>(0, stream)}; - } - - // copy offsets to working memory - auto const num_offsets = input.size() + 1; - auto d_strings_offsets = std::make_unique>(num_offsets, stream); - auto const d_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); - thrust::transform(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(num_offsets), - d_strings_offsets->begin(), - [d_offsets] __device__(auto idx) { - auto const offset = d_offsets[0]; // adjust for any offset to the offsets - return d_offsets[idx] - offset; - }); - auto const bytes_count = d_strings_offsets->element(input.size(), stream); - if (bytes_count == 0) { // if no bytes, nothing to do - return uvector_pair{std::make_unique>(0, stream), - std::make_unique>(0, stream)}; - } - - int64_t const threads_per_block = THREADS_PER_BLOCK; - size_t const num_blocks = cudf::util::div_rounding_up_safe(bytes_count, threads_per_block); - size_t const threads_on_device = threads_per_block * num_blocks; - size_t const max_new_char_total = MAX_NEW_CHARS * threads_on_device; - - auto d_code_points = std::make_unique>(max_new_char_total, stream); - rmm::device_uvector d_chars_per_thread(threads_on_device, stream); - auto const d_strings = input.chars_begin(stream) + cudf::strings::detail::get_offset_value( - input.offsets(), input.offset(), stream); - kernel_data_normalizer<<>>( - reinterpret_cast(d_strings), - bytes_count, - d_cp_metadata, - d_aux_table, - do_lower_case, - d_code_points->data(), - d_chars_per_thread.data()); - - // Remove the 'empty' code points from the vector - thrust::remove(rmm::exec_policy(stream), - d_code_points->begin(), - d_code_points->end(), - uint32_t{1 << FILTER_BIT}); - - // We also need to prefix sum the number of characters up to 
an including - // the current character in order to get the new strings lengths. - thrust::inclusive_scan(rmm::exec_policy(stream), - d_chars_per_thread.begin(), - d_chars_per_thread.end(), - d_chars_per_thread.begin()); - - // This will reset the offsets to the new generated code point values - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(1), - input.size(), - update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); - - auto const num_chars = d_strings_offsets->element(input.size(), stream); - d_code_points->resize(num_chars, stream); // should be smaller than original allocated size - - // return the normalized code points and the new offsets - return uvector_pair(std::move(d_code_points), std::move(d_strings_offsets)); -} - -} // namespace detail -} // namespace nvtext diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp deleted file mode 100644 index c70e3734691..00000000000 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "text/subword/detail/cp_data.h" - -#include -#include - -#include -#include - -using uvector_pair = std::pair>, - std::unique_ptr>>; - -namespace nvtext { -namespace detail { - -/** - * @brief Performs text cleaning for the tokenizers. - * - * Every instantiation of this class will transfer the meta data over to the GPU. - * It is advised to create one class and reuse that class as needed. - * - * Converts characters to lowercase, adds spaces around punctuation and multi-byte - * characters, strips accents from letters in the text and standardizes whitespace - * characters to all be the code point for the " " literal. - * - * The algorithm produces two vectors of integers `uvector_pair`. - * The first is the size of 3 uint32 values per input byte (of the strings buffer). - * The second is the same size as the input offsets vector -- number of strings + 1. - * - * A temporary buffer is created equal to 1 uint32 value per input byte. - * This means 16x the number bytes of the input strings buffer must be available - * to call the `normalize()` function in this class. - */ -class data_normalizer { - public: - /** - * @brief Create instance of the normalizer. - * - * @param cp_metadata The code point metadata table to use for normalization. - * @param aux_table The auxiliary code point table. - * @param do_lower_case If true, the normalizer will convert uppercase characters in the - * input stream to lower case and strip accents from those characters. - * If false, accented and uppercase characters are not transformed. - */ - data_normalizer(codepoint_metadata_type const* cp_metadata, - aux_codepoint_data_type const* aux_table, - bool do_lower_case = true); - - /** - * @brief Normalize a vector of strings. 
- * - * If `do_lower_case` is true, this function will convert each character to lowercase - * and strip accents from the characters. If false it will do all other conversions - * in the class description except lower-casing and punctuation stripping. - * - * The result of this function returns two pointers to GPU data. - * The first pointer is to a contiguous array of unicode code points corresponding to the - * characters in the text after running normalization. The second pointer is to the - * offsets of the strings in the code point array. That is, string `i` starts at - * `result.second->data()[i]`. - * This array will always be of length `input.size() + 1` since we need one entry - * for each input and a last entry which has the total number of bytes. - * - * @param input Strings to normalize - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return Two pointers to GPU data buffers. The first is a pointer - * to the code points array and the second is a pointer to the offsets - * used to locate the code points for each string. - */ - uvector_pair normalize(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream) const; - - private: - bool const do_lower_case; - codepoint_metadata_type const* d_cp_metadata; - aux_codepoint_data_type const* d_aux_table; -}; - -} // namespace detail -} // namespace nvtext diff --git a/cpp/src/text/subword/detail/hash_utils.cuh b/cpp/src/text/subword/detail/hash_utils.cuh deleted file mode 100644 index dc0737118e8..00000000000 --- a/cpp/src/text/subword/detail/hash_utils.cuh +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace nvtext { -namespace detail { - -// Used for hashing functions in this file -constexpr uint64_t PRIME = 281474976710677; - -/** - * @brief This does a multiply mod 48 without overflow for the sdbm hash "pop" method. - * - * This method computes the bottom 48 bits of the result of multiplying two numbers - * respecting the restrictions specified by the parameters. - * - * It works by splitting `num` into 16 bit chunks and performing repeated multiplies. - * The result of all of those multiplies are added together. - * - * @param num_48bit A multiplicand that is at most 48 bits. 
- * @param num Any 64 bit number to multiply by num_48bit mod 2**48 - * @return (num_48bit * num) mod 2**48 - */ -__device__ uint64_t mul_mod_48(uint64_t num_48bit, uint64_t num) -{ - constexpr uint64_t mask = (1ULL << 48) - 1; - constexpr uint8_t bit_chunk_size = 16; - - uint64_t result = 0; -#pragma unroll - for (uint8_t i = 0; i < sizeof(num) / 2; ++i) { - auto const shift_amt = bit_chunk_size * i; - auto const bottom_16 = static_cast(num >> shift_amt); - // update result - result = result + ((num_48bit * bottom_16) << shift_amt); - result &= mask; - } - return result; -} - -/** - * @brief Computes the sdbm hash for the sequence starting at sequence_start up to length sequences. - * - * A start value for the sdbm hash can optionally be given. This is useful when checking if elements - * starting with "##" exist in the table since we can pass in the hash of "##" as the start value. - * - * @param sequence_start Code points to hash - * @param length Number of code points to hash - * @param start_value Initializes the hash computation. - * @return The sdbm hash of all elements in range `[sequence_start, sequence_start + length)` - */ -__device__ uint64_t sdbm_hash(uint32_t const* sequence_start, - uint32_t length, - uint64_t start_value = 0) -{ - // This expression computes h_{i} = (65599*h{i-1} + new_val) mod 2^48 and was obtained from here: - // http://www.cse.yorku.ca/~oz/hash.html - - constexpr uint64_t mask = (1ULL << 48) - 1; - uint64_t hash_value = start_value; - - for (int i = 0; i < length; ++i) { - hash_value = ((hash_value << 6) + (hash_value << 16) - hash_value) & mask; - hash_value = (hash_value + (sequence_start[i] & mask)) & mask; - } - - return hash_value; -} - -/** - * @brief Removes the last value added to the hash. - * - * If we have `current_hash = sdbm_hash("dog")` then, `prev_sdbm_hash(current_hash, cp(g))` - * returns the `sdbm_hash("do")` where it is assumed cp returns the unicode code point for a - * given letter. - * - * @param current_hash The current value used to compute the previous sdbm. - * @param last_val Last value used in the hash sequence. - * @return The hash value before that new value was added. - */ -__device__ uint64_t prev_sdbm_hash(uint64_t current_hash, uint32_t last_val) -{ - constexpr uint64_t mask = (1ULL << 48) - 1; - // Multiplicative inverse of 65599 under mod 2**48 - constexpr uint64_t mod_inverse = 24320495251391; - uint64_t const prev_hash = - mul_mod_48(mod_inverse, current_hash) - mul_mod_48(mod_inverse, last_val); - return prev_hash & mask; -} - -/** - * @brief The hash function used for accesses to the table. - * - * This is a universal hash function with parameters chosen to achieve perfect hashing. - * - * Algorithm is `((a*k + b) % PRIME) % table_size` where @ref PRIME is globally defined - * as 281474976710677 - * - * @param key Value to hash - * @param a Outer table first constant - * @param b Outer table second constant - * @param table_size Number of bins in the hash table. - * @return The computed hash value. - */ -__device__ uint32_t hash(uint64_t key, uint64_t a, uint64_t b, uint32_t table_size) -{ - return ((a * key + b) % PRIME) % table_size; -} - -/** - * @brief Retrieves the value associated with key in the hash table. - * - * If there is no value in the table with the input key, -1 is returned. - * - * This method will ALWAYS return the correct value if a key is in the table. However, some - * code point sequences may hash to the same key in which case an incorrect value is returned. 
- * This collision is rare and will not likely affect the model's performance. - * - * @param key The key to search for in the hash table - * @param hash_table A pointer to the flattened hash table - * @param bin_coefficients A pointer to the hashing parameters for each bin in the hash table. - * @param bin_offsets A pointer to the start of each bin in the hash table. - * @return -1 if key is not in the hash table. If the key is in the table returns an index in - * [0, vocab_size) indicating the index for the token in the bert model. - */ -__device__ int retrieve(uint64_t const key, - uint32_t const outer_table_a, - uint32_t const outer_table_b, - uint16_t const num_bins, - uint64_t const* hash_table, - uint64_t const* bin_coefficients, - uint16_t const* bin_offsets) -{ - auto const hash_bin = hash(key, outer_table_a, outer_table_b, num_bins); - auto const bin_params = bin_coefficients[hash_bin]; - auto const start_ht_offset = bin_offsets[hash_bin]; - - // The shift constants are due to how the hash coefficients are packed and are - // obtained from the python script perfect_hash.py which generates the expected tables. - auto const inner_bin_a = bin_params >> 16; - auto const inner_bin_b = (bin_params >> 9) & ((1 << 7) - 1); - auto const bin_size = static_cast(bin_params); - - if (bin_size == 0) { return -1; } // key hash has no bin parameters - - auto const inner_offset = hash(key, inner_bin_a, inner_bin_b, bin_size); - auto const kv_pair = hash_table[start_ht_offset + inner_offset]; - - auto const expected_key = kv_pair >> 16; - // extract value from encoded key-value - int value = kv_pair & ((1 << 16) - 1); - return key == expected_key ? value : -1; -} - -} // namespace detail -} // namespace nvtext diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh deleted file mode 100644 index 01df910d420..00000000000 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "text/subword/detail/cp_data.h" - -#include - -#include -#include - -#include - -namespace nvtext { -namespace detail { - -constexpr int THREADS_PER_BLOCK = 64; - -/** - * @brief In-place update of offsets values. - * - * In the `d_chars_up_to_idx`, the last character of each string is basically - * the offset (i.e. the number of characters) in that string. 
- * - * Example - * @code{.pseudo} - * // 3 strings with sizes 5,4,2 - * d_offsets = [0,5,9,11] - * // code points generated per character (as offsets) - * // 2nd string has an extra code point at its first char - * d_chars_up_to_idx = [1,2,3,4,5,6,8,9,10,11,12] - * d_chars_up_to_idx[d_offsets[1-3]] is [5,10,12] - * => d_offsets becomes [0,5,10,12] - * @endcode - */ -struct update_strings_lengths_fn { - uint32_t const* d_chars_up_to_idx; - int64_t* d_offsets; - - __device__ void operator()(cudf::size_type idx) - { - auto const offset = d_offsets[idx]; - d_offsets[idx] = offset > 0 ? d_chars_up_to_idx[offset - 1] : 0; - } -}; - -/** - * @brief Retrieve the code point metadata table. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream); - -/** - * @brief Retrieve the auxiliary code point metadata table. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream); - -} // namespace detail -} // namespace nvtext diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp deleted file mode 100644 index 244fe5092e7..00000000000 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "text/subword/detail/data_normalizer.hpp" - -#include - -#include - -namespace nvtext { - -struct hashed_vocabulary; - -namespace detail { - -/** - * @brief This splits words into tokens contained in the model vocabulary file. - * - * The tokenizer first normalizes the character bytes, identifies the words in - * each string, and then converts each word in to a integer token-id per the - * provided vocabulary hash table. - * - * The `tokenize()` function produces two device vectors `uvector_pair`. - * The first is the token-ids for each word identified in the input strings. - * The second is the offsets to identify which ids go with each string. - * - * Temporary buffers are created equal to 3 uint32 values plus 1 byte per input byte. - * Also the normalize step allocates an additional 16x bytes per input byte but 8x - * of this memory is reused by the `tokenize()` function. - * This means 13x + 8x = 21x the number bytes of the input strings buffer must be - * available to call the `tokenize()` function in this class. - */ -class wordpiece_tokenizer { - public: - /** - * @brief Creates a full tokenizer that cleans the text and splits it into tokens. - * - * @param vocab_table The preprocessed hashed vocabulary data. - * @param max_sequence_length Limit the number of token-ids per row in the output - * @param stride Each row in tensor-token-ids will replicate `max_sequence_length - stride` - * token-ids from the previous row, unless it is the first string. 
- * @param do_truncate If true, the tokenizer will discard all the token-ids after - * `max_sequence_length` for each input string. If false, it will use a - * new row in the tensor-token-ids to continue generating the output. - * @param do_lower_case If true, the tokenizer will convert uppercase characters in the - * input stream to lowercase and strip accents from those characters. - * If false, accented and uppercase characters are not transformed. - * @param max_word_length The length of the longest word that will be tokenized. Words - * longer than this will simply be replaced by the unknown token - * specified in the `vocab_file`. - */ - wordpiece_tokenizer(hashed_vocabulary const& vocab_table, - uint32_t max_sequence_length, - uint32_t stride, - bool do_truncate, - bool do_lower_case, - uint32_t max_word_length = 200); - - /** - * @brief Splits the input text into token ids. - * - * This class is simply a wrapper around the basic and word piece tokenizers. - * - * @param input Strings to tokenize - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Pointer to token-ids and token-id offsets - */ - uvector_pair tokenize(cudf::strings_column_view const& input, rmm::cuda_stream_view stream); - - private: - /** - * @brief Splits the code points from the normalizer into tokens. - * - * @param[in,out] cps_and_offsets The output code points and offsets - * from the normalizer. - * The data is modified to contain the token ids and token counts - * per string. - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream); - - hashed_vocabulary const& vocab_table; - data_normalizer normalizer; // removes punctuation, accents, etc - uint32_t const max_sequence_length; - uint32_t const stride; - bool const do_truncate; - uint32_t const max_word_length; -}; - -} // namespace detail -} // namespace nvtext diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu deleted file mode 100644 index 55f1a19381c..00000000000 --- a/cpp/src/text/subword/load_hash_file.cu +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "text/subword/detail/codepoint_metadata.ah" -#include "text/subword/detail/tokenizer_utils.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -namespace nvtext { -namespace detail { - -/** - * @brief Retrieve the code point metadata table. 
- * - * Build the code point metadata table in device memory - * using the vector pieces from codepoint_metadata.ah - */ -rmm::device_uvector get_codepoint_metadata(rmm::cuda_stream_view stream) -{ - auto table_vector = rmm::device_uvector(codepoint_metadata_size, stream); - auto table = table_vector.data(); - thrust::fill(rmm::exec_policy(stream), - table + cp_section1_end, - table + codepoint_metadata_size, - codepoint_metadata_default_value); - CUDF_CUDA_TRY(cudaMemcpyAsync(table, - codepoint_metadata, - cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync( - table + cp_section2_begin, - cp_metadata_917505_917999, - (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section - cudaMemcpyDefault, - stream.value())); - return table_vector; -} - -/** - * @brief Retrieve the aux code point data table. - * - * Build the aux code point data table in device memory - * using the vector pieces from codepoint_metadata.ah - */ -rmm::device_uvector get_aux_codepoint_data(rmm::cuda_stream_view stream) -{ - auto table_vector = rmm::device_uvector(aux_codepoint_data_size, stream); - auto table = table_vector.data(); - thrust::fill(rmm::exec_policy(stream), - table + aux_section1_end, - table + aux_codepoint_data_size, - aux_codepoint_default_value); - CUDF_CUDA_TRY(cudaMemcpyAsync(table, - aux_codepoint_data, - aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync( - table + aux_section2_begin, - aux_cp_data_44032_55203, - (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync( - table + aux_section3_begin, - aux_cp_data_70475_71099, - (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync( - table + aux_section4_begin, - aux_cp_data_119134_119232, - (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section - cudaMemcpyDefault, - stream.value())); - return table_vector; -} - -namespace { -/** - * @brief Convert string to uint32. - * - * This just wraps the std::stoi but provides a nice error message - * in case the hash file format is incorrect. - */ -uint32_t str_to_uint32(std::string const& str, uint64_t line_no) -{ - try { - return std::stoi(str); // there is no std::stoui - } catch (std::exception const& exc) { - std::string message("Line "); - message += std::to_string(line_no) + ": "; - message += "cannot convert integer from '"; - message += str; - message += "': "; - message += exc.what(); - std::cerr << message << std::endl; - throw; - } -} - -/** - * @brief Convert string to uint64. - * - * This just wraps the std::stoul but provides a nice error message - * in case the hash file format is incorrect. - */ -uint64_t str_to_uint64(std::string const& str, uint64_t line_no) -{ - try { - return std::stoul(str); - } catch (std::exception const& exc) { - std::string message("Line "); - message += std::to_string(line_no) + ": "; - message += "cannot convert integer from '"; - message += str; - message += "': "; - message += exc.what(); - std::cerr << message << std::endl; - throw; - } -} -} // namespace - -/** - * @brief Loads a text file representing the hashed vocabulary into hashed_vocabulary struct. 
- * - * @code{.pseudo} - * Format of the file (ASCII text file with numbers): - * First 3 lines have the following values: - * outer_hash_a - * outer_hash_b - * number-of-bins - * The next number-of-bins lines has two values in each line separated by a space - * coefficient offset - * ... - * Next line has the size (number of lines) of the table followed - * by the table values -- one value per line. - * The last three lines: - * unknown_token_id - * first_token_id - * separator_token_id - * @endcode - * - * @param filename_hashed_vocabulary Path to text file containing hashed vocabulary - * @return object containing hash table elements for the wordpiece tokenizer - */ -std::unique_ptr load_vocabulary_file( - std::string const& filename_hashed_vocabulary, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - hashed_vocabulary result; - std::ifstream hash_file(filename_hashed_vocabulary); - CUDF_EXPECTS(hash_file.good(), "Could not open " + filename_hashed_vocabulary); - - uint64_t line_no = 1; - std::string line; - std::getline(hash_file, line); - result.outer_hash_a = str_to_uint32(line, line_no++); - - std::getline(hash_file, line); - result.outer_hash_b = str_to_uint32(line, line_no++); - - std::getline(hash_file, line); - result.num_bins = str_to_uint32(line, line_no++); - - auto bin_coefficients = cudf::detail::make_host_vector(result.num_bins, stream); - auto bin_offsets = cudf::detail::make_host_vector(result.num_bins, stream); - - for (int i = 0; i < result.num_bins; ++i) { - std::getline(hash_file, line); - size_t loc_of_space = line.find(' '); - CUDF_EXPECTS(loc_of_space != line.npos, "invalid hash file format"); - - std::string first_num = line.substr(0, loc_of_space); - std::string second_num = line.substr(loc_of_space + 1, line.length()); - - bin_coefficients[i] = str_to_uint64(first_num, line_no); - bin_offsets[i] = str_to_uint32(second_num, line_no); - ++line_no; - } - - std::getline(hash_file, line); - uint64_t hash_table_length = str_to_uint64(line, line_no++); - auto table = cudf::detail::make_host_vector(hash_table_length, stream); - - std::generate(table.begin(), table.end(), [&hash_file, &line_no]() { - std::string line; - std::getline(hash_file, line); - return str_to_uint64(line, line_no++); - }); - - std::getline(hash_file, line); - result.unknown_token_id = str_to_uint32(line, line_no++); - - std::getline(hash_file, line); - result.first_token_id = str_to_uint32(line, line_no++); - - std::getline(hash_file, line); - result.separator_token_id = str_to_uint32(line, line_no++); - - // Transfer hash table to columns - result.table = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64}, - table.size(), - cudf::mask_state::UNALLOCATED, - stream, - mr); - cudf::detail::cuda_memcpy_async( - cudf::device_span(result.table->mutable_view().data(), table.size()), - table, - stream); - - result.bin_coefficients = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64}, - bin_coefficients.size(), - cudf::mask_state::UNALLOCATED, - stream, - mr); - cudf::detail::cuda_memcpy_async( - cudf::device_span(result.bin_coefficients->mutable_view().data(), - bin_coefficients.size()), - bin_coefficients, - stream); - - result.bin_offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT16}, - bin_offsets.size(), - cudf::mask_state::UNALLOCATED, - stream, - mr); - cudf::detail::cuda_memcpy_async( - cudf::device_span(result.bin_offsets->mutable_view().data(), - bin_offsets.size()), - bin_offsets, - stream); - - auto cp_metadata = 
detail::get_codepoint_metadata(stream); - auto const cp_metadata_size = static_cast(cp_metadata.size()); - result.cp_metadata = std::make_unique(cudf::data_type{cudf::type_id::UINT32}, - cp_metadata_size, - cp_metadata.release(), - rmm::device_buffer{}, - 0); - - auto aux_cp_table = detail::get_aux_codepoint_data(stream); - auto const aux_cp_table_size = static_cast(aux_cp_table.size()); - result.aux_cp_table = std::make_unique(cudf::data_type{cudf::type_id::UINT64}, - aux_cp_table_size, - aux_cp_table.release(), - rmm::device_buffer{}, - 0); - - return std::make_unique(std::move(result)); -} - -} // namespace detail - -std::unique_ptr load_vocabulary_file( - std::string const& filename_hashed_vocabulary, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::load_vocabulary_file(filename_hashed_vocabulary, stream, mr); -} - -} // namespace nvtext diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu deleted file mode 100644 index 58653be7dd7..00000000000 --- a/cpp/src/text/subword/subword_tokenize.cu +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "text/subword/detail/wordpiece_tokenizer.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -namespace nvtext { -namespace detail { -namespace { - -/** - * @brief Convert tokens and row2tensor map to final tensor data. 
- * - * @param[in] token_ids Tokens from tokenizer - * @param[in] offsets Offsets to each string's output row of tokens - * @param[in] row2tensor String to tensor token counts - * @param[in] row2row_within_tensor Token counts within sub-rows of the output - * @param[in] max_sequence_length Maximum number of tokens in a row - * @param[in] nrows_tensor_token_ids Total number of output tensor rows - * @param[in] stride Number of tokens in sub-rows - * @param[in] do_truncate True if tokens should not spill into sub-rows in the output - * @param[out] final_tensor Output vector of token-ids - * @param[out] attn_mask Identifies valid token id entries - * @param[out] metadata Additional data per row - */ -CUDF_KERNEL void kernel_compute_tensor_metadata( - // input - uint32_t const* token_ids, - int64_t const* offsets, - uint32_t const* row2tensor, - uint32_t const* row2row_within_tensor, - uint32_t max_sequence_length, - uint32_t nrows_tensor_token_ids, - uint32_t stride, - bool do_truncate, - // output - uint32_t* final_tensor, - uint32_t* attn_mask, - uint32_t* metadata) -{ - auto const output_idx = cudf::detail::grid_1d::global_thread_id(); - - uint32_t const absolute_row_id = output_idx / max_sequence_length; - if (absolute_row_id >= nrows_tensor_token_ids) { return; } - uint32_t const tensor_id = row2tensor[absolute_row_id]; - uint32_t const row_within_tensor = row2row_within_tensor[absolute_row_id]; - uint32_t const offset_token_ids_tensor = offsets[tensor_id]; - uint32_t const n_tokens_tensor = offsets[tensor_id + 1] - offset_token_ids_tensor; - // check for last row within tensor - bool const last_row_of_tensor = (absolute_row_id == nrows_tensor_token_ids - 1) || - (row2tensor[absolute_row_id + 1] != tensor_id); - // compute input offset to retrieve token ids - uint32_t const token_idx = output_idx % max_sequence_length; - uint32_t const row_offset_token_ids = - offset_token_ids_tensor + token_idx + - (row_within_tensor ? (max_sequence_length + (stride * (row_within_tensor - 1))) : 0); - - if (row_within_tensor == 0) { - if (token_idx < n_tokens_tensor) { - // copy token ids - final_tensor[output_idx] = token_ids[row_offset_token_ids]; - attn_mask[output_idx] = 1; - } else { - // pad with 0 - final_tensor[output_idx] = 0; - attn_mask[output_idx] = 0; - } - } else { - uint32_t const n_replicates = max_sequence_length - stride; - if ((row_offset_token_ids - n_replicates) < (offset_token_ids_tensor + n_tokens_tensor)) { - // replicate elements from previous row or copy new tokens - final_tensor[output_idx] = token_ids[row_offset_token_ids - n_replicates]; - attn_mask[output_idx] = 1; - } else { - // pad with 0 - final_tensor[output_idx] = 0; - attn_mask[output_idx] = 0; - } - } - - // write metadata - if (token_idx == 0) { - auto const metadata_idx = absolute_row_id * 3; // three metadata values per output row - metadata[metadata_idx] = tensor_id; - metadata[metadata_idx + 1] = (row_within_tensor == 0) ? 0 : (max_sequence_length - stride) / 2; - metadata[metadata_idx + 2] = [&] { - if (!last_row_of_tensor) return max_sequence_length - (max_sequence_length - stride) / 2 - 1; - if (n_tokens_tensor <= max_sequence_length) // we fit, all good - return (n_tokens_tensor > 0) ? (n_tokens_tensor - 1) : 0; - if (do_truncate) return (max_sequence_length - 1); - - auto const final_row_value = - (max_sequence_length - stride) + (n_tokens_tensor - max_sequence_length) % stride; - return (final_row_value > 0) ? 
(final_row_value - 1) : 0; - }(); - } -} - -// this happens if there are no tokens in the input -tokenizer_result build_empty_result(cudf::size_type size, - uint32_t max_sequence_length, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto zero = cudf::numeric_scalar(0, true, stream); - auto ids = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr); - auto mask = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr); - - auto metadata = cudf::make_numeric_column( - cudf::data_type{cudf::type_id::UINT32}, size * 3, cudf::mask_state::UNALLOCATED, stream, mr); - thrust::tabulate(rmm::exec_policy(stream), - metadata->mutable_view().begin(), - metadata->mutable_view().end(), - [] __device__(auto idx) { return ((idx % 3) == 0) ? idx : 0; }); - metadata->set_null_count(0); - - return tokenizer_result{ - 0, max_sequence_length, std::move(ids), std::move(mask), std::move(metadata)}; -} - -} // namespace - -tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, - hashed_vocabulary const& vocab_table, - uint32_t max_sequence_length, - uint32_t stride, - bool do_lower_case, - bool do_truncate, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(stride <= max_sequence_length, - "stride must be less than or equal to max_sequence_length"); - auto const strings_count = strings.size(); - if (strings_count == strings.null_count()) { // empty or all-null returns empty - return tokenizer_result{0, - max_sequence_length, - cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}), - cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}), - cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32})}; - } - CUDF_EXPECTS( - max_sequence_length <= - (static_cast(std::numeric_limits::max()) / strings_count), - "max_sequence_length times number of input rows exceeds the column size limit", - std::overflow_error); - - // Create tokenizer - wordpiece_tokenizer tokenizer( - vocab_table, max_sequence_length, stride, do_truncate, do_lower_case); - // Run tokenizer - auto const tokens = tokenizer.tokenize(strings, stream); - // assign output components - auto device_token_ids = tokens.first->data(); - auto device_offsets = tokens.second->data(); - - // Format output from tokenizer - // Each string can create 1 or more tensor entries. - // Compute the string-per-tensor offsets values by scanning - // over the number of tokens for each string. - rmm::device_uvector offsets_per_tensor(strings_count + 1, stream); - auto d_offsets_per_tensor = offsets_per_tensor.data(); - - thrust::transform_exclusive_scan( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count + 1), - offsets_per_tensor.begin(), - [device_offsets, do_truncate, max_sequence_length, stride, strings_count] __device__( - cudf::size_type idx) { - uint32_t const num_tokens = - idx < strings_count ? 
device_offsets[idx + 1] - device_offsets[idx] : 0; - if (do_truncate || num_tokens <= max_sequence_length) return uint32_t{1}; - return 1 + ((num_tokens - max_sequence_length + stride - 1) / stride); - }, - uint32_t{0}, - cuda::std::plus()); - // last element is the total number of output rows - uint32_t const nrows_tensor_token_ids = offsets_per_tensor.element(strings_count, stream); - // if there are no tokens at all, build a specific empty result - if (nrows_tensor_token_ids == 0) { - return build_empty_result(strings_count, max_sequence_length, stream, mr); - } - - // compute global_row to tensor, and global_row to within_tensor_row correspondence - rmm::device_uvector row2tensor(nrows_tensor_token_ids, stream); - auto d_row2tensor = row2tensor.data(); - rmm::device_uvector row2row_within_tensor(nrows_tensor_token_ids, stream); - auto d_row2row_within_tensor = row2row_within_tensor.data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - [d_offsets_per_tensor, d_row2tensor, d_row2row_within_tensor] __device__(auto idx) { - uint32_t offset = d_offsets_per_tensor[idx]; - uint32_t nrows = d_offsets_per_tensor[idx + 1] - offset; - for (uint32_t jdx = 0; jdx < nrows; ++jdx) { - d_row2tensor[jdx + offset] = idx; - d_row2row_within_tensor[jdx + offset] = jdx; - } - }); - - // create output data columns - auto tensor_token_ids = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT32}, - nrows_tensor_token_ids * max_sequence_length, - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto tensor_attention_mask = - cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT32}, - nrows_tensor_token_ids * max_sequence_length, - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto tensor_metadata = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT32}, - nrows_tensor_token_ids * 3, - cudf::mask_state::UNALLOCATED, - stream, - mr); - - // compute final-tensor, mask, and metadata - constexpr int block_size = 256; - cudf::detail::grid_1d const grid{ - static_cast(nrows_tensor_token_ids * max_sequence_length), block_size}; - kernel_compute_tensor_metadata<<>>( - device_token_ids, - device_offsets, - d_row2tensor, - d_row2row_within_tensor, - max_sequence_length, - nrows_tensor_token_ids, - stride, - do_truncate, - tensor_token_ids->mutable_view().data(), - tensor_attention_mask->mutable_view().data(), - tensor_metadata->mutable_view().data()); - - return tokenizer_result{nrows_tensor_token_ids, - max_sequence_length, - std::move(tensor_token_ids), - std::move(tensor_attention_mask), - std::move(tensor_metadata)}; -} - -} // namespace detail - -tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, - hashed_vocabulary const& vocabulary_table, - uint32_t max_sequence_length, - uint32_t stride, - bool do_lower_case, - bool do_truncate, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::subword_tokenize( - strings, vocabulary_table, max_sequence_length, stride, do_lower_case, do_truncate, stream, mr); -} - -} // namespace nvtext diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu deleted file mode 100644 index a2de52b5659..00000000000 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ /dev/null @@ -1,562 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "text/subword/detail/hash_utils.cuh" -#include "text/subword/detail/tokenizer_utils.cuh" -#include "text/subword/detail/wordpiece_tokenizer.hpp" - -#include -#include -#include -#include - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nvtext { -namespace detail { -namespace { -/** - * @brief Initializes the token-ids, word-indices, and token counts vectors. - * - * Each thread process a single code point from `code_points`. - * This also locates the start and end of each word within the `code_points` buffer. - * A word start is identified as a non-space character that appears right after a space. - * A word end is identified as a space character that appears right after a non-space one. - * If the code point at this thread does not represent a word start or word end, - * a max uint32_t value is written to the appropriate vector instead. - * A post processing step is required to filter the relevant values in these - * vectors. - * - * It is guaranteed that the same number of valid values will be written to both the - * start and end indices and that after the select step, the two arrays will be aligned. - * That is, `start_word_indices[word]` and `end_word_indices[word]` are the start and - * end for the same word. - * - * Memory required is 13 bytes per code point values: - * - 4 bytes each for `start_word_indices` and `end_word_indices` - * - 4 bytes for each `token_ids` - * - 1 byte for each `tokens_per_word` - * Also, there is a code point value for each byte in the input strings. - * - * @param[in] code_points A pointer to the code points in the strings after normalization. - * @param[out] start_word_indices An array of size `num_code_points` which will contain the - * starting index for each word. - * @param[out] end_word_indices An array of size `num_code_points` which will contain the - * ending index for each word. - * @param num_code_points The total number of code_points. - * @param[out] token_ids An array of size `num_code_points` which will hold the token ids. - * This kernel just sets all the values to max uint32_t. - * @param[out] tokens_per_word An array of size `num_code_points` which hold the number of - * tokens. This kernel just sets all the values to 0. 
- */ -CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_points, - uint32_t* start_word_indices, - uint32_t* end_word_indices, - size_t num_code_points, - uint32_t* token_ids, - uint8_t* tokens_per_word) -{ - auto const char_for_thread = cudf::detail::grid_1d::global_thread_id(); - - // Deal with the start_word_indices array - if (char_for_thread < num_code_points) { - uint32_t val_to_write = cuda::std::numeric_limits::max(); - if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread > 0) && - (code_points[char_for_thread - 1] == SPACE_CODE_POINT)) { - val_to_write = char_for_thread; - } - start_word_indices[char_for_thread] = val_to_write; - - // Deal with the end_word_indices_array - val_to_write = cuda::std::numeric_limits::max(); - if ((code_points[char_for_thread] != SPACE_CODE_POINT) && - (char_for_thread + 1 < num_code_points) && - (code_points[char_for_thread + 1] == SPACE_CODE_POINT)) { - val_to_write = char_for_thread + 1; - } - end_word_indices[char_for_thread] = val_to_write; - - token_ids[char_for_thread] = cuda::std::numeric_limits::max(); - tokens_per_word[char_for_thread] = 0; - } -} - -/** - * @brief Resolves the string boundaries for the start and end words. - * - * This kernel should be called after `init_data_and_mark_word_start_and_ends` with at - * least `num_strings` total threads. - * - * The start and end indices are updated to honor the string boundaries - * within the strings array. This corrects any word ranges that span across - * individual strings. - * - * @param code_points A pointer to the code points in the strings. - * @param strings_offsets An array containing the index of the starting character of each string - * with an extra space at the end containing the total number of characters. As a result, - * this array is of length num_strings + 1. - * @param start_word_indices An array which will contain the starting index for each word scattered - * throughout. If an index does not represent a word start, the max-uint32_t value is written - * to indicate this. - * @param end_word_indices An array which will contain the one past the end index for each word - * scattered throughout. If an index does not represent a word end, the max uint32_t value is - * written to indicate this. - * @param num_strings The total number of strings to be processed. - */ -CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points, - int64_t const* strings_offsets, - uint32_t* start_word_indices, - uint32_t* end_word_indices, - uint32_t num_strings) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - // Ensure the starting character of each strings is written to the word start array. - if (idx <= num_strings) { - auto const offset = strings_offsets[idx]; - - if ((idx < num_strings) && (code_points[offset] != SPACE_CODE_POINT)) { - start_word_indices[offset] = offset; - } - - if ((offset > 0) && (code_points[offset - 1] != SPACE_CODE_POINT)) { - end_word_indices[offset - 1] = offset; - } - } -} - -/** - * @brief Currently supported special tokens. - * - * Code logic expects these to be 3 upper-case characters along - * with a single trailing space. - */ -__constant__ char special_tokens[35]{"BOS EOS UNK SEP PAD CLS MASK "}; -constexpr cudf::size_type MIN_ST_WIDTH = 4; // Min token size in special_tokens -constexpr cudf::size_type MAX_ST_WIDTH = 5; // Max token size in special_tokens - -struct mark_special_tokens { - /** - * @brief Check given code-point array to the list of known - * special tokens. 
- */ - __device__ bool is_special_token(uint32_t const* token, cudf::size_type size) const - { - if (size < MIN_ST_WIDTH || size > MAX_ST_WIDTH) return false; - char str_token[MAX_ST_WIDTH]; - // convert code-points to chars - thrust::transform(thrust::seq, token, token + size, str_token, [](uint32_t cp) { - // also upper-case them to match again special_tokens array - return static_cast(cp >= 'a' ? cp - 'a' + 'A' : cp); - }); - // search the special tokens array for the str_token - cudf::string_view tokens(special_tokens, sizeof(special_tokens)); - return tokens.find(str_token, size) != cudf::string_view::npos; - } - - /** - * @brief Check code-points for special tokens and adjust indices. - * - * Tokens will appear in the `code_points` array as: - * `_[_ttt_]_` where `_` are single space characters and - * ttt is the variable-length token name - * - * The logic below uses the following variables to represent position - * values in the `code_points` array after locating a special token: - * ``` - * _ [ _ t t t _ ] _ - * ^ ^ ^ ^ - * si sp ep ei - * ``` - * where `si` is `start_index` - * `sp` is `start_pos` - * `ep` is `end_pos` - * `ei` is `end_index` - * - * When a special token is found, the `code_points` are adjusted - * to remove the spaces and capitalize the name. - * ``` - * _ [ _ t t t _ ] _ is updated to - * _ [ T T T ] _ ] _ - * ``` - * This is required for the downstream word-piece tokenizer to - * match it to the vocabulary hash table. - * - * The `start_word_indices` and `end_word_indices` are updated to - * identify the token and to ignore the extra trailing `]` character. - */ - __device__ void operator()(size_t idx) const - { - uint32_t const start_index = start_word_indices[idx]; - if ((start_index == cuda::std::numeric_limits::max()) || - ((start_index + MIN_ST_WIDTH + 2) > num_code_points)) - return; - if (code_points[start_index] != '[') return; - - // check for matching end bracket - uint32_t const start_pos = start_index + 2; // after the space delimiter - // search for next start-word and then check it is a ']' - uint32_t const end_index = [&] { - auto const begin = start_word_indices + start_pos; - auto const width = - cuda::std::min(static_cast(MAX_ST_WIDTH + 1), (num_code_points - start_pos)); - auto const end = begin + width; - // checking the next start-word is more reliable than arbitrarily searching for ']' - // in case the text is split across string rows - auto const iter = thrust::find_if(thrust::seq, begin + 1, end, [](auto swi) { - return swi != cuda::std::numeric_limits::max(); - }); - return iter == end ? start_index : static_cast(iter - start_word_indices); - }(); - if (code_points[end_index] != ']') return; - - // check for special token - auto const size = static_cast(end_index - start_pos); - if (!is_special_token(code_points + start_pos, size)) return; - - // special token found - // adjust code-points - auto const end_pos = end_index - 2; - // change _[_ttt_]_ to _[TTT]_ - for (auto left_idx = start_pos - 1; left_idx <= end_pos; ++left_idx) { - auto const cp = code_points[left_idx + 1]; - code_points[left_idx] = cp >= 'a' ? 
cp - 'a' + 'A' : cp; - } - code_points[end_pos] = ']'; - - // erase the intermediate indices - thrust::fill(thrust::seq, - start_word_indices + start_index + 1, // keep the first one - start_word_indices + end_index + 1, - cuda::std::numeric_limits::max()); - thrust::fill(thrust::seq, - end_word_indices + start_index, - end_word_indices + end_index + 1, - cuda::std::numeric_limits::max()); - - // reset the new end-word index - end_word_indices[end_pos] = end_pos + 1; - } - - uint32_t* const code_points; - uint32_t* const start_word_indices; - uint32_t* const end_word_indices; - size_t const num_code_points; -}; - -/** - * @brief Converts words into token ids. - * - * Each thread is assigned a word to convert based on the `hash_table`. Each thread converts - * its word and writes the number of tokens it found in the `tokens_per_word` array. - * - * The `tokens_per_word` array is kept to the length `num_code_points + 1`. This means each thread - * can write its number of tokens to the `tokens_per_word` corresponding to the starting - * character of each word. Since strings must start at some word, we can prefix sum this array - * and use the strings_lengths code point offsets to directly index the number of tokens in each - * string. - * - * The `token_ids` array should be initialized to the max uint32_t before calling this kernel. - * - * @param code_points An array containing all of the code points to be processed - * @param hash_table An array containing the flattened hash table with key, value pairs - * packed in 64-bits - * @param bin_coefficients A pointer to the GPU pointer containing the hashing parameters for - * each hash bin on the GPU. - * @param bin_offsets: A pointer to the GPU pointer containing the start index of each bin in - * the flattened hash table. - * @param token_ids The index for each token found during tokenization. This is of length - * num_code_points. In most cases, multiple characters will collapse to one token. In these - * cases, the max uint32_t will be in place. Cub will be used later to filter out these - * invalid ids later. - * @param word_starts An array of length `num_code_points`. The first total word elements contains - * the index of the first character for each word. - * @param word_ends An array of length num_code_points. The first total_words elements contains the - * past the end index for each word. This array is kept aligned with the initial - * token_ids array containing the word start code points. - * `word_ends[word] - filtered_start_indices[word] = word_length` - * @param tokens_per_word An array of size num_code_points that will contain the number of tokens in - * each word in a string. This array can be exclusive summed and the result used in - * conjunction with the strings lengths array to find the tokens in each string. This is - * possible since the number of tokens in each word will be placed at the index corresponding - * to the start character of a word. If we assume prefix_summed is the prefix sum of the - * tokens_per_word array, then `prefix_summed[strings_lengths[string_idx] - 1]` is the number - * of tokens found before the start of string. - * @param unk_token_id The token id to be place for unknown tokens - * @param max_word_length The maximum length of a word. Any word longer than this length is - * replaced by the unknown token. 
- * @param total_words The total number of white space separated words - * @param outer_hash_a_param The a parameter for the outer hash - * @param outer_hash_b_param: The b parameter for the outer hash - * @param num_outer_bins: The number of bins for the outer hash - */ -CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points, - uint64_t const* hash_table, - uint64_t const* bin_coefficients, - uint16_t const* bin_offsets, - uint16_t unk_token_id, - uint32_t outer_hash_a_param, - uint32_t outer_hash_b_param, - uint16_t num_outer_bins, - uint32_t const* word_starts, - uint32_t const* word_ends, - uint32_t max_word_length, - uint32_t total_words, - uint32_t* token_ids, - uint8_t* tokens_per_word) -{ - auto const word_to_tokenize = cudf::detail::grid_1d::global_thread_id(); - - if (word_to_tokenize >= total_words) { return; } - // Each thread gets the start code_point offset for each word and resets the token_id memory to - // the default value. In a post processing step, all of these values will be removed. - auto const token_start = word_starts[word_to_tokenize]; - auto const token_end = word_ends[word_to_tokenize]; - auto const word_length = token_end - token_start; - - // The sdbm hash of "##" - constexpr uint32_t hashtag_hash = 2296000; - uint16_t num_values_tokenized = 0; - // initialize start, end - uint32_t start = token_start; - uint32_t end = token_end; - - if (word_length > max_word_length) { - start = token_end; - num_values_tokenized = 1; - token_ids[token_start] = unk_token_id; - tokens_per_word[token_start] = num_values_tokenized; - } - - while (start < token_end) { - end = token_end; - // init token_id to no token - int token_id = -1; - // compute current length - uint32_t const length = token_end - start; - uint64_t substr_hash = - sdbm_hash(code_points + start, length, start == token_start ? 0 : hashtag_hash); - while (start < end) { - token_id = retrieve(substr_hash, - outer_hash_a_param, - outer_hash_b_param, - num_outer_bins, - hash_table, - bin_coefficients, - bin_offsets); - if (token_id != -1) { break; } - --end; - // Pop off the last value from the substr hash - substr_hash = prev_sdbm_hash(substr_hash, code_points[end]); - } - - if (token_id == -1) { - end = token_end; - token_id = unk_token_id; - // We need to clean up the global array. This case is very uncommon. - // Only 0.016% of words cannot be resolved to a token from the squad dev set. 
- for (uint32_t i = 1; i < num_values_tokenized; ++i) { - token_ids[token_start + i] = cuda::std::numeric_limits::max(); - } - num_values_tokenized = 0; - } - - token_ids[token_start + num_values_tokenized] = token_id; - ++num_values_tokenized; - start = end; - } - - tokens_per_word[token_start] = num_values_tokenized; -} - -} // namespace - -wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, - uint32_t max_sequence_length, - uint32_t stride, - bool do_truncate, - bool do_lower_case, - uint32_t max_word_length) - : vocab_table(vocab_table), - normalizer(vocab_table.cp_metadata->view().data(), - vocab_table.aux_cp_table->view().data(), - do_lower_case), - max_sequence_length{max_sequence_length}, - stride(stride), - do_truncate(do_truncate), - max_word_length{max_word_length} -{ -} - -uvector_pair wordpiece_tokenizer::tokenize(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream) -{ - auto cps_and_offsets = normalizer.normalize(input, stream); - tokenize(cps_and_offsets, stream); - return uvector_pair(std::move(cps_and_offsets.first), std::move(cps_and_offsets.second)); -} - -struct copy_if_fn { // inline lambda not allowed in private or protected member function - __device__ bool operator()(uint32_t cp) - { - return cp != cuda::std::numeric_limits::max(); - } -}; - -struct tranform_fn { // just converting uint8 value to uint32 - __device__ uint32_t operator()(uint8_t count) { return count; } -}; - -void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream) -{ - auto device_code_points = cps_and_offsets.first->data(); - auto const num_code_points = cps_and_offsets.first->size(); - auto device_strings_offsets = cps_and_offsets.second->data(); - auto const num_strings = cps_and_offsets.second->size() - 1; - - size_t const four_byte_cp_chunks = 1 + (num_code_points - 1) / sizeof(uint32_t); - size_t const rounded_num_cps = sizeof(uint32_t) * four_byte_cp_chunks; - rmm::device_uvector device_tokens_per_word(rounded_num_cps, stream); - rmm::device_uvector device_token_ids(num_code_points, stream); - rmm::device_uvector device_word_indices(2 * num_code_points, stream); - - // make device_start_word_indices and device_end_word_indices contiguous - uint32_t* device_start_word_indices = device_word_indices.data(); - uint32_t* device_end_word_indices = device_start_word_indices + num_code_points; - - cudf::detail::grid_1d const grid_init{static_cast(num_code_points), - THREADS_PER_BLOCK}; - detail::init_data_and_mark_word_start_and_ends<<>>(device_code_points, - device_start_word_indices, - device_end_word_indices, - num_code_points, - device_token_ids.data(), - device_tokens_per_word.data()); - CUDF_CHECK_CUDA(stream.value()); - - cudf::detail::grid_1d const grid_mark{static_cast(num_strings + 1), - THREADS_PER_BLOCK}; - detail::mark_string_start_and_ends<<>>(device_code_points, - device_strings_offsets, - device_start_word_indices, - device_end_word_indices, - num_strings); - CUDF_CHECK_CUDA(stream.value()); - - // check for special tokens and adjust indices - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_code_points, - mark_special_tokens{ - device_code_points, device_start_word_indices, device_end_word_indices, num_code_points}); - - // Now start_word_indices has the word starts scattered throughout the array. We need to select - // all values not equal to the max uint32_t and place them at the start of the array. 
We leverage - // the fact that the start_word_indices and the end_word indices are contiguous to only launch one - // device select kernel. - auto itr_end = thrust::remove(rmm::exec_policy(stream), - device_word_indices.begin(), - device_word_indices.end(), - cuda::std::numeric_limits::max()); - - // The number of tokens selected will be double the number of words since we - // select from both the start and end index arrays. - uint32_t const num_words = cuda::std::distance(device_word_indices.begin(), itr_end) / 2; - - // We need to change the end_word_indices pointer after the selection is complete - device_end_word_indices = device_start_word_indices + num_words; - - if (num_words > 0) { - cudf::detail::grid_1d const grid{static_cast(num_words), THREADS_PER_BLOCK}; - detail::kernel_wordpiece_tokenizer<<>>( - device_code_points, - vocab_table.table->view().data(), - vocab_table.bin_coefficients->view().data(), - vocab_table.bin_offsets->view().data(), - vocab_table.unknown_token_id, - vocab_table.outer_hash_a, - vocab_table.outer_hash_b, - vocab_table.num_bins, - device_start_word_indices, - device_end_word_indices, - max_word_length, - num_words, - device_token_ids.data(), - device_tokens_per_word.data()); - CUDF_CHECK_CUDA(stream.value()); - } - - // Repurpose the input array for the token ids. In the worst case, each code point ends up being a - // token so this will always have enough memory to store the contiguous tokens. - uint32_t* contiguous_token_ids = device_code_points; - auto const copy_size = // thrust::copy_if limited to copying int-max values - cuda::std::min(device_token_ids.size(), - static_cast(cuda::std::numeric_limits::max())); - auto ids_itr = device_token_ids.begin(); - auto const ids_end = device_token_ids.end(); - while (ids_itr != ids_end) { - auto const copy_end = (static_cast(std::distance(ids_itr, ids_end)) <= copy_size) - ? ids_end - : ids_itr + copy_size; - contiguous_token_ids = thrust::copy_if( - rmm::exec_policy(stream), ids_itr, copy_end, contiguous_token_ids, copy_if_fn{}); - ids_itr = copy_end; - } - - // Repurpose start word indices since it is the same size and type as the required output. - uint32_t* token_id_counts = device_start_word_indices; - thrust::transform_inclusive_scan(rmm::exec_policy(stream), - device_tokens_per_word.data(), - device_tokens_per_word.data() + num_code_points, - token_id_counts, - tranform_fn{}, - cuda::std::plus()); - - // Update the device_strings_offsets using the token_id_counts - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(1), - num_strings, - update_strings_lengths_fn{token_id_counts, device_strings_offsets}); -} - -} // namespace detail -} // namespace nvtext diff --git a/docs/cudf/source/pylibcudf/api_docs/nvtext/index.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/index.rst index 9ba47fd8d70..00314bceeb9 100644 --- a/docs/cudf/source/pylibcudf/api_docs/nvtext/index.rst +++ b/docs/cudf/source/pylibcudf/api_docs/nvtext/index.rst @@ -13,5 +13,4 @@ nvtext normalize replace stemmer - subword_tokenize tokenize diff --git a/docs/cudf/source/pylibcudf/api_docs/nvtext/subword_tokenize.rst b/docs/cudf/source/pylibcudf/api_docs/nvtext/subword_tokenize.rst deleted file mode 100644 index 818714bec6a..00000000000 --- a/docs/cudf/source/pylibcudf/api_docs/nvtext/subword_tokenize.rst +++ /dev/null @@ -1,6 +0,0 @@ -================ -subword_tokenize -================ - -.. 
automodule:: pylibcudf.nvtext.subword_tokenize - :members: diff --git a/docs/cudf/source/user_guide/api_docs/wordpiece_tokenizer.rst b/docs/cudf/source/user_guide/api_docs/wordpiece_tokenizer.rst index 188f0294a86..dc78447808b 100644 --- a/docs/cudf/source/user_guide/api_docs/wordpiece_tokenizer.rst +++ b/docs/cudf/source/user_guide/api_docs/wordpiece_tokenizer.rst @@ -10,10 +10,3 @@ Constructor WordPieceVocabulary WordPieceVocabulary.tokenize - -.. currentmodule:: cudf.core.subword_tokenizer -.. autosummary:: - :toctree: api/ - - SubwordTokenizer - SubwordTokenizer.__call__ diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index f906b83d5a7..1fa05377099 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -835,32 +835,6 @@ def is_letter(self, is_vowel: bool, index: int | NumericalColumn) -> Self: ) ) - @acquire_spill_lock() - def subword_tokenize( - self, - hashed_vocabulary: plc.nvtext.subword_tokenize.HashedVocabulary, - max_sequence_length: int = 64, - stride: int = 48, - do_lower: bool = True, - do_truncate: bool = False, - ) -> tuple[ColumnBase, ColumnBase, ColumnBase]: - """ - Subword tokenizes text series by using the pre-loaded hashed vocabulary - """ - result = plc.nvtext.subword_tokenize.subword_tokenize( - self.to_pylibcudf(mode="read"), - hashed_vocabulary, - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - # return the 3 tensor components - tokens = type(self).from_pylibcudf(result[0]) - masks = type(self).from_pylibcudf(result[1]) - metadata = type(self).from_pylibcudf(result[2]) - return tokens, masks, metadata - @acquire_spill_lock() def tokenize_scalar(self, delimiter: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py deleted file mode 100644 index 27976988efb..00000000000 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. - -from __future__ import annotations - -import warnings - -import cupy as cp -import numpy as np - -import pylibcudf as plc - - -def _cast_to_appropriate_type(ar, cast_type): - if cast_type == "cp": - return ar - - if cast_type == "pt": - from torch.utils.dlpack import from_dlpack - - elif cast_type == "tf": - from tensorflow.experimental.dlpack import from_dlpack - - return from_dlpack(ar.astype(np.dtype(np.int32)).__dlpack__()) - - -class SubwordTokenizer: - """ - Run CUDA BERT subword tokenizer on cuDF strings column. - Encodes words to token ids using vocabulary from a pretrained - tokenizer. - This function requires about 21x the number of character bytes - in the input strings column as working memory. - - Parameters - ---------- - hash_file : str - Path to hash file containing vocabulary of words with token-ids. - This can be created from the raw vocabulary - using the ``cudf.utils.hash_vocab_utils.hash_vocab`` function - - do_lower : bool, Default is True - If set to True, original text will be lowercased before encoding. - - Returns - ------- - SubwordTokenizer - """ - - def __init__(self, hash_file: str, do_lower_case: bool = True): - self.do_lower_case = do_lower_case - self.vocab_file = plc.nvtext.subword_tokenize.HashedVocabulary( - hash_file - ) - warnings.warn( - "SubwordTokenizer is deprecated and will be removed in a future " - "version. 
Use WordPieceVocabulary instead.", - FutureWarning, - ) - - def __call__( - self, - text, - max_length: int, - max_num_rows: int, - add_special_tokens: bool = True, - padding: str = "max_length", - truncation: bool | str = False, - stride: int = 0, - return_tensors: str = "cp", - return_token_type_ids: bool = False, - ): - """ - Run CUDA BERT subword tokenizer on cuDF strings column. - Encodes words to token ids using vocabulary from a - pretrained tokenizer. - - Parameters - ---------- - text : cudf string series - The batch of sequences to be encoded. - - max_length : int - Controls the maximum length to use or pad to. - - max_num_rows : int - Maximum number of rows for the output token-ids expected to - be generated by the tokenizer. - Used for allocating temporary working memory on the GPU device. - If the output generates a larger number of rows, - behavior is undefined. - This will vary based on stride, truncation, and max_length. - For example, for non-overlapping sequences output rows will be - the same as input rows. - A good default can be twice the max_length - - add_special_tokens : bool, optional, defaults to True - Whether or not to encode the sequences with the special tokens - of the BERT classification model - - padding : "max_length" - Pad to a maximum length specified with the argument max_length - - truncation : bool, defaults to False - True: - Truncate to a maximum length specified with the argument max_length - False or 'do_not_truncate': default - No truncation (Output differs from HuggingFace) - - stride : int, optional, defaults to 0 - The value of this argument defines the number of - overlapping tokens. - The information about the overlapping tokens is - present in the metadata outputted. - - return_tensors : str, {"cp", "pt", "tf"} defaults to "cp" - "cp" : Return cupy cp.ndarray objects - "tf" : Return TensorFlow tf.constant objects - "pt" : Return PyTorch torch.Tensor objects - - - return_token_type_ids : bool, optional - Only False currently supported - - Returns - ------- - An encoding with the following fields: - input_ids:(type defined by return_tensors) - A tensor of token ids to be fed to the model. - attention_mask: (type defined by return_tensors) - A tensor of indices specifying which tokens - should be attended to by the model - metadata: (type defined by return_tensors) - Each row contains the index id of the original string and the - first and last index of the token-ids that are non-padded and - non-overlapping - - Examples - -------- - >>> import cudf - >>> from cudf.utils.hash_vocab_utils import hash_vocab - >>> hash_vocab('bert-base-cased-vocab.txt', 'voc_hash.txt') - - - >>> from cudf.core.subword_tokenizer import SubwordTokenizer - >>> cudf_tokenizer = SubwordTokenizer('voc_hash.txt', - ... do_lower_case=True) - >>> str_series = cudf.Series(['This is the', 'best book']) - >>> tokenizer_output = cudf_tokenizer(str_series, - ... max_length=8, - ... max_num_rows=len(str_series), - ... padding='max_length', - ... return_tensors='pt', - ... 
truncation=True) - >>> tokenizer_output['input_ids'] - tensor([[ 101, 1142, 1110, 1103, 102, 0, 0, 0], - [ 101, 1436, 1520, 102, 0, 0, 0, 0]], - device='cuda:0', - dtype=torch.int32) - >>> tokenizer_output['attention_mask'] - tensor([[1, 1, 1, 1, 1, 0, 0, 0], - [1, 1, 1, 1, 0, 0, 0, 0]], - device='cuda:0', dtype=torch.int32) - >>> tokenizer_output['metadata'] - tensor([[0, 1, 3], - [1, 1, 2]], device='cuda:0', dtype=torch.int32) - """ - - if return_token_type_ids: - # raise not currently supported - # Can also return zeros - error_msg = "Returning token_type_ids is currently supported" - raise NotImplementedError(error_msg) - - if truncation in (False, "do_not_truncate"): - if add_special_tokens: - error_msg = ( - "Adding special tokens is not supported " - f"with truncation = {truncation}. " - ) - recommendation = ( - "Custom Cupy kernel can potentially " - "be used to add it. For reference " - "see: _bert_add_special_tokens" - ) - raise NotImplementedError(error_msg + recommendation) - - truncation = False - warning_msg = ( - "When truncation is not True, the behavior currently differs " - "from HuggingFace as cudf always returns overflowing tokens" - ) - warnings.warn(warning_msg) - - if padding != "max_length": - error_msg = ( - "Only padding to the provided max_lengthis currently supported" - ) - raise NotImplementedError(error_msg) - - if max_length <= stride: - error_msg = "Stride should be less than max_length" - raise ValueError(error_msg) - - if return_tensors not in {"cp", "pt", "tf"}: - error_msg = ( - "Only cupy(cp), pytorch(pt) and tensorflow(tf) " - "tensors are supported" - ) - raise NotImplementedError(error_msg) - - stride = max_length - stride - # behavior varies from subword_tokenize but maps with huggingface - - input_ids, attention_mask, metadata = text._column.subword_tokenize( - self.vocab_file, - max_sequence_length=max_length, - stride=stride, - do_lower=self.do_lower_case, - do_truncate=truncation, - ) - - tokenizer_output = { - "input_ids": cp.asarray(input_ids).reshape(-1, max_length), - "attention_mask": cp.asarray(attention_mask).reshape( - -1, max_length - ), - "metadata": cp.asarray(metadata).reshape(-1, 3), - } - - if add_special_tokens: - tokenizer_output = _bert_add_special_tokens(tokenizer_output) - - tokenizer_output = { - k: _cast_to_appropriate_type(v, return_tensors) - for k, v in tokenizer_output.items() - } - - return tokenizer_output - - -def _bert_add_special_tokens(token_o): - """ - Adds special tokens (CLS,SEP) which are often used by pre-trained BERT - models to input_ids and adjusts attention_mask and metadata to account - for them. 
- """ - max_length = token_o["input_ids"].shape[1] - seq_end_col = max_length - (token_o["input_ids"][:, ::-1] != 0).argmax(1) - # clipping to take overflow into account - seq_end_col = cp.clip(seq_end_col + 1, a_min=None, a_max=max_length - 1) - - _bert_add_special_tokens_input_ids(token_o["input_ids"], seq_end_col) - _bert_add_special_tokens_attention_mask( - token_o["attention_mask"], seq_end_col - ) - _bert_add_special_tokens_metadata(token_o["metadata"], max_length) - - return token_o - - -def _bert_add_special_tokens_input_ids(input_ids, seq_end_col): - """ - Add token ids for special tokens ([CLS] and [SEP]) to - the start and end of each sequence - """ - # Mark sequence start with [CLS] token mapping to the start of sequence - input_ids[:, 1:-1] = input_ids[:, 0:-2] - input_ids[:, 0] = 101 - # Mark end of sequence [SEP] - - input_ids[ - cp.arange(0, input_ids.shape[0], dtype=cp.uint32), seq_end_col - ] = 102 - - -def _bert_add_special_tokens_attention_mask(attention_mask, seq_end_col): - """ - Mark attention mask for special tokens ([CLS] and [SEP]) with 1 - """ - # Copy attention masks for all but last two - attention_mask[:, 1:-1] = attention_mask[:, 0:-2] - # Mark [CLS] token with 1 - attention_mask[:, 0] = 1 - # Mark [SEP] token with 1 - attention_mask[ - cp.arange(0, attention_mask.shape[0], dtype=cp.uint32), seq_end_col - ] = 1 - - -def _bert_add_special_tokens_metadata(metadata, max_length): - """ - Edit metadata to account for the added special tokens ([CLS] and [SEP]) - """ - # metadata seq starts from plus 1 - metadata[:, 1] = metadata[:, 1] + 1 - # clip done to take overflow into account - metadata[:, 2] = cp.clip( - metadata[:, 2] + 1, a_min=None, a_max=max_length - 2 - ) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd deleted file mode 100644 index 1ac69c87c4b..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libc.stdint cimport uint16_t, uint32_t -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from pylibcudf.exception_handler cimport libcudf_exception_handler -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view - - -cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil: - cdef cppclass tokenizer_result: - uint32_t nrows_tensor - uint32_t sequence_length - unique_ptr[column] tensor_token_ids - unique_ptr[column] tensor_attention_mask - unique_ptr[column] tensor_metadata - - cdef cppclass hashed_vocabulary: - uint16_t first_token_id - uint16_t separator_token_id - uint16_t unknown_token_id - uint32_t outer_hash_a - uint32_t outer_hash_b - uint16_t num_bin - unique_ptr[column] table - unique_ptr[column] bin_coefficients - unique_ptr[column] bin_offsets - unique_ptr[column] cp_metadata - unique_ptr[column] aux_cp_table - - cdef unique_ptr[hashed_vocabulary] load_vocabulary_file( - const string &filename_hashed_vocabulary - ) except +libcudf_exception_handler - - cdef tokenizer_result subword_tokenize( - const column_view & strings, - hashed_vocabulary & hashed_vocabulary_obj, - uint32_t max_sequence_length, - uint32_t stride, - bool do_lower, - bool do_truncate - ) except +libcudf_exception_handler - - cdef tokenizer_result subword_tokenize( - const column_view &strings, - const string &filename_hashed_vocabulary, - uint32_t max_sequence_length, - uint32_t stride, - bool do_lower, - bool do_truncate - ) except +libcudf_exception_handler - -cdef extern from "" namespace "std" nogil: - cdef tokenizer_result move(tokenizer_result) diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 842d6079749..5b0ecb6a7a8 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -24,7 +24,6 @@ set(cython_sources replace.pyx stemmer.pyx tokenize.pyx - subword_tokenize.pyx wordpiece_tokenize.pyx ) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 0fe6f657051..704fd03b242 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -11,7 +11,6 @@ from . cimport ( normalize, replace, stemmer, - subword_tokenize, tokenize, wordpiece_tokenize, ) @@ -27,7 +26,6 @@ __all__ = [ "normalize", "replace", "stemmer", - "subword_tokenize", "tokenize", "wordpiece_tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 4b18c2fce09..b67a5bcbff4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -11,7 +11,6 @@ normalize, replace, stemmer, - subword_tokenize, tokenize, wordpiece_tokenize, ) @@ -27,7 +26,6 @@ "normalize", "replace", "stemmer", - "subword_tokenize", "tokenize", "wordpiece_tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd deleted file mode 100644 index 091c7b897ac..00000000000 --- a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pxd +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -from libc.stdint cimport uint32_t -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from pylibcudf.column cimport Column -from pylibcudf.libcudf.nvtext.subword_tokenize cimport hashed_vocabulary - - -cdef class HashedVocabulary: - cdef unique_ptr[hashed_vocabulary] c_obj - -cpdef tuple[Column, Column, Column] subword_tokenize( - Column input, - HashedVocabulary vocabulary_table, - uint32_t max_sequence_length, - uint32_t stride, - bool do_lower_case, - bool do_truncate, -) diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi deleted file mode 100644 index f6618e296b1..00000000000 --- a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyi +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from pylibcudf.column import Column - -class HashedVocabulary: - def __init__(self, hash_file: str): ... - -def subword_tokenize( - input: Column, - vocabulary_table: HashedVocabulary, - max_sequence_length: int, - stride: int, - do_lower_case: bool, - do_truncate: bool, -) -> tuple[Column, Column, Column]: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx deleted file mode 100644 index 14fb6f5fe1e..00000000000 --- a/python/pylibcudf/pylibcudf/nvtext/subword_tokenize.pyx +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cython.operator cimport dereference -from libc.stdint cimport uint32_t -from libcpp cimport bool -from libcpp.string cimport string -from libcpp.utility cimport move -from pylibcudf.column cimport Column -from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( - load_vocabulary_file as cpp_load_vocabulary_file, - move as tr_move, - subword_tokenize as cpp_subword_tokenize, - tokenizer_result as cpp_tokenizer_result, -) - -__all__ = ["HashedVocabulary", "subword_tokenize"] - -cdef class HashedVocabulary: - """The vocabulary data for use with the subword_tokenize function. - - For details, see :cpp:class:`cudf::nvtext::hashed_vocabulary`. - """ - def __cinit__(self, hash_file): - cdef string c_hash_file = str(hash_file).encode() - with nogil: - self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) - - __hash__ = None - -cpdef tuple[Column, Column, Column] subword_tokenize( - Column input, - HashedVocabulary vocabulary_table, - uint32_t max_sequence_length, - uint32_t stride, - bool do_lower_case, - bool do_truncate, -): - """ - Creates a tokenizer that cleans the text, splits it into - tokens and returns token-ids from an input vocabulary. - - For details, see cpp:func:`subword_tokenize` - - Parameters - ---------- - input : Column - The input strings to tokenize. - vocabulary_table : HashedVocabulary - The vocabulary table pre-loaded into this object. - max_sequence_length : uint32_t - Limit of the number of token-ids per row in final tensor for each string. - stride : uint32_t - Each row in the output token-ids will replicate - ``max_sequence_length`` - ``stride`` the token-ids - from the previous row, unless it is the first string. - do_lower_case : bool - If true, the tokenizer will convert uppercase characters in the - input stream to lower-case and strip accents from those characters. - If false, accented and uppercase characters are not transformed. - do_truncate : bool - If true, the tokenizer will discard all the token-ids after - ``max_sequence_length`` for each input string. 
If false, it - will use a new row in the output token-ids to continue - generating the output. - - Returns - ------- - tuple[Column, Column, Column] - A tuple of three columns containing the - tokens, masks, and metadata. - """ - cdef cpp_tokenizer_result c_result - with nogil: - c_result = tr_move( - cpp_subword_tokenize( - input.view(), - dereference(vocabulary_table.c_obj.get()), - max_sequence_length, - stride, - do_lower_case, - do_truncate, - ) - ) - cdef Column tokens = Column.from_libcudf(move(c_result.tensor_token_ids)) - cdef Column masks = Column.from_libcudf(move(c_result.tensor_attention_mask)) - cdef Column metadata = Column.from_libcudf(move(c_result.tensor_metadata)) - return tokens, masks, metadata From a3ce3a1b1bf730df7efa3a80163acd93115f84cd Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 4 Aug 2025 12:49:14 -0400 Subject: [PATCH 048/366] Fix OOB memcheck error in group_rank_to_percentage utility (#19567) Fixes out-of-bounds memory read access in `cudf::groupby::detail::group_rank_to_percentage` utility. ``` ========= Invalid __global__ read of size 4 bytes ========= at cudf::groupby::detail::group_rank_to_percentage(cudf::rank_method, cudf::rank_percentage, const cudf::column_view &, const cudf::column_view &, cudf::device_span, cudf::device_span, rmm::cuda_stream_view, cuda::mr::__4::basic_resource_ref<(cuda::mr::__4::_AllocType)1, cuda::mr::__4::device_accessible>)::[lambda(int) (instance 1)]::operator ()(int) const+0x1650 in group_rank_scan.cu:313 ========= by thread (0,0,0) in block (0,0,0) ========= Access at 0x7beb8dc01dfc is out of bounds ========= and is 4 bytes before the nearest allocation at 0x7beb8dc01e00 of size 400 bytes ========= Device Frame: thrust::cuda_cub::__tabulate::functor, cudf::device_span, rmm::cuda_stream_view, cuda::mr::__4::basic_resource_ref<(cuda::mr::__4::_AllocType)1, cuda::mr::__4::device_accessible>)::[lambda(int) (instance 1)], long>::operator ()(long)+0x1520 in tabulate.h:66 ... ``` This was found using compute-sanitizer on the following pytest command in `/cudf/python/cudf/cudf/tests`: ``` pytest test_groupby.py -k'test_groupby_2keys_rank[True-keep-True-dense-100]' ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/cudf/pull/19567 --- cpp/src/groupby/sort/group_rank_scan.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index a0ba81bccb2..8e266d42077 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -300,7 +300,7 @@ std::unique_ptr group_rank_to_percentage(rank_method const method, double const r = is_double ? d_rank[row_index] : s_rank[row_index]; auto const count = dcount[labels[row_index]]; size_type const last_rank_index = offsets[labels[row_index]] + count - 1; - auto const last_rank = s_rank[last_rank_index]; + auto const last_rank = last_rank_index < 0 ? 1 : s_rank[last_rank_index]; return percentage == rank_percentage::ZERO_NORMALIZED ? 
r / last_rank : one_normalized(r, last_rank); From bfa6da8b6a959aef2636c9ed5cf6995a7533d730 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 4 Aug 2025 13:46:08 -0400 Subject: [PATCH 049/366] Avoid querying device memory on systems without it in dask-cudf (#19577) This PR updates dask-cudf's `_get_device_size` (which is used to determine the blocksize when reading parquet files) to catch `pynvml.NVMLError_NotSupported`, which is raised on systems without traditional dedicated GPU memory. Related PR #19575 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/cudf/pull/19577 --- python/dask_cudf/dask_cudf/io/parquet.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index e075971f01f..3fa5a81965b 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -52,12 +52,16 @@ def _get_device_size(): if index and not index.isnumeric(): # This means index is UUID. This works for both MIG and non-MIG device UUIDs. handle = pynvml.nvmlDeviceGetHandleByUUID(str.encode(index)) + if pynvml.nvmlDeviceIsMigDeviceHandle(handle): + handle = pynvml.nvmlDeviceGetDeviceHandleFromMigDeviceHandle( + handle + ) else: # This is a device index handle = pynvml.nvmlDeviceGetHandleByIndex(int(index)) return pynvml.nvmlDeviceGetMemoryInfo(handle).total - except ValueError: + except (ValueError, pynvml.NVMLError_NotSupported): # Fall back to a conservative 8GiB default return 8 * 1024**3 From cdf6a3ab79e7b38d03d14b65158fd969a240eb13 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 4 Aug 2025 13:46:47 -0400 Subject: [PATCH 050/366] Avoid querying device memory on systems without it in cudf-polars benchmarks (#19575) This PR updates the cudf-polars benchmark suite to catch `pynvml.NVMLError_NotSupported`, which is raised on systems without traditional dedicated GPU memory. This should have been included in [#19444](https://github.com/rapidsai/cudf/pull/19444), which closed [#19427](https://github.com/rapidsai/cudf/issues/19427). It was an oversight on my part, since testing was done primarily on systems with dedicated device memory.
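Both this and the dask-cudf change above reduce to the same defensive pattern: treat the NVML memory query as optional. A minimal sketch of that pattern (illustrative only; the helper name and default device index are not part of either diff):

```
# Sketch: query total device memory, tolerating systems (e.g. Grace Hopper)
# where NVML does not report traditional dedicated GPU memory.
import pynvml


def total_device_memory(index: int = 0) -> int | None:
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    try:
        return pynvml.nvmlDeviceGetMemoryInfo(handle).total
    except pynvml.NVMLError_NotSupported:
        # No dedicated GPU memory to report; the caller decides whether to
        # fall back to a default (as dask-cudf does) or carry None through
        # (as the benchmark suite does).
        return None
```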
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/19575 --- .../experimental/benchmarks/utils.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py index 7e56e3c57d3..11e6442baab 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py @@ -105,23 +105,35 @@ class GPUInfo: name: str index: int - free_memory: int - used_memory: int - total_memory: int + free_memory: int | None + used_memory: int | None + total_memory: int | None @classmethod def from_index(cls, index: int) -> GPUInfo: """Create a GPUInfo from an index.""" pynvml.nvmlInit() handle = pynvml.nvmlDeviceGetHandleByIndex(index) - memory = pynvml.nvmlDeviceGetMemoryInfo(handle) - return cls( - name=pynvml.nvmlDeviceGetName(handle), - index=index, - free_memory=memory.free, - used_memory=memory.used, - total_memory=memory.total, - ) + try: + memory = pynvml.nvmlDeviceGetMemoryInfo(handle) + return cls( + name=pynvml.nvmlDeviceGetName(handle), + index=index, + free_memory=memory.free, + used_memory=memory.used, + total_memory=memory.total, + ) + except pynvml.NVMLError_NotSupported: + # Happens on systems without traditional GPU memory (e.g., Grace Hopper), + # where nvmlDeviceGetMemoryInfo is not supported. + # See: https://github.com/rapidsai/cudf/issues/19427 + return cls( + name=pynvml.nvmlDeviceGetName(handle), + index=index, + free_memory=None, + used_memory=None, + total_memory=None, + ) @dataclasses.dataclass From f1892a687cf197d8fb9785d25bb2f728be3ffbd0 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 4 Aug 2025 13:48:24 -0400 Subject: [PATCH 051/366] Expose `filter` and `columns` parquet reader builder options to python (#19566) Exposes `filter` and `columns` options so we can use a chaining style when building the options object before passing it to the reader.
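A rough sketch of the resulting chaining style (assuming pylibcudf's usual `SourceInfo` and `read_parquet` entry points; the file path and column names are placeholders):

```
import pylibcudf as plc

# Build the reader options fluently; `columns` is one of the two builder
# methods exposed by this PR.
options = (
    plc.io.parquet.ParquetReaderOptions.builder(
        plc.io.SourceInfo(["data.parquet"])
    )
    .columns(["a", "b"])
    .use_pandas_metadata(True)
    .build()
)
table_with_metadata = plc.io.parquet.read_parquet(options)
```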
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19566 --- python/pylibcudf/pylibcudf/io/parquet.pxd | 2 ++ python/pylibcudf/pylibcudf/io/parquet.pyx | 37 ++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 2a925b23f6e..63d96c4af3e 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -53,6 +53,8 @@ cdef class ParquetReaderOptionsBuilder: cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val) cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val) cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val) + cpdef ParquetReaderOptionsBuilder filter(self, Expression filter) + cpdef ParquetReaderOptionsBuilder columns(self, list col_names) cpdef build(self) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index 42803d8d8fb..29dfb8df0aa 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -171,7 +171,7 @@ cdef class ParquetReaderOptions: ------- None """ - self.c_obj.set_filter(dereference(filter.c_obj.get())) + self.c_obj.set_filter(dereference(filter.c_obj)) cdef class ParquetReaderOptionsBuilder: @@ -241,6 +241,41 @@ cdef class ParquetReaderOptionsBuilder: self.c_obj.use_arrow_schema(val) return self + cpdef ParquetReaderOptionsBuilder filter(self, Expression filter): + """ + Sets AST based filter for predicate pushdown. + + Parameters + ---------- + filter : Expression + AST expression to use as filter + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.filter(dereference(filter.c_obj)) + return self + + cpdef ParquetReaderOptionsBuilder columns(self, list col_names): + """ + Sets names of the columns to be read. 
+ + Parameters + ---------- + col_names : list[str] + List of column names + + Returns + ------- + ParquetReaderOptionsBuilder + """ + cdef vector[string] vec + for name in col_names: + vec.push_back(str(name).encode()) + self.c_obj.columns(vec) + return self + cpdef build(self): """Create a ParquetReaderOptions object""" cdef ParquetReaderOptions parquet_options = ParquetReaderOptions.__new__( From e977f5c54e0db6648287e231cb1ee45b72979ae0 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 4 Aug 2025 14:56:30 -0400 Subject: [PATCH 052/366] Update rapids-build-backend to 0.4.0 (#19580) Issue: https://github.com/rapidsai/build-planning/issues/207 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19580 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/recipe.yaml | 2 +- conda/recipes/cudf/recipe.yaml | 2 +- conda/recipes/cudf_kafka/recipe.yaml | 2 +- conda/recipes/custreamz/recipe.yaml | 2 +- conda/recipes/dask-cudf/recipe.yaml | 2 +- conda/recipes/pylibcudf/recipe.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/pyproject.toml | 2 +- python/cudf_kafka/pyproject.toml | 2 +- python/cudf_polars/pyproject.toml | 2 +- python/custreamz/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- python/libcudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 15af26470a4..0e95832dddd 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -81,7 +81,7 @@ dependencies: - python-xxhash - python>=3.10,<3.14 - pytorch>=2.4.0 -- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 - rapids-dask-dependency==25.10.*,>=0.0.0a0 - rapids-logger==0.1.*,>=0.0.0a0 - rich diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index e606a1cd4c3..e96b8d81953 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -82,7 +82,7 @@ dependencies: - python-xxhash - python>=3.10,<3.14 - pytorch>=2.4.0 -- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 - rapids-dask-dependency==25.10.*,>=0.0.0a0 - rapids-logger==0.1.*,>=0.0.0a0 - rich diff --git a/conda/recipes/cudf-polars/recipe.yaml b/conda/recipes/cudf-polars/recipe.yaml index 46ab07ab81c..d3c080249f1 100644 --- a/conda/recipes/cudf-polars/recipe.yaml +++ b/conda/recipes/cudf-polars/recipe.yaml @@ -43,7 +43,7 @@ requirements: host: - python =${{ py_version }} - pip - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - setuptools - cuda-version =${{ cuda_version }} run: diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index 98e82c95bce..38e32b5c1f2 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -52,7 +52,7 @@ requirements: - python =${{ py_version }} - pip - cython >=3.0.3 - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - numba-cuda >=0.14.0,<0.15.0a0 diff --git a/conda/recipes/cudf_kafka/recipe.yaml b/conda/recipes/cudf_kafka/recipe.yaml index 
0d2c8cc39cc..100dc270915 100644 --- a/conda/recipes/cudf_kafka/recipe.yaml +++ b/conda/recipes/cudf_kafka/recipe.yaml @@ -55,7 +55,7 @@ requirements: - cuda-version =${{ cuda_version }} - pylibcudf =${{ version }} - libcudf_kafka =${{ version }} - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - cuda-cudart-dev run: diff --git a/conda/recipes/custreamz/recipe.yaml b/conda/recipes/custreamz/recipe.yaml index 36535c4f472..4e8644b046e 100644 --- a/conda/recipes/custreamz/recipe.yaml +++ b/conda/recipes/custreamz/recipe.yaml @@ -28,7 +28,7 @@ requirements: host: - python =${{ py_version }} - pip - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - setuptools - python-confluent-kafka >=2.8.0,<2.9.0a0 - cudf_kafka =${{ version }} diff --git a/conda/recipes/dask-cudf/recipe.yaml b/conda/recipes/dask-cudf/recipe.yaml index eaa05196c9d..fc9e20f4192 100644 --- a/conda/recipes/dask-cudf/recipe.yaml +++ b/conda/recipes/dask-cudf/recipe.yaml @@ -28,7 +28,7 @@ requirements: host: - python =${{ py_version }} - pip - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - setuptools - cuda-version =${{ cuda_version }} run: diff --git a/conda/recipes/pylibcudf/recipe.yaml b/conda/recipes/pylibcudf/recipe.yaml index e5fec6983c4..2d2cf0a630f 100644 --- a/conda/recipes/pylibcudf/recipe.yaml +++ b/conda/recipes/pylibcudf/recipe.yaml @@ -52,7 +52,7 @@ requirements: - python =${{ py_version }} - pip - cython >=3.0.3 - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - libcudf =${{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index 0153304a4e2..7e53ff8a959 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -465,7 +465,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0 + - &rapids_build_backend rapids-build-backend>=0.4.0,<0.5.0.dev0 - output_types: conda packages: - scikit-build-core>=0.10.0 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index f1a678e5326..bd0bf9087a0 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -3,7 +3,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index f4be4552feb..c5c0837ad67 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -3,7 +3,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index a69b5551b09..472520df984 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -3,7 +3,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 0b961894d2d..244906cf204 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -3,7 +3,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index f0d1d91fbfe..b8603849892 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -3,7 +3,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 7b69543d898..bc0bf5b1edf 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -15,7 +15,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 561f9eccd64..85d8693b1c3 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -3,7 +3,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From dbb33055295e61d810f2f9e412dda445b42c572c Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Mon, 4 Aug 2025 16:10:33 -0400 Subject: [PATCH 053/366] ci(labeler): update labeler action to @v5 (#19581) Bumps the version of `actions/labeler` to `@v5` and updates the syntax in the `labeler.yml` file to account for breaking changes in that version bump. 
xref: rapidsai/ops#2968 Authors: - Gil Forsyth (https://github.com/gforsyth) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19581 --- .github/labeler.yml | 42 +++++++++++++++++++++-------------- .github/workflows/labeler.yml | 2 +- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index 63ef619b64e..87077c8ea9e 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,26 +1,34 @@ # Documentation for config - https://github.com/actions/labeler#common-examples Python: - - 'python/**' - - 'notebooks/**' - + - changed-files: + any-glob-to-any-file: + - 'python/**' + - 'notebooks/**' cudf.pandas: - - 'python/cudf/cudf/pandas/**' - - 'python/cudf/cudf_pandas_tests/**' - + - changed-files: + any-glob-to-any-file: + - 'python/cudf/cudf/pandas/**' + - 'python/cudf/cudf_pandas_tests/**' cudf-polars: - - 'python/cudf_polars/**' - + - changed-files: + any-glob-to-any-file: + - 'python/cudf_polars/**' pylibcudf: - - 'python/pylibcudf/**' - + - changed-files: + any-glob-to-any-file: + - 'python/pylibcudf/**' libcudf: - - 'cpp/**' - + - changed-files: + any-glob-to-any-file: + - 'cpp/**' CMake: - - '**/CMakeLists.txt' - - '**/cmake/**' - - '**/*.cmake' - + - changed-files: + any-glob-to-any-file: + - '**/CMakeLists.txt' + - '**/cmake/**' + - '**/*.cmake' Java: - - 'java/**' + - changed-files: + any-glob-to-any-file: + - 'java/**' diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index acfefc5e4af..fa134653d49 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -13,6 +13,6 @@ jobs: persist-credentials: false sparse-checkout: .github/labeler.yml sparse-checkout-cone-mode: false - - uses: actions/labeler@v4 + - uses: actions/labeler@v5 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" From 5853debea4ae85b48b99e8f556973ea244f6a71a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Aug 2025 16:08:35 -0700 Subject: [PATCH 054/366] Construct cuDF classic columns with __array_interface__ through pylibcudf (#19538) Towards https://github.com/rapidsai/cudf/issues/18726 Use `pylibcudf.Column.from_array_interface` or pyarrow to convert an object implementing `__array_interface__` to a cuDF classic column instead of using the custom Buffer class. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19538 --- python/cudf/cudf/core/column/column.py | 67 ++++++++++--------- .../cudf/pandas/scripts/conftest-patch.py | 56 ++-------------- .../cudf/tests/series/test_constructors.py | 5 ++ 3 files changed, 45 insertions(+), 83 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ac2d9d12752..f3d8a7798be 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -3132,34 +3132,37 @@ def as_column( # TODO: Or treat as scalar? arbitrary = arbitrary[np.newaxis] - if arbitrary.dtype.kind in "OSU": - if pd.isna(arbitrary).any(): - new_array = pa.array(arbitrary) + if arbitrary.dtype.kind == "O": + is_na = pd.isna(arbitrary) + if is_na.any(): + if is_na.all(): + # Avoid pyarrow converting np.ndarray[object] of all NaNs to float + raise MixedTypeError( + "Cannot have all NaN values with object dtype." + ) + arbitrary = pa.array(arbitrary) else: # Let pandas potentially infer object type # e.g.
np.array([pd.Timestamp(...)], dtype=object) -> datetime64 - new_array = pd.Series(arbitrary) - res = as_column(new_array, dtype=dtype, nan_as_null=nan_as_null) - if ( - cudf.get_option("mode.pandas_compatible") - and res.dtype.kind == "f" - and arbitrary.dtype.kind == "O" - ): - raise MixedTypeError( - "Cannot create column with mixed types, " - "pandas Series with object dtype cannot be converted to cudf Series with float dtype." - ) - return res + arbitrary = pd.Series(arbitrary) + return as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null) + elif arbitrary.dtype.kind in "SU": + result_column = ColumnBase.from_arrow(pa.array(arbitrary)) + if dtype is not None: + result_column = result_column.astype(dtype) + return result_column elif arbitrary.dtype.kind in "biuf": - from_pandas = nan_as_null is None or nan_as_null if not arbitrary.dtype.isnative: - # Not supported by pyarrow + # Not supported by pylibcudf arbitrary = arbitrary.astype(arbitrary.dtype.newbyteorder("=")) - return as_column( - pa.array(arbitrary, from_pandas=from_pandas), - dtype=dtype, - nan_as_null=nan_as_null, + result_column = ColumnBase.from_pylibcudf( + plc.Column.from_array_interface(arbitrary) ) + if nan_as_null is not False: + result_column = result_column.nans_to_nulls() + if dtype is not None: + result_column = result_column.astype(dtype) + return result_column elif arbitrary.dtype.kind in "mM": time_unit = np.datetime_data(arbitrary.dtype)[0] if time_unit in ("D", "W", "M", "Y"): @@ -3175,7 +3178,7 @@ def as_column( is_nat = np.isnat(arbitrary) mask = None if is_nat.any(): - if nan_as_null is None or nan_as_null: + if nan_as_null is not False: # Convert NaT to NA, which pyarrow does by default return as_column( pa.array(arbitrary), @@ -3184,16 +3187,16 @@ def as_column( ) # Consider NaT as NA in the mask # but maintain NaT as a value - mask = as_column(~is_nat).as_mask() - buffer = as_buffer(arbitrary.view("|u1")) - col = build_column( - data=buffer, - mask=mask, - dtype=arbitrary.dtype, - ) - if dtype: - col = col.astype(dtype) - return col + mask = plc.Column.from_array_interface(~is_nat) + plc_column = plc.Column.from_array_interface(arbitrary) + if mask is not None: + plc_column = plc_column.with_mask( + *plc.transform.bools_to_mask(mask) + ) + result_column = ColumnBase.from_pylibcudf(plc_column) + if dtype is not None: + result_column = result_column.astype(dtype) + return result_column else: raise NotImplementedError(f"{arbitrary.dtype} not supported") elif (view := as_memoryview(arbitrary)) is not None: diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 9520f09945e..4dc5b3d42ab 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -5005,10 +5005,6 @@ def pytest_unconfigure(config): "tests/frame/indexing/test_indexing.py::test_adding_new_conditional_column_with_string[string[pyarrow_numpy]-True]", "tests/frame/indexing/test_indexing.py::test_object_casting_indexing_wraps_datetimelike", "tests/frame/indexing/test_insert.py::TestDataFrameInsert::test_insert", - "tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_frame_setitem_datetime64_col_other_units[h]", - "tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_frame_setitem_datetime64_col_other_units[m]", - "tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_frame_setitem_existing_datetime64_col_other_units[h]", - 
"tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_frame_setitem_existing_datetime64_col_other_units[m]", "tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_list_missing_columns[columns3-box3-expected3]", "tests/frame/indexing/test_setitem.py::TestDataFrameSetItem::test_setitem_multi_index", "tests/frame/indexing/test_setitem.py::TestDataFrameSetItemSlicing::test_setitem_slice_indexer_broadcasting_rhs[1-Series-iloc]", @@ -5089,27 +5085,9 @@ def pytest_unconfigure(config): "tests/frame/methods/test_astype.py::TestAstype::test_astype_dt64tz", "tests/frame/methods/test_astype.py::TestAstype::test_astype_dt64tz_to_str", "tests/frame/methods/test_astype.py::TestAstype::test_astype_extension_dtypes_duplicate_col[Int64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_from_datetimelike_to_object[h-M8]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_from_datetimelike_to_object[h-m8]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_from_datetimelike_to_object[m-M8]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_from_datetimelike_to_object[m-m8]", "tests/frame/methods/test_astype.py::TestAstype::test_astype_str_float", "tests/frame/methods/test_astype.py::TestAstype::test_astype_td64_to_string[DataFrame]", "tests/frame/methods/test_astype.py::TestAstype::test_astype_td64_to_string[Series]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetime_unit[h]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetime_unit[m]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetimelike_unit[h-M8-float64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetimelike_unit[h-M8-int64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetimelike_unit[h-m8-float64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetimelike_unit[h-m8-int64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetimelike_unit[m-M8-float64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetimelike_unit[m-M8-int64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetimelike_unit[m-m8-float64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_datetimelike_unit[m-m8-int64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_incorrect_datetimelike[h]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_incorrect_datetimelike[m]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_timedelta_unit[h]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_to_timedelta_unit[m]", "tests/frame/methods/test_astype.py::TestAstype::test_astype_with_exclude_string", "tests/frame/methods/test_astype.py::TestAstypeCategorical::test_astype_categorical_to_string_missing", "tests/frame/methods/test_astype.py::test_astype_to_string_not_modifying_input[pyarrow_numpy-None]", @@ -5176,6 +5154,7 @@ def pytest_unconfigure(config): "tests/frame/methods/test_explode.py::test_duplicate_index[input_dict1-input_index1-expected_dict1-expected_index1]", "tests/frame/methods/test_explode.py::test_duplicate_index[input_dict2-input_index2-expected_dict2-expected_index2]", "tests/frame/methods/test_explode.py::test_duplicate_index[input_dict3-input_index3-expected_dict3-expected_index3]", + "tests/frame/methods/test_explode.py::test_multi_columns_nan_empty", 
"tests/frame/methods/test_fillna.py::TestFillNA::test_fillna_dict_inplace_nonunique_columns", "tests/frame/methods/test_fillna.py::TestFillNA::test_fillna_dtype_conversion", "tests/frame/methods/test_fillna.py::TestFillNA::test_fillna_on_column_view", @@ -5372,6 +5351,7 @@ def pytest_unconfigure(config): "tests/frame/methods/test_set_index.py::TestSetIndex::test_set_index_pass_single_array[True-True-test-1]", "tests/frame/methods/test_set_index.py::TestSetIndex::test_set_index_pass_single_array[True-True-test-Index]", "tests/frame/methods/test_set_index.py::TestSetIndexCustomLabelType::test_set_index_custom_label_hashable_iterable", + "tests/frame/methods/test_shift.py::TestDataFrameShift::test_shift_bool", "tests/frame/methods/test_shift.py::TestDataFrameShift::test_shift_dt64values_axis1_invalid_fill[datetime64[ns]-False]", "tests/frame/methods/test_shift.py::TestDataFrameShift::test_shift_dt64values_axis1_invalid_fill[timedelta64[ns]-False]", "tests/frame/methods/test_shift.py::TestDataFrameShift::test_shift_dt64values_int_fill_deprecated", @@ -5499,7 +5479,6 @@ def pytest_unconfigure(config): "tests/frame/test_arithmetic.py::test_pow_with_realignment[python]", "tests/frame/test_block_internals.py::TestDataFrameBlockInternals::test_consolidate", "tests/frame/test_block_internals.py::TestDataFrameBlockInternals::test_construction_with_conversions", - "tests/frame/test_block_internals.py::TestDataFrameBlockInternals::test_constructor_with_convert", "tests/frame/test_block_internals.py::TestDataFrameBlockInternals::test_modify_values", "tests/frame/test_block_internals.py::TestDataFrameBlockInternals::test_stale_cached_series_bug_473", "tests/frame/test_block_internals.py::test_update_inplace_sets_valid_block_values", @@ -5532,14 +5511,6 @@ def pytest_unconfigure(config): "tests/frame/test_constructors.py::TestDataFrameConstructors::test_construct_from_list_of_datetimes", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_construct_ndarray_with_nas_and_int_dtype", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_construct_with_two_categoricalindex_series", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_non_ns[h-A]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_non_ns[h-C]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_non_ns[h-F]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_non_ns[h-K]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_non_ns[m-A]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_non_ns[m-C]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_non_ns[m-F]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_non_ns[m-K]", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_with_nulls[arr0]", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_with_nulls[arr1]", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_dict", @@ -5564,14 +5535,6 @@ def pytest_unconfigure(config): "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_mixed_dict_and_Series", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_ndarray_copy", 
"tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_rec", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_timedelta_non_ns[h-A]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_timedelta_non_ns[h-C]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_timedelta_non_ns[h-F]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_timedelta_non_ns[h-K]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_timedelta_non_ns[m-A]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_timedelta_non_ns[m-C]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_timedelta_non_ns[m-F]", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_timedelta_non_ns[m-K]", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_with_datetimes1", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_with_datetimes2", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_with_datetimes3", @@ -5810,6 +5773,7 @@ def pytest_unconfigure(config): "tests/frame/test_constructors.py::TestDataFrameConstructors::test_dict_nocopy[UInt8-uint64-False]", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_dict_nocopy[UInt8-uint8-False]", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_frame_string_inference", + "tests/frame/test_constructors.py::TestDataFrameConstructors::test_frame_string_inference_array_string_dtype", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_from_datetime_subclass", "tests/frame/test_constructors.py::TestFromScalar::test_from_out_of_bounds_ns_datetime[DataFrame-None-datetime64]", "tests/frame/test_constructors.py::TestFromScalar::test_from_out_of_bounds_ns_datetime[DataFrame-dict-datetime64]", @@ -11258,8 +11222,6 @@ def pytest_unconfigure(config): "tests/reshape/merge/test_merge.py::TestMerge::test_merge_on_index_with_more_values[index9-expected_index9-right]", "tests/reshape/merge/test_merge.py::TestMerge::test_merge_right_index_right", "tests/reshape/merge/test_merge.py::TestMerge::test_merge_take_missing_values_from_index_of_other_dtype", - "tests/reshape/merge/test_merge.py::TestMerge::test_other_timedelta_unit[h]", - "tests/reshape/merge/test_merge.py::TestMerge::test_other_timedelta_unit[m]", "tests/reshape/merge/test_merge.py::TestMergeCategorical::test_dtype_on_categorical_dates", "tests/reshape/merge/test_merge.py::TestMergeCategorical::test_dtype_on_merged_different[inner-1]", "tests/reshape/merge/test_merge.py::TestMergeCategorical::test_dtype_on_merged_different[left-0]", @@ -13060,6 +13022,7 @@ def pytest_unconfigure(config): "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_dtype[Float64-input_data4-to_replace4-expected_data4]", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_dtype[Int64-input_data2-to_replace2-expected_data2]", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_dtype[string-input_data5-to_replace5-expected_data5]", + "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_explicit_none", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_mixed_types", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_na_in_obj_column[Int64]", 
"tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_no_cast[ser0-exp0]", @@ -13235,14 +13198,6 @@ def pytest_unconfigure(config): "tests/series/test_arithmetic.py::test_none_comparison[python-uint64]", "tests/series/test_arithmetic.py::test_none_comparison[python-uint8]", "tests/series/test_constructors.py::TestSeriesConstructors::test_categorical_sideeffects_free", - "tests/series/test_constructors.py::TestSeriesConstructors::test_construction_to_datetimelike_unit[h-M-float64]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_construction_to_datetimelike_unit[h-M-int64]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_construction_to_datetimelike_unit[h-m-float64]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_construction_to_datetimelike_unit[h-m-int64]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_construction_to_datetimelike_unit[m-M-float64]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_construction_to_datetimelike_unit[m-M-int64]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_construction_to_datetimelike_unit[m-m-float64]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_construction_to_datetimelike_unit[m-m-int64]", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_bool_dtype_missing_values", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_coerce_float_fail[int16]", @@ -13273,7 +13228,6 @@ def pytest_unconfigure(config): "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_data_aware_dtype_naive[tzutc()-True]", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_data_aware_dtype_naive[zoneinfo.ZoneInfo(key='US/Pacific')-True]", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_data_aware_dtype_naive[zoneinfo.ZoneInfo(key='UTC')-True]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_datetime64_bigendian", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_datetimes_with_nulls", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_dict_datetime64_index", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_dtype_datetime64_10", @@ -13303,6 +13257,7 @@ def pytest_unconfigure(config): "tests/series/test_constructors.py::TestSeriesConstructors::test_series_ctor_plus_datetimeindex", "tests/series/test_constructors.py::TestSeriesConstructors::test_series_from_index_dtype_equal_does_not_copy", "tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_inference", + "tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_inference_array_string_dtype", "tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_inference_scalar", "tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_inference_storage_definition", "tests/series/test_constructors.py::TestSeriesConstructors::test_series_string_with_na_inference[None]", @@ -13862,7 +13817,6 @@ def pytest_unconfigure(config): "tests/tools/test_to_numeric.py::test_to_numeric_large_float_not_downcast_to_float_32[9876543210.0]", "tests/tools/test_to_timedelta.py::TestTimedeltas::test_to_timedelta_nullable_int64_dtype[None-None]", 
"tests/tools/test_to_timedelta.py::TestTimedeltas::test_to_timedelta_nullable_int64_dtype[expected_val0-2]", - "tests/tools/test_to_timedelta.py::TestTimedeltas::test_to_timedelta_oob_non_nano", "tests/tools/test_to_timedelta.py::TestTimedeltas::test_to_timedelta_series", "tests/tools/test_to_timedelta.py::TestTimedeltas::test_to_timedelta_via_apply", "tests/tseries/frequencies/test_inference.py::test_infer_freq_tz_transition[zoneinfo.ZoneInfo(key='US/Pacific')-10min-date_pair1]", diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index 6357680b368..d5a9ea50317 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -697,6 +697,11 @@ def test_to_dense_array(): assert filled.size == len(sr) +def test_series_np_array_all_nan_object_raises(): + with pytest.raises(MixedTypeError): + cudf.Series(np.array([np.nan, np.nan], dtype=object)) + + @pytest.mark.parametrize( "ps", [ From b65d6ee26d384cd1adac438fed3a99ca24d61c79 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 5 Aug 2025 10:35:48 -0400 Subject: [PATCH 055/366] Update cudf to handle CUDA 13 changes (#19585) Required changes to support CUDA 13 Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19585 --- cpp/src/io/json/write_json.cu | 5 +++-- cpp/src/transform/row_bit_count.cu | 9 +++++++-- cpp/src/utilities/prefetch.cpp | 11 ++++++++++- python/cudf/udf_cpp/CMakeLists.txt | 8 ++++++-- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 01f9df2f4cc..e83ad15720b 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -714,9 +714,10 @@ struct column_to_strings_fn { host_span children_names) const requires(std::is_same_v) { + auto structs_view = structs_column_view{column}; auto const child_it = cudf::detail::make_counting_transform_iterator( - 0, [&stream = stream_, structs_view = structs_column_view{column}](auto const child_idx) { - return structs_view.get_sliced_child(child_idx, stream); + 0, [&stream = stream_, &s_v = structs_view](auto const child_idx) { + return s_v.get_sliced_child(child_idx, stream); }); auto col_string = operator()(child_it, child_it + column.num_children(), diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index ded16ef958d..c941d6bfaeb 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -230,8 +230,13 @@ struct flatten_functor { info.push_back({cur_depth, branch_depth_start, branch_depth_end}); lists_column_view lcv(col); - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [col = lcv.get_sliced_child(stream)](auto) { return col; }); + auto sliced_child = lcv.get_sliced_child(stream); + + // We don't pass sliced_child by value as that will generate + // invocation of a host function ( ~column_view() ) in a host/device + // context when compiling with CUDA 13 + auto iter = + cudf::detail::make_counting_transform_iterator(0, [&](auto) { return sliced_child; }); h_info.complex_type_count++; flatten_hierarchy( diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp index 6c9f677afb3..be28d54214e 100644 --- a/cpp/src/utilities/prefetch.cpp +++ 
b/cpp/src/utilities/prefetch.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,7 +69,16 @@ cudaError_t prefetch_noexcept(std::string_view key, std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr << std::endl; } + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 13000 + cudaMemLocation location{ + (device_id.value() == cudaCpuDeviceId) ? cudaMemLocationTypeHost : cudaMemLocationTypeDevice, + device_id.value()}; + constexpr int flags = 0; + auto result = cudaMemPrefetchAsync(ptr, size, location, flags, stream.value()); +#else auto result = cudaMemPrefetchAsync(ptr, size, device_id.value(), stream.value()); +#endif // Need to flush the CUDA error so that the context is not corrupted. if (result == cudaErrorInvalidValue) { cudaGetLastError(); } return result; diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index 0c12a022f22..2f7d332b877 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -74,9 +74,13 @@ install(TARGETS cudf_strings_udf DESTINATION ./cudf/_lib/) # Create the shim library for each architecture. set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) -# always build a default PTX file in case RAPIDS_NO_INITIALIZE is set and the device cc can't be +# always build a default architecture in case RAPIDS_NO_INITIALIZE is set and the device cc can't be # safely queried through a context -list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "70") +if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0.0) + list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "75") +else() + list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "70") +endif() list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-real" "") list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-virtual" "") From eff56237a3927b01ec09f727f51cf5667c257545 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Aug 2025 10:52:06 -0700 Subject: [PATCH 056/366] Use more pytest fixtures and avoid GPU parameterization in test_replace/reshape/rolling.py (#19426) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids `pytest.mark.parametrize` with GPU objects Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19426 --- python/cudf/cudf/tests/test_replace.py | 122 ++++++++++++++----------- python/cudf/cudf/tests/test_reshape.py | 18 ++-- python/cudf/cudf/tests/test_rolling.py | 19 ++-- 3 files changed, 85 insertions(+), 74 deletions(-) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index 8ea0d205e8b..b1efcef5d1e 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
import operator import re @@ -27,11 +27,11 @@ @pytest.mark.parametrize( - "gsr", + "gsr_data, dtype", [ - cudf.Series([5, 1, 2, 3, None, 243, None, 4]), - cudf.Series(["one", "two", "three", None, "one"], dtype="category"), - cudf.Series([*list(range(400)), None]), + [[5, 1, 2, 3, None, 243, None, 4], None], + [["one", "two", "three", None, "one"], "category"], + [[*list(range(400)), None], None], ], ) @pytest.mark.parametrize( @@ -49,7 +49,8 @@ (np.inf, 4), ], ) -def test_series_replace_all(gsr, to_replace, value): +def test_series_replace_all(gsr_data, dtype, to_replace, value): + gsr = cudf.Series(gsr_data, dtype=dtype) psr = gsr.to_pandas() gd_to_replace = to_replace @@ -183,28 +184,30 @@ def test_series_replace_with_nulls(): reason="warning introduced in pandas-2.2.0", ) @pytest.mark.parametrize( - "df", + "data, dtype", [ - cudf.DataFrame( + ( { "a": [0, 1, None, 2, 3], "b": [3, 2, 2, 3, None], "c": ["abc", "def", ".", None, None], - } + }, + None, ), - cudf.DataFrame( + ( { "a": ["one", "two", None, "three"], "b": ["one", None, "two", "three"], }, - dtype="category", + "category", ), - cudf.DataFrame( + ( { "col one": [None, 10, 11, None, 1000, 500, 600], "col two": ["abc", "def", "ghi", None, "pp", None, "a"], "a": [0.324, 0.234, 324.342, 23.32, 9.9, None, None], - } + }, + None, ), ], ) @@ -245,8 +248,8 @@ def test_series_replace_with_nulls(): ), ], ) -def test_dataframe_replace(df, to_replace, value): - gdf = df +def test_dataframe_replace(data, dtype, to_replace, value): + gdf = cudf.DataFrame(data, dtype=dtype) pdf = gdf.to_pandas() pd_value = value @@ -262,7 +265,7 @@ def test_dataframe_replace(df, to_replace, value): gd_to_replace = to_replace can_warn = ( - isinstance(df["a"].dtype, cudf.CategoricalDtype) + isinstance(gdf["a"].dtype, cudf.CategoricalDtype) and isinstance(to_replace, str) and to_replace == "two" and isinstance(value, str) @@ -394,26 +397,29 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): @pytest.mark.parametrize( - "gsr_data", + "gsr_data, dtype", [ - cudf.Series(["2.34", "5.2", "7.47", None, "92.29", None]).astype( - Decimal64Dtype(7, 2) + (["2.34", "5.2", "7.47", None, "92.29", None], Decimal64Dtype(7, 2)), + ( + ["-74.56", None, "-23.73", "34.55", "2.89", None], + Decimal32Dtype(7, 2), ), - cudf.Series(["-74.56", None, "-23.73", "34.55", "2.89", None]).astype( - Decimal32Dtype(7, 2) + ( + ["85.955", np.nan, "-3.243", np.nan, "29.492", np.nan], + Decimal64Dtype(8, 3), + ), + ( + ["2.964", None, "57.432", "-989.330", None, "56.444"], + Decimal64Dtype(8, 3), + ), + ( + [np.nan, "55.2498", np.nan, "-5.2965", "-28.9423", np.nan], + Decimal64Dtype(10, 4), + ), + ( + ["2.964", None, "54347.432", "-989.330", None, "56.444"], + Decimal128Dtype(20, 7), ), - cudf.Series( - ["85.955", np.nan, "-3.243", np.nan, "29.492", np.nan] - ).astype(Decimal64Dtype(8, 3)), - cudf.Series( - ["2.964", None, "57.432", "-989.330", None, "56.444"] - ).astype(Decimal64Dtype(8, 3)), - cudf.Series( - [np.nan, "55.2498", np.nan, "-5.2965", "-28.9423", np.nan] - ).astype(Decimal64Dtype(10, 4)), - cudf.Series( - ["2.964", None, "54347.432", "-989.330", None, "56.444"] - ).astype(Decimal128Dtype(20, 7)), ], ) @pytest.mark.parametrize( @@ -433,8 +439,8 @@ def test_fillna_method_numerical(data, container, data_dtype, method, inplace): ], ) @pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_decimal(gsr_data, fill_value, inplace): - gsr = gsr_data.copy(deep=True) +def test_fillna_decimal(gsr_data, dtype, fill_value, inplace): + gsr = 
cudf.Series(gsr_data).astype(dtype) psr = gsr.to_pandas() if isinstance(fill_value, cudf.Series): @@ -1306,67 +1312,73 @@ def test_series_replace_errors(): "gsr,old,new,expected", [ ( - cudf.Series(["a", "b", "c", None]), + lambda: cudf.Series(["a", "b", "c", None]), None, "a", - cudf.Series(["a", "b", "c", "a"]), + lambda: cudf.Series(["a", "b", "c", "a"]), ), ( - cudf.Series(["a", "b", "c", None]), + lambda: cudf.Series(["a", "b", "c", None]), [None, "a", "a"], ["c", "b", "d"], - cudf.Series(["d", "b", "c", "c"]), + lambda: cudf.Series(["d", "b", "c", "c"]), ), ( - cudf.Series(["a", "b", "c", None]), + lambda: cudf.Series(["a", "b", "c", None]), [None, "a"], ["b", None], - cudf.Series([None, "b", "c", "b"]), + lambda: cudf.Series([None, "b", "c", "b"]), ), ( - cudf.Series(["a", "b", "c", None]), + lambda: cudf.Series(["a", "b", "c", None]), [None, None], [None, None], - cudf.Series(["a", "b", "c", None]), + lambda: cudf.Series(["a", "b", "c", None]), + ), + ( + lambda: cudf.Series([1, 2, None, 3]), + None, + 10, + lambda: cudf.Series([1, 2, 10, 3]), ), - (cudf.Series([1, 2, None, 3]), None, 10, cudf.Series([1, 2, 10, 3])), ( - cudf.Series([1, 2, None, 3]), + lambda: cudf.Series([1, 2, None, 3]), [None, 1, 1], [3, 2, 4], - cudf.Series([4, 2, 3, 3]), + lambda: cudf.Series([4, 2, 3, 3]), ), ( - cudf.Series([1, 2, None, 3]), + lambda: cudf.Series([1, 2, None, 3]), [None, 1], [2, None], - cudf.Series([None, 2, 2, 3]), + lambda: cudf.Series([None, 2, 2, 3]), ), ( - cudf.Series(["a", "q", "t", None], dtype="category"), + lambda: cudf.Series(["a", "q", "t", None], dtype="category"), None, "z", - cudf.Series(["a", "q", "t", "z"], dtype="category"), + lambda: cudf.Series(["a", "q", "t", "z"], dtype="category"), ), ( - cudf.Series(["a", "q", "t", None], dtype="category"), + lambda: cudf.Series(["a", "q", "t", None], dtype="category"), [None, "a", "q"], ["z", None, None], - cudf.Series([None, None, "t", "z"], dtype="category"), + lambda: cudf.Series([None, None, "t", "z"], dtype="category"), ), ( - cudf.Series(["a", None, "t", None], dtype="category"), + lambda: cudf.Series(["a", None, "t", None], dtype="category"), [None, "t"], ["p", None], - cudf.Series(["a", "p", None, "p"], dtype="category"), + lambda: cudf.Series(["a", "p", None, "p"], dtype="category"), ), ], ) def test_replace_nulls(gsr, old, new, expected): + gsr = gsr() with expect_warning_if(isinstance(gsr.dtype, cudf.CategoricalDtype)): actual = gsr.replace(old, new) assert_eq( - expected.sort_values().reset_index(drop=True), + expected().sort_values().reset_index(drop=True), actual.sort_values().reset_index(drop=True), ) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b13b0db2679..84cf6136255 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -667,10 +667,6 @@ def test_unstack_multiindex(level): ) -@pytest.mark.parametrize( - "data", - [{"A": [1.0, 2.0, 3.0, 4.0, 5.0], "B": [11.0, 12.0, 13.0, 14.0, 15.0]}], -) @pytest.mark.parametrize( "index", [ @@ -695,7 +691,11 @@ def test_unstack_multiindex(level): ), ], ) -def test_unstack_index(data, index, col_idx): +def test_unstack_index(index, col_idx): + data = { + "A": [1.0, 2.0, 3.0, 4.0, 5.0], + "B": [11.0, 12.0, 13.0, 14.0, 15.0], + } pdf = pd.DataFrame(data) gdf = cudf.from_pandas(pdf) @@ -733,9 +733,9 @@ def test_pivot_duplicate_error(): @pytest.mark.parametrize( "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] ) -@pytest.mark.parametrize("fill_value", [0]) -def 
test_pivot_table_simple(aggfunc, fill_value): +def test_pivot_table_simple(aggfunc): rng = np.random.default_rng(seed=0) + fill_value = 0 pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, @@ -768,9 +768,9 @@ def test_pivot_table_simple(aggfunc, fill_value): @pytest.mark.parametrize( "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] ) -@pytest.mark.parametrize("fill_value", [0]) -def test_dataframe_pivot_table_simple(aggfunc, fill_value): +def test_dataframe_pivot_table_simple(aggfunc): rng = np.random.default_rng(seed=0) + fill_value = 0 pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 87b509ce22b..30958f29a8f 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -102,13 +102,13 @@ def test_rolling_dataframe_basic(data, agg, nulls, center): @pytest.mark.parametrize( "agg", [ - pytest.param("sum"), - pytest.param("min"), - pytest.param("max"), - pytest.param("mean"), - pytest.param("count"), - pytest.param("std"), - pytest.param("var"), + "sum", + "min", + "max", + "mean", + "count", + "std", + "var", ], ) def test_rolling_with_offset(agg): @@ -134,9 +134,8 @@ def test_rolling_with_offset(agg): @pytest.mark.parametrize("agg", ["std", "var"]) @pytest.mark.parametrize("ddof", [0, 1]) @pytest.mark.parametrize("center", [True, False]) -@pytest.mark.parametrize("seed", [100, 2000]) @pytest.mark.parametrize("window_size", [2, 10, 100]) -def test_rolling_var_std_large(agg, ddof, center, seed, window_size): +def test_rolling_var_std_large(agg, ddof, center, window_size): iupper_bound = math.sqrt(np.iinfo(np.int64).max / window_size) ilower_bound = -math.sqrt(abs(np.iinfo(np.int64).min) / window_size) @@ -170,7 +169,7 @@ def test_rolling_var_std_large(agg, ddof, center, seed, window_size): ], rows=n_rows, use_threads=False, - seed=seed, + seed=100, ) pdf = data.to_pandas() gdf = cudf.from_pandas(pdf) From cec71d1ae9f9828dc8719f2b9a34d700daf59835 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:17:58 -0700 Subject: [PATCH 057/366] Use more pytest fixtures and avoid GPU parameterization in test_groupby/index.py (#19438) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids pytest.mark.parametrize with GPU objects * Eliminate/reduce parameterizations of input size Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19438 --- python/cudf/cudf/tests/test_groupby.py | 176 ++++----- python/cudf/cudf/tests/test_index.py | 528 +++++++++---------------- 2 files changed, 268 insertions(+), 436 deletions(-) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 898b056f263..f332fb37e56 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -106,8 +106,8 @@ def pdf(gdf): return gdf.to_pandas() -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def test_groupby_mean(nelem): +def test_groupby_mean(): + nelem = 20 got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).mean() expect_df = ( make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() @@ -115,8 +115,8 @@ def test_groupby_mean(nelem): assert_groupby_results_equal(got_df, expect_df) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def 
test_groupby_mean_3level(nelem): +def test_groupby_mean_3level(): + nelem = 20 lvls = "z" bys = list("xyz") got_df = ( @@ -132,8 +132,8 @@ def test_groupby_mean_3level(nelem): assert_groupby_results_equal(got_df, expect_df) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def test_groupby_agg_mean_min(nelem): +def test_groupby_agg_mean_min(): + nelem = 20 got_df = ( make_frame(DataFrame, nelem=nelem) .groupby(["x", "y"]) @@ -147,8 +147,8 @@ def test_groupby_agg_mean_min(nelem): assert_groupby_results_equal(got_df, expect_df) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def test_groupby_agg_min_max_dictargs(nelem): +def test_groupby_agg_min_max_dictargs(): + nelem = 20 expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") .groupby(["x", "y"]) @@ -162,8 +162,8 @@ def test_groupby_agg_min_max_dictargs(nelem): assert_groupby_results_equal(expect_df, got_df) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def test_groupby_agg_min_max_dictlist(nelem): +def test_groupby_agg_min_max_dictlist(): + nelem = 20 expect_df = ( make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") .groupby(["x", "y"]) @@ -334,23 +334,24 @@ def foo(df): assert_groupby_results_equal(expect, got) -def create_test_groupby_apply_args_params(): - def f1(df, k): - df["out"] = df["val1"] + df["val2"] + k - return df +def f1(df, k): + df["out"] = df["val1"] + df["val2"] + k + return df - def f2(df, k, L): - df["out"] = df["val1"] - df["val2"] + (k / L) - return df - def f3(df, k, L, m): - df["out"] = ((k * df["val1"]) + (L * df["val2"])) / m - return df +def f2(df, k, L): + df["out"] = df["val1"] - df["val2"] + (k / L) + return df - return [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] +def f3(df, k, L, m): + df["out"] = ((k * df["val1"]) + (L * df["val2"])) / m + return df -@pytest.mark.parametrize("func,args", create_test_groupby_apply_args_params()) + +@pytest.mark.parametrize( + "func,args", [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] +) @pytest.mark.skipif( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="Fails in older versions of pandas", @@ -621,7 +622,6 @@ def test_groupby_apply_jit_reductions_special_vals( ) -@pytest.mark.parametrize("dtype", ["float64"]) @pytest.mark.parametrize("func", ["idxmax", "idxmin"]) @pytest.mark.parametrize( "special_val", @@ -642,21 +642,20 @@ def test_groupby_apply_jit_reductions_special_vals( reason="include_groups keyword new in pandas 2.2", ) def test_groupby_apply_jit_idx_reductions_special_vals( - func, dtype, dataset, groupby_jit_datasets, special_val + func, dataset, groupby_jit_datasets, special_val ): dataset = groupby_jit_datasets[dataset].copy(deep=True) groupby_apply_jit_idx_reductions_special_vals_inner( - func, dataset, dtype, special_val + func, dataset, "float64", special_val ) -@pytest.mark.parametrize("dtype", ["int32"]) @pytest.mark.skipif( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="Fails in older versions of pandas", ) -def test_groupby_apply_jit_sum_integer_overflow(dtype): - max = np.iinfo(dtype).max +def test_groupby_apply_jit_sum_integer_overflow(): + max = np.iinfo("int32").max data = DataFrame( { @@ -821,21 +820,20 @@ def test_groupby_apply_jit_basic(func, groupby_jit_data_small): run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) -def create_test_groupby_apply_jit_args_params(): - def f1(df, k): - return df["val1"].max() + df["val2"].min() + k +def f1(df, k): + return df["val1"].max() + df["val2"].min() + k + - def f2(df, k, L): - return df["val1"].sum() - 
df["val2"].var() + (k / L) +def f2(df, k, L): + return df["val1"].sum() - df["val2"].var() + (k / L) - def f3(df, k, L, m): - return ((k * df["val1"].mean()) + (L * df["val2"].std())) / m - return [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] +def f3(df, k, L, m): + return ((k * df["val1"].mean()) + (L * df["val2"].std())) / m @pytest.mark.parametrize( - "func,args", create_test_groupby_apply_jit_args_params() + "func,args", [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] ) @pytest.mark.skipif( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, @@ -1002,7 +1000,6 @@ def pdf_func(df): assert_groupby_results_equal(expect, got) -@pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @pytest.mark.parametrize( "func", [ @@ -1018,8 +1015,9 @@ def pdf_func(df): "prod", ], ) -def test_groupby_2keys_agg(nelem, func): +def test_groupby_2keys_agg(func): # gdf (Note: lack of multiIndex) + nelem = 20 expect_df = ( make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) ) @@ -1029,8 +1027,8 @@ def test_groupby_2keys_agg(nelem, func): assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) -@pytest.mark.parametrize("num_groups", [2, 3, 10, 50, 100]) -@pytest.mark.parametrize("nelem_per_group", [1, 10, 100]) +@pytest.mark.parametrize("num_groups", [2, 20]) +@pytest.mark.parametrize("nelem_per_group", [1, 10]) @pytest.mark.parametrize( "func", ["min", "max", "count", "sum"], @@ -1777,14 +1775,16 @@ def test_groupby_cumcount(index): ) -@pytest.mark.parametrize("nelem", [2, 3, 1000]) @pytest.mark.parametrize("as_index", [True, False]) @pytest.mark.parametrize( "agg", ["min", "max", "idxmin", "idxmax", "mean", "count"] ) -def test_groupby_datetime(nelem, as_index, agg): +def test_groupby_datetime(request, as_index, agg): + nelem = 20 if agg == "mean" and as_index is True: - return + request.applymarker( + pytest.mark.xfail(reason="Invalid type/aggregation combination") + ) check_dtype = agg not in ("mean", "count", "idxmin", "idxmax") pdf = make_frame(pd.DataFrame, nelem=nelem, with_datetime=True) gdf = make_frame(cudf.DataFrame, nelem=nelem, with_datetime=True) @@ -2457,7 +2457,11 @@ def test_groupby_nonempty_no_keys(pdf): @pytest.mark.parametrize( "by,data", [ - # ([], []), # error? 
+ pytest.param( + [], + [], + marks=pytest.mark.xfail(reason="dtype always cast to object"), + ), ([1, 1, 2, 2], [0, 0, 1, 1]), ([1, 2, 3, 4], [0, 0, 0, 0]), ([1, 2, 1, 2], [0, 1, 1, 1]), @@ -2477,11 +2481,11 @@ def test_groupby_unique(by, data, dtype): assert_groupby_results_equal(expect, got) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) @pytest.mark.parametrize( "func", ["cummin", "cummax", "cumcount", "cumsum", "cumprod"] ) -def test_groupby_2keys_scan(nelem, func): +def test_groupby_2keys_scan(func): + nelem = 20 pdf = make_frame(pd.DataFrame, nelem=nelem) expect_df = pdf.groupby(["x", "y"], sort=True).agg(func) gdf = cudf.from_pandas(pdf) @@ -2506,12 +2510,12 @@ def test_groupby_2keys_scan(nelem, func): assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) -@pytest.mark.parametrize("nelem", [100, 1000]) @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) @pytest.mark.parametrize("pct", [False, True]) -def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): +def test_groupby_2keys_rank(method, ascending, na_option, pct): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2592,11 +2596,11 @@ def test_groupby_mix_agg_scan(): gb.agg(func) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) @pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) @pytest.mark.parametrize("direction", [1, -1]) @pytest.mark.parametrize("fill_value", [None, np.nan, 42]) -def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): +def test_groupby_shift_row(shift_perc, direction, fill_value): + nelem = 20 pdf = make_frame(pd.DataFrame, nelem=nelem, extra_vals=["val2"]) gdf = cudf.from_pandas(pdf) n_shift = int(nelem * shift_perc) * direction @@ -2611,7 +2615,6 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): ) -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) @pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) @pytest.mark.parametrize("direction", [1, -1]) @pytest.mark.parametrize( @@ -2632,9 +2635,8 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): ), ], ) -def test_groupby_shift_row_mixed_numerics( - nelem, shift_perc, direction, fill_value -): +def test_groupby_shift_row_mixed_numerics(shift_perc, direction, fill_value): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2669,10 +2671,10 @@ def test_groupby_shift_row_mixed_numerics( # TODO: Shifting list columns is currently unsupported because we cannot # construct a null list scalar in python. Support once it is added. 
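# The hunks that follow drop input-size parametrization in favor of a
# single representative size, since the extra sizes multiplied runtime
# without reaching new code paths. Where several tests need the same
# size, a shared fixture is an option too; a minimal sketch, with a
# hypothetical fixture name:
import pytest

@pytest.fixture
def nelem():
    # one representative size instead of [10, 50, 100, 1000]
    return 20

def test_consumes_shared_size(nelem):
    assert nelem == 20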
-@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) @pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) @pytest.mark.parametrize("direction", [1, -1]) -def test_groupby_shift_row_mixed(nelem, shift_perc, direction): +def test_groupby_shift_row_mixed(shift_perc, direction): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2705,7 +2707,6 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction): ) -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) @pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) @pytest.mark.parametrize("direction", [1, -1]) @pytest.mark.parametrize( @@ -2719,9 +2720,8 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction): ] ], ) -def test_groupby_shift_row_mixed_fill( - nelem, shift_perc, direction, fill_value -): +def test_groupby_shift_row_mixed_fill(shift_perc, direction, fill_value): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2763,9 +2763,9 @@ def test_groupby_shift_row_mixed_fill( ) -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) @pytest.mark.parametrize("fill_value", [None, 0, 42]) -def test_groupby_shift_row_zero_shift(nelem, fill_value): +def test_groupby_shift_row_zero_shift(fill_value): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2796,10 +2796,10 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value): ) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) @pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) @pytest.mark.parametrize("direction", [1, -1]) -def test_groupby_diff_row(nelem, shift_perc, direction): +def test_groupby_diff_row(shift_perc, direction): + nelem = 20 pdf = make_frame(pd.DataFrame, nelem=nelem, extra_vals=["val2"]) gdf = cudf.from_pandas(pdf) n_shift = int(nelem * shift_perc) * direction @@ -2812,10 +2812,10 @@ def test_groupby_diff_row(nelem, shift_perc, direction): ) -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) @pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) @pytest.mark.parametrize("direction", [1, -1]) -def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction): +def test_groupby_diff_row_mixed_numerics(shift_perc, direction): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2849,8 +2849,8 @@ def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction): ) -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) -def test_groupby_diff_row_zero_shift(nelem): +def test_groupby_diff_row_zero_shift(): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2881,12 +2881,12 @@ def test_groupby_diff_row_zero_shift(nelem): ) -@pytest.mark.parametrize("nelem", [10, 100, 1000]) @pytest.mark.skipif( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="warning not present in older pandas versions", ) -def test_groupby_fillna_multi_value(nelem): +def test_groupby_fillna_multi_value(): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -2930,12 +2930,12 @@ def test_groupby_fillna_multi_value(nelem): # TODO: cudf.fillna does not support decimal column to column fill yet -@pytest.mark.parametrize("nelem", [10, 100, 1000]) @pytest.mark.skipif( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="warning not present in older pandas versions", ) -def 
test_groupby_fillna_multi_value_df(nelem): +def test_groupby_fillna_multi_value_df(): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -3010,9 +3010,9 @@ def test_groupby_various_by_fillna(by, data, args): PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="warning not present in older pandas versions", ) -@pytest.mark.parametrize("nelem", [10, 100, 1000]) @pytest.mark.parametrize("method", ["ffill", "bfill"]) -def test_groupby_fillna_method(nelem, method): +def test_groupby_fillna_method(method): + nelem = 20 t = rand_dataframe( dtypes_meta=[ {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, @@ -3502,20 +3502,6 @@ def test_groupby_group_keys(group_keys, by): assert_eq(actual, expected) -@pytest.fixture -def df_ngroup(): - df = cudf.DataFrame( - { - "a": [2, 2, 1, 1, 2, 3], - "b": [1, 2, 1, 2, 1, 2], - "c": ["a", "a", "b", "c", "d", "c"], - }, - index=[1, 3, 5, 7, 4, 2], - ) - df.index.name = "foo" - return df - - @pytest.mark.parametrize( "by", [ @@ -3528,7 +3514,16 @@ def df_ngroup(): ], ) @pytest.mark.parametrize("ascending", [True, False]) -def test_groupby_ngroup(by, ascending, df_ngroup): +def test_groupby_ngroup(by, ascending): + df_ngroup = cudf.DataFrame( + { + "a": [2, 2, 1, 1, 2, 3], + "b": [1, 2, 1, 2, 1, 2], + "c": ["a", "a", "b", "c", "d", "c"], + }, + index=[1, 3, 5, 7, 4, 2], + ) + df_ngroup.index.name = "foo" by = by() expected = df_ngroup.to_pandas().groupby(by).ngroup(ascending=ascending) actual = df_ngroup.groupby(by).ngroup(ascending=ascending) @@ -3928,7 +3923,6 @@ def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): assert_groupby_results_equal( actual, expected, - check_names=False, check_index_type=False, as_index=as_index, by=["gender", "education"], diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index e7e702a0b8c..9ee55790b73 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -25,12 +25,9 @@ from cudf.testing import assert_eq from cudf.testing._utils import ( ALL_TYPES, - FLOAT_TYPES, NUMERIC_TYPES, OTHER_TYPES, SERIES_OR_INDEX_NAMES, - SIGNED_INTEGER_TYPES, - UNSIGNED_TYPES, assert_column_memory_eq, assert_column_memory_ne, assert_exceptions_equal, @@ -309,121 +306,51 @@ def test_set_index_as_property(): assert_eq(head.index, idx[:5]) -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_range(name, deep=True): - cidx = cudf.RangeIndex(1, 5) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_datetime(name, deep=True): - cidx = cudf.DatetimeIndex(["2001", "2002", "2003"]) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_string(name, deep=True): - cidx = cudf.Index(["a", "b", "c"]) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_integer(name, deep=True): - """Test for NumericIndex Copy Casts""" - cidx = cudf.Index([1, 2, 3]) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - 
assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_float(name, deep=True): - """Test for NumericIndex Copy Casts""" - cidx = cudf.Index([1.0, 2.0, 3.0]) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_category(name, deep=True): - cidx = cudf.core.index.CategoricalIndex([1, 2, 3]) +@pytest.mark.parametrize( + "data", + [ + range(1, 5), + [1, 2, 3, 4], + pd.DatetimeIndex(["2001", "2002", "2003"]), + ["a", "b", "c"], + pd.CategoricalIndex(["a", "b", "c"]), + ], +) +@pytest.mark.parametrize("deep", [True, False]) +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_index_copy(data, deep, copy_on_write): + name = "x" + cidx = cudf.Index(data) pidx = cidx.to_pandas() pidx_copy = pidx.copy(name=name, deep=deep) cidx_copy = cidx.copy(name=name, deep=deep) - assert_column_memory_ne(cidx._column, cidx_copy._column) assert_eq(pidx_copy, cidx_copy) + with cudf.option_context("copy_on_write", copy_on_write): + if not isinstance(cidx, cudf.RangeIndex): + if ( + isinstance(cidx._column, cudf.core.column.StringColumn) + or not deep + or (copy_on_write and not deep) + ): + # StringColumn is immutable hence, deep copies of a + # Index with string dtype will share the same StringColumn. -@pytest.mark.parametrize("deep", [True, False]) -@pytest.mark.parametrize( - "idx", - [ - cudf.DatetimeIndex(["2001", "2002", "2003"]), - cudf.Index(["a", "b", "c"]), - cudf.Index([1, 2, 3]), - cudf.Index([1.0, 2.0, 3.0]), - cudf.CategoricalIndex([1, 2, 3]), - cudf.CategoricalIndex(["a", "b", "c"]), - ], -) -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_index_copy_deep(idx, deep, copy_on_write): - """Test if deep copy creates a new instance for device data.""" - idx_copy = idx.copy(deep=deep) - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", copy_on_write) - if ( - isinstance(idx._column, cudf.core.column.StringColumn) - or not deep - or (cudf.get_option("copy_on_write") and not deep) - ): - # StringColumn is immutable hence, deep copies of a - # Index with string dtype will share the same StringColumn. - - # When `copy_on_write` is turned on, Index objects will - # have unique column object but they all point to same - # data pointers. - assert_column_memory_eq(idx._column, idx_copy._column) - else: - assert_column_memory_ne(idx._column, idx_copy._column) - cudf.set_option("copy_on_write", original_cow_setting) + # When `copy_on_write` is turned on, Index objects will + # have unique column object but they all point to same + # data pointers. 
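# An aside on cudf.option_context, used in the rewritten test above: it
# restores the prior option value even when the body raises, unlike the
# set_option(...) / set_option(original) pairing it replaces, which
# leaked state whenever an assertion failed first. A minimal sketch
# (the simulated failure is illustrative only):
import cudf

original = cudf.get_option("copy_on_write")
try:
    with cudf.option_context("copy_on_write", True):
        assert cudf.get_option("copy_on_write")
        raise RuntimeError("simulated mid-test failure")
except RuntimeError:
    pass
assert cudf.get_option("copy_on_write") == original  # restored on exit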
+ assert_column_memory_eq(cidx._column, cidx_copy._column) + else: + assert_column_memory_ne(cidx._column, cidx_copy._column) -@pytest.mark.parametrize("idx", [[1, None, 3, None, 5]]) -def test_index_isna(idx): +def test_index_isna_notna(): + idx = [1, None, 3, None, 5] pidx = pd.Index(idx, name="idx") gidx = cudf.Index(idx, name="idx") assert_eq(gidx.isna(), pidx.isna()) - - -@pytest.mark.parametrize("idx", [[1, None, 3, None, 5]]) -def test_index_notna(idx): - pidx = pd.Index(idx, name="idx") - gidx = cudf.Index(idx, name="idx") assert_eq(gidx.notna(), pidx.notna()) @@ -1283,39 +1210,6 @@ def test_index_basic(data, dtype, name): assert_eq(pdi, gdi) -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("name", [1, "a", None]) -@pytest.mark.parametrize("dtype", SIGNED_INTEGER_TYPES) -def test_integer_index_apis(data, name, dtype): - pindex = pd.Index(data, dtype=dtype, name=name) - gindex = cudf.Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == dtype - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("name", [1, "a", None]) -@pytest.mark.parametrize("dtype", UNSIGNED_TYPES) -def test_unsigned_integer_index_apis(data, name, dtype): - pindex = pd.Index(data, dtype=dtype, name=name) - gindex = cudf.Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == dtype - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("name", [1, "a", None]) -@pytest.mark.parametrize("dtype", FLOAT_TYPES) -def test_float_index_apis(data, name, dtype): - pindex = pd.Index(data, dtype=dtype, name=name) - gindex = cudf.Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == dtype - - @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize("categories", [[1, 2], None]) @pytest.mark.parametrize( @@ -1716,12 +1610,11 @@ def test_index_set_names(idx, names, inplace): assert_eq(expected, actual) -@pytest.mark.parametrize("idx", [pd.Index([1, 2, 3], name="abc")]) @pytest.mark.parametrize("level", [1, [0], "abc"]) @pytest.mark.parametrize("names", [None, "a"]) -def test_index_set_names_error(idx, level, names): - pi = idx.copy() - gi = cudf.from_pandas(idx) +def test_index_set_names_error(level, names): + pi = pd.Index([1, 2, 3], name="abc") + gi = cudf.from_pandas(pi) assert_exceptions_equal( lfunc=pi.set_names, @@ -1732,13 +1625,12 @@ def test_index_set_names_error(idx, level, names): @pytest.mark.parametrize( - "idx", - [pd.Index([1, 3, 6]), pd.Index([6, 1, 3])], # monotonic # non-monotonic + "data", [[1, 3, 6], [6, 1, 3]], ids=["monotonic", "non-monotonic"] ) -@pytest.mark.parametrize("key", [list(range(0, 8))]) @pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) -def test_get_indexer_single_unique_numeric(idx, key, method): - pi = idx +def test_get_indexer_single_unique_numeric(data, method): + key = list(range(0, 8)) + pi = pd.Index(data) gi = cudf.from_pandas(pi) if ( @@ -1763,22 +1655,19 @@ def test_get_indexer_single_unique_numeric(idx, key, method): @pytest.mark.parametrize( - "idx", - [pd.RangeIndex(3, 100, 4)], -) -@pytest.mark.parametrize( - "key", + "rng", [ - list(range(1, 20, 3)), - list(range(20, 35, 3)), - list(range(35, 77, 3)), - list(range(77, 110, 3)), + range(1, 20, 3), + range(20, 35, 3), + range(35, 77, 3), + range(77, 110, 3), ], ) @pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) @pytest.mark.parametrize("tolerance", [None, 0, 1, 13, 20]) 
-def test_get_indexer_rangeindex(idx, key, method, tolerance): - pi = idx +def test_get_indexer_rangeindex(rng, method, tolerance): + key = list(rng) + pi = pd.RangeIndex(3, 100, 4) gi = cudf.from_pandas(pi) expected = pi.get_indexer( @@ -1797,13 +1686,9 @@ def test_get_indexer_rangeindex(idx, key, method, tolerance): assert_eq(expected, got, check_dtype=True) -@pytest.mark.parametrize( - "idx", - [pd.RangeIndex(3, 100, 4)], -) @pytest.mark.parametrize("key", list(range(1, 110, 3))) -def test_get_loc_rangeindex(idx, key): - pi = idx +def test_get_loc_rangeindex(key): + pi = pd.RangeIndex(3, 100, 4) gi = cudf.from_pandas(pi) if ( (key not in pi) @@ -1828,14 +1713,15 @@ def test_get_loc_rangeindex(idx, key): @pytest.mark.parametrize( "idx", [ - pd.Index([1, 3, 3, 6]), # monotonic increasing - pd.Index([6, 1, 3, 3]), # non-monotonic - pd.Index([4, 3, 2, 1, 0]), # monotonic decreasing + [1, 3, 3, 6], + [6, 1, 3, 3], + [4, 3, 2, 1, 0], ], + ids=["monotonic increasing", "non-monotonic", "monotonic decreasing"], ) @pytest.mark.parametrize("key", [0, 3, 6, 7, 4]) def test_get_loc_duplicate_numeric(idx, key): - pi = idx + pi = pd.Index(idx) gi = cudf.from_pandas(pi) if key not in pi: @@ -1855,15 +1741,16 @@ def test_get_loc_duplicate_numeric(idx, key): @pytest.mark.parametrize( "idx", [ - pd.Index([-1, 2, 3, 6]), # monotonic - pd.Index([6, 1, 3, 4]), # non-monotonic + [-1, 2, 3, 6], + [6, 1, 3, 4], ], + ids=["monotonic", "non-monotonic"], ) @pytest.mark.parametrize("key", [[0, 3, 1], [6, 7]]) @pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) @pytest.mark.parametrize("tolerance", [None, 1, 2]) def test_get_indexer_single_duplicate_numeric(idx, key, method, tolerance): - pi = idx + pi = pd.Index(idx) gi = cudf.from_pandas(pi) if not pi.is_monotonic_increasing and method is not None: @@ -1884,12 +1771,10 @@ def test_get_indexer_single_duplicate_numeric(idx, key, method, tolerance): assert_eq(expected, got) -@pytest.mark.parametrize( - "idx", [pd.Index(["b", "f", "m", "q"]), pd.Index(["m", "f", "b", "q"])] -) +@pytest.mark.parametrize("idx", [["b", "f", "m", "q"], ["m", "f", "b", "q"]]) @pytest.mark.parametrize("key", ["a", "f", "n", "z"]) def test_get_loc_single_unique_string(idx, key): - pi = idx + pi = pd.Index(idx) gi = cudf.from_pandas(pi) if key not in pi: @@ -1906,13 +1791,11 @@ def test_get_loc_single_unique_string(idx, key): assert_eq(expected, got) -@pytest.mark.parametrize( - "idx", [pd.Index(["b", "f", "m", "q"]), pd.Index(["m", "f", "b", "q"])] -) +@pytest.mark.parametrize("idx", [["b", "f", "m", "q"], ["m", "f", "b", "q"]]) @pytest.mark.parametrize("key", [["a", "f", "n", "z"], ["p", "p", "b"]]) @pytest.mark.parametrize("method", [None, "ffill", "bfill"]) def test_get_indexer_single_unique_string(idx, key, method): - pi = idx + pi = pd.Index(idx) gi = cudf.from_pandas(pi) if not pi.is_monotonic_increasing and method is not None: @@ -1929,12 +1812,10 @@ def test_get_indexer_single_unique_string(idx, key, method): assert_eq(expected, got) -@pytest.mark.parametrize( - "idx", [pd.Index(["b", "m", "m", "q"]), pd.Index(["m", "f", "m", "q"])] -) +@pytest.mark.parametrize("idx", [["b", "m", "m", "q"], ["m", "f", "m", "q"]]) @pytest.mark.parametrize("key", ["a", "f", "n", "z"]) def test_get_loc_single_duplicate_string(idx, key): - pi = idx + pi = pd.Index(idx) gi = cudf.from_pandas(pi) if key not in pi: @@ -1951,13 +1832,11 @@ def test_get_loc_single_duplicate_string(idx, key): assert_eq(expected, got) -@pytest.mark.parametrize( - "idx", [pd.Index(["b", "m", "m", "q"]), 
pd.Index(["a", "f", "m", "q"])] -) +@pytest.mark.parametrize("idx", [["b", "m", "m", "q"], ["a", "f", "m", "q"]]) @pytest.mark.parametrize("key", [["a"], ["f", "n", "z"]]) @pytest.mark.parametrize("method", [None, "ffill", "bfill"]) def test_get_indexer_single_duplicate_string(idx, key, method): - pi = idx + pi = pd.Index(idx) gi = cudf.from_pandas(pi) if ( @@ -1984,21 +1863,16 @@ def test_get_indexer_single_duplicate_string(idx, key, method): @pytest.mark.parametrize( - "idx", + "data", [ - pd.MultiIndex.from_tuples( - [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)] - ), - pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)] - ), - pd.MultiIndex.from_tuples( - [(1, 1, 1), (1, 1, 2), (1, 1, 2), (1, 2, 3), (2, 1, 1), (2, 2, 1)] - ), + [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)], + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)], + [(1, 1, 1), (1, 1, 2), (1, 1, 2), (1, 2, 3), (2, 1, 1), (2, 2, 1)], ], ) @pytest.mark.parametrize("key", [1, (1, 2), (1, 2, 3), (2, 1, 1), (9, 9, 9)]) -def test_get_loc_multi_numeric(idx, key): +def test_get_loc_multi_numeric(data, key): + idx = pd.MultiIndex.from_tuples(data) pi = idx.sort_values() gi = cudf.from_pandas(pi) @@ -2017,22 +1891,17 @@ def test_get_loc_multi_numeric(idx, key): @pytest.mark.parametrize( - "idx", + "data", [ - pd.MultiIndex.from_tuples( - [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)] - ), - pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)] - ), - pd.MultiIndex.from_tuples( - [(1, 1, 1), (1, 1, 2), (1, 1, 24), (1, 2, 3), (2, 1, 1), (2, 2, 1)] - ), + [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)], + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)], + [(1, 1, 1), (1, 1, 2), (1, 1, 24), (1, 2, 3), (2, 1, 1), (2, 2, 1)], ], ) @pytest.mark.parametrize("key", [[(1, 2, 3)], [(9, 9, 9)]]) @pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_multi_numeric(idx, key, method): +def test_get_indexer_multi_numeric(data, key, method): + idx = pd.MultiIndex.from_tuples(data) pi = idx.sort_values() gi = cudf.from_pandas(pi) @@ -2047,14 +1916,6 @@ def test_get_indexer_multi_numeric(idx, key, method): assert_eq(expected, got, check_dtype=True) -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 1), (1, 1, 1), (2, 2, 1)] - ) - ], -) @pytest.mark.parametrize( "key, result", [ @@ -2065,8 +1926,10 @@ def test_get_indexer_multi_numeric(idx, key, method): ((9, 9, 9), None), ], ) -def test_get_loc_multi_numeric_deviate(idx, key, result): - pi = idx +def test_get_loc_multi_numeric_deviate(key, result): + pi = pd.MultiIndex.from_tuples( + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 1), (1, 1, 1), (2, 2, 1)] + ) gi = cudf.from_pandas(pi) with expect_warning_if( @@ -2138,64 +2001,55 @@ def test_get_indexer_multi_error(method): @pytest.mark.parametrize( - "idx", + "data", [ - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", 
"a"), - ("a", "a", "b"), - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "b"), - ("b", "a", "a"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), + [ + ("a", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "b"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "a"), + ("a", "b", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "a"), + ("a", "a", "b"), + ("a", "a", "b"), + ("a", "b", "c"), + ("b", "a", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "b"), + ("b", "a", "a"), + ("b", "a", "a"), + ("a", "a", "a"), + ("a", "b", "a"), + ("b", "c", "a"), + ], ], ) @pytest.mark.parametrize( "key", ["a", ("a", "a"), ("a", "b", "c"), ("b", "c", "a"), ("z", "z", "z")] ) -def test_get_loc_multi_string(idx, key): +def test_get_loc_multi_string(data, key): + idx = pd.MultiIndex.from_tuples(data) pi = idx.sort_values() gi = cudf.from_pandas(pi) @@ -2214,45 +2068,40 @@ def test_get_loc_multi_string(idx, key): @pytest.mark.parametrize( - "idx", + "data", [ - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), + [ + ("a", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "b"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "a"), + ("a", "b", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("b", "c", "a"), + ], ], ) @pytest.mark.parametrize( "key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]] ) @pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_multi_string(idx, key, method): +def test_get_indexer_multi_string(data, key, method): + idx = pd.MultiIndex.from_tuples(data) pi = idx.sort_values() gi = cudf.from_pandas(pi) @@ -2672,8 +2521,9 @@ def test_isin_multiindex(data, values, level, err): ) -@pytest.fixture( - params=[ +@pytest.mark.parametrize( + "rangeindex", + [ range(np.random.default_rng(seed=0).integers(0, 100)), range(9, 12, 2), range(20, 30), @@ -2681,19 +2531,14 @@ def test_isin_multiindex(data, values, level, err): range(0, 10, -2), range(0, -10, 2), range(0, -10, -2), - ] + ], ) -def rangeindex(request): - """Create a cudf RangeIndex of different `nrows`""" - return cudf.RangeIndex(request.param) - - @pytest.mark.parametrize( "func", ["nunique", "min", "max", "any", "values"], ) def test_rangeindex_methods(rangeindex, func): - gidx = rangeindex + gidx = cudf.RangeIndex(rangeindex) pidx = gidx.to_pandas() if func == "values": @@ -2856,8 +2701,9 @@ def test_rangeindex_append_return_rangeindex(): assert_eq(result, expected) -@pytest.fixture( - params=[ +@pytest.mark.parametrize( + "index", + [ range(np.random.default_rng(seed=0).integers(0, 100)), range(0, 10, -2), range(0, -10, 2), @@ -2868,13 +2714,8 @@ def test_rangeindex_append_return_rangeindex(): [None, "a", "3.2", 
"z", None, None], pd.Series(["a", "b", None], dtype="category"), np.array([1, 2, 3, None], dtype="datetime64[s]"), - ] + ], ) -def index(request): - """Create a cudf Index of different dtypes""" - return cudf.Index(request.param) - - @pytest.mark.parametrize( "func", [ @@ -2885,7 +2726,7 @@ def index(request): ], ) def test_index_methods(index, func): - gidx = index + gidx = cudf.Index(index) pidx = gidx.to_pandas() if func == "append": @@ -3032,23 +2873,18 @@ def test_index_to_pandas_nullable(data, expected_dtype): assert_eq(pi, expected) -class TestIndexScalarGetItem: - @pytest.fixture( - params=[range(1, 10, 2), [1, 2, 3], ["a", "b", "c"], [1.5, 2.5, 3.5]] - ) - def index_values(self, request): - return request.param - - @pytest.fixture(params=[int, np.int8, np.int32, np.int64]) - def i(self, request): - return request.param(1) - - def test_scalar_getitem(self, index_values, i): - index = cudf.Index(index_values) +@pytest.mark.parametrize( + "index_values", + [range(1, 10, 2), [1, 2, 3], ["a", "b", "c"], [1.5, 2.5, 3.5]], +) +@pytest.mark.parametrize("i_type", [int, np.int8, np.int32, np.int64]) +def test_scalar_getitem(index_values, i_type): + i = i_type(1) + index = cudf.Index(index_values) - assert not isinstance(index[i], cudf.Index) - assert index[i] == index_values[i] - assert_eq(index, index.to_pandas()) + assert not isinstance(index[i], cudf.Index) + assert index[i] == index_values[i] + assert_eq(index, index.to_pandas()) @pytest.mark.parametrize( @@ -3197,14 +3033,15 @@ def test_from_pandas_rangeindex_return_rangeindex(): @pytest.mark.parametrize( - "idx", + "data", [ - cudf.RangeIndex(1), - cudf.DatetimeIndex(np.array([1, 2], dtype="datetime64[ns]")), - cudf.TimedeltaIndex(np.array([1, 2], dtype="timedelta64[ns]")), + range(1), + np.array([1, 2], dtype="datetime64[ns]"), + np.array([1, 2], dtype="timedelta64[ns]"), ], ) -def test_index_to_pandas_nullable_notimplemented(idx): +def test_index_to_pandas_nullable_notimplemented(data): + idx = cudf.Index(data) with pytest.raises(NotImplementedError): idx.to_pandas(nullable=True) @@ -3340,12 +3177,13 @@ def test_index_datetime_repeat(): @pytest.mark.parametrize( "index", [ - cudf.Index([1]), - cudf.RangeIndex(1), - cudf.MultiIndex(levels=[[0]], codes=[[0]]), + lambda: cudf.Index([1]), + lambda: cudf.RangeIndex(1), + lambda: cudf.MultiIndex(levels=[[0]], codes=[[0]]), ], ) def test_index_assignment_no_shallow_copy(index): + index = index() df = cudf.DataFrame(range(1)) df.index = index assert df.index is index From ff8c4b9927db5845b27ba1b7330340943a26068d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:21:00 -0700 Subject: [PATCH 058/366] Clean testing/_utils.py (#19506) Working to have less places where we define "testing utilities" e.g. 
`conftest.py` vs `testing/*.py` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19506 --- python/cudf/cudf/testing/_utils.py | 72 ------------------- .../cudf/tests/test_cuda_array_interface.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 3 +- python/cudf/cudf/tests/test_dlpack.py | 2 +- python/cudf/cudf/tests/test_udf_masked_ops.py | 46 +++++++++++- .../test_disable_pandas_accelerator.py | 28 ++++---- .../dask_cudf/tests/test_accessor.py | 3 +- 7 files changed, 62 insertions(+), 94 deletions(-) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index fa0bf52279e..41401ab4bde 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -3,7 +3,6 @@ import itertools import string -import time from collections import abc from contextlib import contextmanager from decimal import Decimal @@ -12,23 +11,11 @@ import numpy as np import pandas as pd import pytest -from numba.core.typing import signature as nb_signature -from numba.core.typing.templates import AbstractTemplate -from numba.cuda.cudadecl import registry as cuda_decl_registry -from numba.cuda.cudaimpl import lower as cuda_lower import pylibcudf as plc import cudf from cudf.core.column.column import as_column -from cudf.core.udf.strings_lowering import ( - cast_string_view_to_managed_udf_string, -) -from cudf.core.udf.strings_typing import ( - StringView, - managed_udf_string, - string_view, -) from cudf.utils import dtypes as dtypeutils from cudf.utils.temporal import unit_to_nanoseconds_conversion @@ -291,11 +278,6 @@ def _decimal_series(input, dtype): ) -@contextmanager -def does_not_raise(): - yield - - def assert_column_memory_eq(lhs: ColumnBase, rhs: ColumnBase): """Assert the memory location and size of `lhs` and `rhs` are equivalent. @@ -391,57 +373,3 @@ def expect_warning_if(condition, warning=FutureWarning, *args, **kwargs): yield else: yield - - -def sv_to_managed_udf_str(sv): - """ - Cast a string_view object to a managed_udf_string object - - This placeholder function never runs in python - It exists only for numba to have something to replace - with the typing and lowering code below - - This is similar conceptually to needing a translation - engine to emit an expression in target language "B" when - there is no equivalent in the source language "A" to - translate from. This function effectively defines the - expression in language "A" and the associated typing - and lowering describe the translation process, despite - the expression having no meaning in language "A" - """ - pass - - -@cuda_decl_registry.register_global(sv_to_managed_udf_str) -class StringViewToUDFStringDecl(AbstractTemplate): - def generic(self, args, kws): - if isinstance(args[0], StringView) and len(args) == 1: - return nb_signature(managed_udf_string, string_view) - - -@cuda_lower(sv_to_managed_udf_str, string_view) -def sv_to_udf_str_testing_lowering(context, builder, sig, args): - return cast_string_view_to_managed_udf_string( - context, builder, sig.args[0], sig.return_type, args[0] - ) - - -class cudf_timeout: - """ - Context manager to raise a TimeoutError after a specified number of seconds. 
- """ - - def __init__(self, timeout): - self.timeout = timeout - - def __enter__(self): - self.start_time = time.perf_counter() - - def __exit__(self, *args): - elapsed_time = ( - time.perf_counter() - self.start_time - ) # Calculate elapsed time - if elapsed_time >= self.timeout: - raise TimeoutError( - f"Expected to finish in {self.timeout=} seconds but took {elapsed_time=} seconds" - ) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 18067d4cf20..e163f62282b 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -1,7 +1,7 @@ # Copyright (c) 2019-2025, NVIDIA CORPORATION. import types -from contextlib import ExitStack as does_not_raise +from contextlib import nullcontext as does_not_raise import cupy import numba.cuda diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2671c0bf0f3..328d6fbca7b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -13,7 +13,7 @@ import textwrap import warnings from collections import OrderedDict, defaultdict, namedtuple -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext as does_not_raise from copy import copy import cupy @@ -40,7 +40,6 @@ DATETIME_TYPES, NUMERIC_TYPES, assert_exceptions_equal, - does_not_raise, expect_warning_if, gen_rand, ) diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index 37180871086..ffb33870323 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -1,7 +1,7 @@ # Copyright (c) 2019-2025, NVIDIA CORPORATION. import itertools -from contextlib import ExitStack as does_not_raise +from contextlib import nullcontext as does_not_raise import cupy import numpy as np diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 985766b59c7..958a3657abb 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -5,6 +5,10 @@ import numpy as np import pytest from numba import cuda +from numba.core.typing import signature as nb_signature +from numba.core.typing.templates import AbstractTemplate +from numba.cuda.cudadecl import registry as cuda_decl_registry +from numba.cuda.cudaimpl import lower as cuda_lower import cudf from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION @@ -16,15 +20,55 @@ unary_ops, ) from cudf.core.udf.api import Masked +from cudf.core.udf.strings_lowering import ( + cast_string_view_to_managed_udf_string, +) +from cudf.core.udf.strings_typing import ( + StringView, + managed_udf_string, + string_view, +) from cudf.core.udf.utils import precompiled from cudf.testing import assert_eq from cudf.testing._utils import ( _decimal_series, parametrize_numeric_dtypes_pairwise, - sv_to_managed_udf_str, ) +def sv_to_managed_udf_str(sv): + """ + Cast a string_view object to a managed_udf_string object + + This placeholder function never runs in python + It exists only for numba to have something to replace + with the typing and lowering code below + + This is similar conceptually to needing a translation + engine to emit an expression in target language "B" when + there is no equivalent in the source language "A" to + translate from. 
This function effectively defines the + expression in language "A" and the associated typing + and lowering describe the translation process, despite + the expression having no meaning in language "A" + """ + pass + + +@cuda_decl_registry.register_global(sv_to_managed_udf_str) +class StringViewToUDFStringDecl(AbstractTemplate): + def generic(self, args, kws): + if isinstance(args[0], StringView) and len(args) == 1: + return nb_signature(managed_udf_string, string_view) + + +@cuda_lower(sv_to_managed_udf_str, string_view) +def sv_to_udf_str_testing_lowering(context, builder, sig, args): + return cast_string_view_to_managed_udf_string( + context, builder, sig.args[0], sig.return_type, args[0] + ) + + @pytest.fixture(scope="module") def str_udf_data(): return cudf.DataFrame( diff --git a/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py b/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py index 7110da677f9..b4c2b3aff25 100644 --- a/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py +++ b/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py @@ -2,30 +2,26 @@ import os import subprocess +import sys import pytest -from cudf.testing import _utils as utils - @pytest.mark.flaky(reruns=3, reruns_delay=30) def test_disable_pandas_accelerator_multi_threaded(): data_directory = os.path.dirname(os.path.abspath(__file__)) - # Create a copy of the current environment variables - env = os.environ.copy() - with utils.cudf_timeout(20): - sp_completed = subprocess.run( - [ - "python", - "-m", - "cudf.pandas", - data_directory + "/data/disable_cudf_pandas_multi_thread.py", - ], - capture_output=True, - text=True, - env=env, - ) + sp_completed = subprocess.run( + [ + sys.executable, + "-m", + "cudf.pandas", + data_directory + "/data/disable_cudf_pandas_multi_thread.py", + ], + capture_output=True, + text=True, + timeout=20, + ) assert sp_completed.returncode == 0 output = sp_completed.stdout diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 8d69940ab84..13407f5d56f 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -1,5 +1,7 @@ # Copyright (c) 2019-2025, NVIDIA CORPORATION. 
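# On the test_disable_pandas_accelerator.py change above: passing
# timeout= to subprocess.run kills the child and raises
# subprocess.TimeoutExpired as soon as the budget is exceeded, whereas
# the removed cudf_timeout context manager could only measure and raise
# after the child had already run to completion. A minimal sketch (the
# sleeping child command is illustrative):
import subprocess
import sys

try:
    subprocess.run(
        [sys.executable, "-c", "import time; time.sleep(60)"],
        capture_output=True,
        text=True,
        timeout=1,  # aborts after ~1s instead of waiting the full 60s
    )
except subprocess.TimeoutExpired as exc:
    print(f"subprocess exceeded {exc.timeout}s and was terminated")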
+from contextlib import nullcontext as does_not_raise + import numpy as np import pandas as pd import pytest @@ -10,7 +12,6 @@ from cudf import DataFrame, Series, date_range from cudf.testing import assert_eq -from cudf.testing._utils import does_not_raise import dask_cudf From 4d2cd545e66bb56ce7460d71bdbae76db9a38047 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:24:27 -0700 Subject: [PATCH 059/366] Use more pytest fixtures and avoid GPU parameterization in test_query/rank/reduction/repr.py (#19434) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids pytest.mark.parametrize with GPU objects * Uses more context managers so state doesn't get altered if a test fails * `python/cudf/cudf/tests/test_query_mask.py` contained essentially the same test multiple times, so it was moved to `test_query.py` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19434 --- python/cudf/cudf/tests/test_query.py | 105 ++++---- python/cudf/cudf/tests/test_query_mask.py | 71 ------ python/cudf/cudf/tests/test_rank.py | 29 +-- python/cudf/cudf/tests/test_reductions.py | 22 +- python/cudf/cudf/tests/test_repr.py | 279 ++++++++++++---------- 5 files changed, 224 insertions(+), 282 deletions(-) delete mode 100644 python/cudf/cudf/tests/test_query_mask.py diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index fc30132458e..ddb8a7ffd37 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -3,7 +3,6 @@ import datetime import inspect -from itertools import product import numpy as np import pandas as pd @@ -14,13 +13,15 @@ from cudf.testing import assert_eq from cudf.utils import queryutils -_params_query_parser = [] -_params_query_parser.append(("a > @b", ("a", "__CUDF_ENVREF__b"))) -_params_query_parser.append(("(a + b) <= @c", ("a", "b", "__CUDF_ENVREF__c"))) -_params_query_parser.append(("a > b if a > 0 else b > a", ("a", "b"))) - -@pytest.mark.parametrize("text,expect_args", _params_query_parser) +@pytest.mark.parametrize( + "text,expect_args", + [ + ("a > @b", ("a", "__CUDF_ENVREF__b")), + ("(a + b) <= @c", ("a", "b", "__CUDF_ENVREF__c")), + ("a > b if a > 0 else b > a", ("a", "b")), + ], +) def test_query_parser(text, expect_args): info = queryutils.query_parser(text) fn = queryutils.query_builder(info, "myfoo") @@ -29,21 +30,18 @@ def test_query_parser(text, expect_args): assert tuple(argspec.args) == tuple(expect_args) -params_query_data = list(product([1, 2, 7, 8, 9, 16, 100, 129], range(2))) -params_query_fn = [ - (lambda a, b: a < b, "a < b"), - (lambda a, b: a * 2 >= b, "a * 2 >= b"), - (lambda a, b: 2 * (a + b) > (a + b) / 2, "2 * (a + b) > (a + b) / 2"), -] -nulls = [True, False] - - +@pytest.mark.parametrize("nelem", [1, 10]) @pytest.mark.parametrize( - "data,fn,nulls", product(params_query_data, params_query_fn, nulls) + "fn", + [ + (lambda a, b: a < b, "a < b"), + (lambda a, b: a * 2 >= b, "a * 2 >= b"), + (lambda a, b: 2 * (a + b) > (a + b) / 2, "2 * (a + b) > (a + b) / 2"), + ], ) -def test_query(data, fn, nulls): +@pytest.mark.parametrize("nulls", [True, False]) +def test_query(nelem, fn, nulls): # prepare - nelem, seed = data expect_fn, query_expr = fn rng = np.random.default_rng(seed=0) pdf = pd.DataFrame() @@ -55,21 +53,19 @@ def test_query(data, fn, nulls): assert_eq(pdf.query(query_expr), 
gdf.query(query_expr)) -params_query_env_fn = [ - (lambda a, b, c, d: a * c > b + d, "a * @c > b + @d"), - ( - lambda a, b, c, d: ((a / c) < d) | ((b**c) > d), - "((a / @c) < @d) | ((b ** @c) > @d)", - ), -] - - +@pytest.mark.parametrize("nelem", [1, 10]) @pytest.mark.parametrize( - "data,fn", product(params_query_data, params_query_env_fn) + "fn", + [ + (lambda a, b, c, d: a * c > b + d, "a * @c > b + @d"), + ( + lambda a, b, c, d: ((a / c) < d) | ((b**c) > d), + "((a / @c) < @d) | ((b ** @c) > @d)", + ), + ], ) -def test_query_ref_env(data, fn): +def test_query_ref_env(nelem, fn): # prepare - nelem, seed = data expect_fn, query_expr = fn rng = np.random.default_rng(seed=0) df = DataFrame() @@ -225,16 +221,9 @@ def test_query_with_index_keyword(query, a_val, b_val, c_val): assert_eq(out, expect) -@pytest.mark.parametrize( - "data, query", - [ - # Only need to test the dtypes that pandas - # supports but that we do not - (["a", "b", "c"], "data == 'a'"), - ], -) -def test_query_unsupported_dtypes(data, query): - gdf = cudf.DataFrame({"data": data}) +def test_query_unsupported_dtypes(): + query = "data == 'a'" + gdf = cudf.DataFrame({"data": ["a", "b", "c"]}) # make sure the query works in pandas pdf = gdf.to_pandas() @@ -246,3 +235,37 @@ def test_query_unsupported_dtypes(data, query): # but fails in cuDF with pytest.raises(TypeError): gdf.query(query) + + +@pytest.mark.parametrize( + "values", + [ + [0, 1.0, 2.0, None, np.nan, None, 3, 5], + [0, 1.0, 2.0, None, 3, np.nan, None, 4], + [0, 1.0, 2.0, None, 3, np.nan, None, 4, None, 9], + ], +) +@pytest.mark.parametrize("nan_as_null", [True, False]) +@pytest.mark.parametrize( + "query", + [ + "a == 3", + pytest.param( + "a != 3", + marks=pytest.mark.xfail(reason="incompatible with pandas"), + ), + "a < 3", + "a <= 3", + "a < 3", + "a >= 3", + ], +) +def test_query_mask(values, nan_as_null, query): + data = {"a": values} + pdf = pd.DataFrame(data) + gdf = cudf.DataFrame(data, nan_as_null=nan_as_null) + + pdf_q_res = pdf.query(query) + gdf_q_res = gdf.query(query) + + assert_eq(pdf_q_res, gdf_q_res) diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py deleted file mode 100644 index 9372681187d..00000000000 --- a/python/cudf/cudf/tests/test_query_mask.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
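# The file deleted below held four tests (test_mask_0/1/2 and
# test_dataframe_initializer) sharing one body and differing mainly in
# how the frame was built; the single parametrized test_query_mask added
# to test_query.py above covers the same ground. The general idiom, as a
# minimal sketch with hypothetical constructors:
import pytest

@pytest.mark.parametrize(
    "make_data",
    [lambda: [0, 1, 2], lambda: (0, 1, 2), lambda: range(3)],
    ids=["list", "tuple", "range"],
)
def test_sum_is_invariant(make_data):
    # one body, parametrized over construction instead of copy-pasted
    assert sum(make_data()) == 3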
- -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - -_data = [ - {"a": [0, 1.0, 2.0, None, np.nan, None, 3, 5]}, - {"a": [0, 1.0, 2.0, None, 3, np.nan, None, 4]}, - {"a": [0, 1.0, 2.0, None, 3, np.nan, None, 4, None, 9]}, -] -_queries = [ - "a == 3", - # "a != 3", # incompatible with pandas - "a < 3", - "a <= 3", - "a < 3", - "a >= 3", -] - - -@pytest.mark.parametrize("data", _data) -@pytest.mark.parametrize("query", _queries) -def test_mask_0(data, query): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - pdf_q_res = pdf.query(query) - gdf_q_res = gdf.query(query) - - assert_eq(pdf_q_res, gdf_q_res) - - -@pytest.mark.parametrize("data", _data) -@pytest.mark.parametrize("nan_as_null", [False, True]) -@pytest.mark.parametrize("query", _queries) -def test_mask_1(data, nan_as_null, query): - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame.from_pandas(pdf, nan_as_null=nan_as_null) - - pdf_q_res = pdf.query(query) - gdf_q_res = gdf.query(query) - - assert_eq(pdf_q_res, gdf_q_res) - - -@pytest.mark.parametrize("data", _data) -@pytest.mark.parametrize("query", _queries) -def test_mask_2(data, query): - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - pdf_q_res = pdf.query(query) - gdf_q_res = gdf.query(query) - - assert_eq(pdf_q_res, gdf_q_res) - - -@pytest.mark.parametrize("data", _data) -@pytest.mark.parametrize("query", _queries) -def test_dataframe_initializer(data, query): - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - pdf_q_res = pdf.query(query) - gdf_q_res = gdf.query(query) - - assert_eq(pdf_q_res, gdf_q_res) diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index fdb005d0ba9..07a844333cf 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -1,7 +1,4 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. 
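# On the test_series_rank_combinations change in this file: the removed
# decorator materialized 4-element combinations of ndarray sentinels at
# collection time, C(4 + 4 - 1, 4) = 35 combinations times 4 dtypes =
# 140 cases; pairwise scalar parameters keep the interesting
# NaN/inf/finite interactions at 4 * 4 * 4 = 64 cheap cases. A sketch
# of the arithmetic:
from itertools import combinations_with_replacement

sentinels = ["nan", "rand", "inf", "-inf"]
old_cases = len(list(combinations_with_replacement(sentinels, 4))) * 4
new_cases = len(sentinels) ** 2 * 4
print(old_cases, new_cases)  # 140 64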
- -from itertools import chain, combinations_with_replacement, product - import numpy as np import pandas as pd import pytest @@ -38,8 +35,8 @@ def test_rank_all_arguments( # not supported by pandas return - pdf = pdf.copy(deep=True) # for parallel pytest if numeric_only: + pdf = pdf.copy(deep=True) # for parallel pytest pdf["str"] = np.array( ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"] ) @@ -127,25 +124,11 @@ def test_rank_error_arguments(pdf): @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") -@pytest.mark.parametrize( - "elem,dtype", - list( - product( - combinations_with_replacement( - [ - np.full((3,), np.nan), - 100 * np.random.default_rng(seed=0).random(10), - np.full((3,), np.inf), - np.full((3,), -np.inf), - ], - 4, - ), - [np.int32, np.int64, np.float32, np.float64], - ) - ), -) -def test_series_rank_combinations(elem, dtype): - aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(dtype) +@pytest.mark.parametrize("elem1", [np.nan, np.inf, -np.inf, 1.43]) +@pytest.mark.parametrize("elem2", [np.nan, np.inf, -np.inf, 1.43]) +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +def test_series_rank_combinations(elem1, elem2, dtype): + aa = np.array([elem1, elem2], dtype=np.float64).astype(dtype) gdf = DataFrame({"a": aa}) df = pd.DataFrame({"a": aa}) ranked_gs = gdf["a"].rank(method="first") diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index a234f6655ba..144d79ca94a 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -1,8 +1,5 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. - - from decimal import Decimal -from itertools import product import numpy as np import pandas as pd @@ -17,14 +14,17 @@ from cudf.testing import _utils as utils, assert_eq from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if, gen_rand -params_dtype = NUMERIC_TYPES -params_sizes = [1, 2, 3, 127, 128, 129, 200, 10000] +@pytest.fixture(params=NUMERIC_TYPES) +def dtype(request): + return request.param + -params = list(product(params_dtype, params_sizes)) +@pytest.fixture(params=[1, 20]) +def nelem(request): + return request.param -@pytest.mark.parametrize("dtype,nelem", params) def test_sum(dtype, nelem): dtype = cudf.dtype(dtype).type data = gen_rand(dtype, nelem) @@ -86,7 +86,6 @@ def test_sum_string(): Decimal128Dtype(20, 7), ], ) -@pytest.mark.parametrize("nelem", params_sizes) def test_sum_decimal(dtype, nelem): data = [str(x) for x in gen_rand("int64", nelem, seed=0) / 100] @@ -96,7 +95,6 @@ def test_sum_decimal(dtype, nelem): assert_eq(expected, got) -@pytest.mark.parametrize("dtype,nelem", params) def test_product(dtype, nelem): rng = np.random.default_rng(seed=0) dtype = cudf.dtype(dtype).type @@ -162,7 +160,6 @@ def test_product_decimal(dtype): accuracy_for_dtype = {np.float64: 6, np.float32: 5} -@pytest.mark.parametrize("dtype,nelem", params) def test_sum_of_squares(dtype, nelem): dtype = cudf.dtype(dtype).type data = gen_rand(dtype, nelem) @@ -228,7 +225,6 @@ def test_sum_of_squares_decimal(dtype): assert_eq(expected, got) -@pytest.mark.parametrize("dtype,nelem", params) def test_min(dtype, nelem): dtype = cudf.dtype(dtype).type data = gen_rand(dtype, nelem) @@ -274,7 +270,6 @@ def test_min(dtype, nelem): Decimal128Dtype(20, 7), ], ) -@pytest.mark.parametrize("nelem", params_sizes) def test_min_decimal(dtype, nelem): data = [str(x) for x in gen_rand("int64", nelem) / 100] @@ -284,7 +279,6 @@ def test_min_decimal(dtype, 
nelem): assert_eq(expected, got) -@pytest.mark.parametrize("dtype,nelem", params) def test_max(dtype, nelem): dtype = cudf.dtype(dtype).type data = gen_rand(dtype, nelem) @@ -330,7 +324,6 @@ def test_max(dtype, nelem): Decimal128Dtype(20, 7), ], ) -@pytest.mark.parametrize("nelem", params_sizes) def test_max_decimal(dtype, nelem): data = [str(x) for x in gen_rand("int64", nelem) / 100] @@ -340,7 +333,6 @@ def test_max_decimal(dtype, nelem): assert_eq(expected, got) -@pytest.mark.parametrize("nelem", params_sizes) def test_sum_masked(nelem): dtype = np.float64 data = gen_rand(dtype, nelem) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index d4885c4c9fc..89fa6a0bb78 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -12,17 +12,21 @@ from cudf.testing import _utils as utils from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes -repr_categories = [ - "uint16", - "int64", - "float64", - "str", - "category", - "datetime64[ns]", -] + +@pytest.fixture( + params=[ + "uint16", + "int64", + "float64", + "str", + "category", + "datetime64[ns]", + ] +) +def dtype(request): + return request.param -@pytest.mark.parametrize("dtype", repr_categories) @pytest.mark.parametrize("nrows", [0, 5, 10]) def test_null_series(nrows, dtype): rng = np.random.default_rng(seed=0) @@ -40,57 +44,50 @@ def test_null_series(nrows, dtype): else: ps = sr.to_pandas() - pd.options.display.max_rows = int(nrows) - psrepr = repr(ps).replace("NaN", "").replace("None", "") - if "UInt" in psrepr: - psrepr = psrepr.replace("UInt", "uint") - elif "Int" in psrepr: - psrepr = psrepr.replace("Int", "int") - assert psrepr.split() == repr(sr).split() - pd.reset_option("display.max_rows") - - -dtype_categories = [ - "float32", - "float64", - "datetime64[ns]", - "str", - "category", -] + with pd.option_context("display.max_rows", int(nrows)): + psrepr = repr(ps).replace("NaN", "").replace("None", "") + if "UInt" in psrepr: + psrepr = psrepr.replace("UInt", "uint") + elif "Int" in psrepr: + psrepr = psrepr.replace("Int", "int") + assert psrepr.split() == repr(sr).split() @pytest.mark.parametrize("ncols", [1, 2, 3, 4, 5, 10]) def test_null_dataframe(ncols): + dtype_categories = [ + "float32", + "float64", + "datetime64[ns]", + "str", + "category", + ] rng = np.random.default_rng(seed=0) size = 20 gdf = cudf.DataFrame() - for idx, dtype in enumerate(dtype_categories): + for dtype in dtype_categories: sr = cudf.Series(rng.integers(0, 128, size)).astype(dtype) sr[rng.choice([False, True], size=size)] = None gdf[dtype] = sr pdf = gdf.to_pandas() - pd.options.display.max_columns = int(ncols) - pdf_repr = repr(pdf).replace("NaN", "").replace("None", "") - assert pdf_repr.split() == repr(gdf).split() - pd.reset_option("display.max_columns") + with pd.option_context("display.max_columns", int(ncols)): + pdf_repr = repr(pdf).replace("NaN", "").replace("None", "") + assert pdf_repr.split() == repr(gdf).split() -@pytest.mark.parametrize("dtype", repr_categories) @pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) def test_full_series(nrows, dtype): size = 20 rng = np.random.default_rng(seed=0) ps = pd.Series(rng.integers(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) - pd.options.display.max_rows = nrows - assert repr(ps) == repr(sr) - pd.reset_option("display.max_rows") + with pd.option_context("display.max_rows", nrows): + assert repr(ps) == repr(sr) @pytest.mark.parametrize("nrows", [5, 10, 15]) @pytest.mark.parametrize("ncols", [5, 10, 
15]) @pytest.mark.parametrize("size", [20, 21]) -@pytest.mark.parametrize("dtype", repr_categories) def test_full_dataframe_20(dtype, size, nrows, ncols): rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( @@ -195,15 +192,12 @@ def test_MI(): [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], ] - pd.options.display.max_rows = 999 - pd.options.display.max_columns = 0 - gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes)) - pdf = gdf.to_pandas() - assert repr(gdf) == repr(pdf) - assert repr(gdf.index) == repr(pdf.index) - assert repr(gdf.T) == repr(pdf.T) - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") + with pd.option_context("display.max_rows", 999, "display.max_columns", 0): + gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes)) + pdf = gdf.to_pandas() + assert repr(gdf) == repr(pdf) + assert repr(gdf.index) == repr(pdf.index) + assert repr(gdf.T) == repr(pdf.T) @pytest.mark.parametrize("nrows", [0, 1, 3, 5, 10]) @@ -215,13 +209,12 @@ def test_groupby_MI(nrows, ncols): pdf = gdf.to_pandas() gdg = gdf.groupby(["a", "b"], sort=True).count() pdg = pdf.groupby(["a", "b"], sort=True).count() - pd.options.display.max_rows = nrows - pd.options.display.max_columns = ncols - assert repr(gdg) == repr(pdg) - assert repr(gdg.index) == repr(pdg.index) - assert repr(gdg.T) == repr(pdg.T) - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") + with pd.option_context( + "display.max_rows", nrows, "display.max_columns", ncols + ): + assert repr(gdg) == repr(pdg) + assert repr(gdg.index) == repr(pdg.index) + assert repr(gdg.T) == repr(pdg.T) @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) @@ -241,23 +234,25 @@ def test_generic_index(length, dtype): @pytest.mark.parametrize( "gdf", [ - cudf.DataFrame({"a": range(10000)}), - cudf.DataFrame({"a": range(10000), "b": range(10000)}), - cudf.DataFrame({"a": range(20), "b": range(20)}), - cudf.DataFrame( + lambda: cudf.DataFrame({"a": range(10000)}), + lambda: cudf.DataFrame({"a": range(10000), "b": range(10000)}), + lambda: cudf.DataFrame({"a": range(20), "b": range(20)}), + lambda: cudf.DataFrame( { "a": range(20), "b": range(20), "c": ["abc", "def", "xyz", "def", "pqr"] * 4, } ), - cudf.DataFrame(index=[1, 2, 3]), - cudf.DataFrame(index=range(10000)), - cudf.DataFrame(columns=["a", "b", "c", "d"]), - cudf.DataFrame(columns=["a"], index=range(10000)), - cudf.DataFrame(columns=["a", "col2", "...col n"], index=range(10000)), - cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), - cudf.DataFrame( + lambda: cudf.DataFrame(index=[1, 2, 3]), + lambda: cudf.DataFrame(index=range(10000)), + lambda: cudf.DataFrame(columns=["a", "b", "c", "d"]), + lambda: cudf.DataFrame(columns=["a"], index=range(10000)), + lambda: cudf.DataFrame( + columns=["a", "col2", "...col n"], index=range(10000) + ), + lambda: cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), + lambda: cudf.DataFrame( columns=["a", "b", "c", "d"], index=cudf.Series(range(10000)).astype("str"), ), @@ -277,50 +272,52 @@ def test_generic_index(length, dtype): @pytest.mark.parametrize("max_seq_items", [1, 10, 60, 10000, None]) @pytest.mark.parametrize("max_rows", [1, 10, 60, 10000, None]) def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): - pd.options.display.max_seq_items = max_seq_items - pd.options.display.max_rows = max_rows - pdf = gdf.to_pandas() + gdf = gdf() + with pd.option_context( + "display.max_seq_items", max_seq_items, "display.max_rows", max_rows + ): + pdf = 
gdf.to_pandas() - sliced_gdf = gdf[slice] - sliced_pdf = pdf[slice] + sliced_gdf = gdf[slice] + sliced_pdf = pdf[slice] - expected_repr = repr(sliced_pdf).replace("None", "") - actual_repr = repr(sliced_gdf) + expected_repr = repr(sliced_pdf).replace("None", "") + actual_repr = repr(sliced_gdf) - assert expected_repr == actual_repr - pd.reset_option("display.max_rows") - pd.reset_option("display.max_seq_items") + assert expected_repr == actual_repr @pytest.mark.parametrize( "index,expected_repr", [ ( - cudf.Index([1, 2, 3, None]), + lambda: cudf.Index([1, 2, 3, None]), "Index([1, 2, 3, <NA>], dtype='int64')", ), ( - cudf.Index([None, 2.2, 3.324342, None]), + lambda: cudf.Index([None, 2.2, 3.324342, None]), "Index([<NA>, 2.2, 3.324342, <NA>], dtype='float64')", ), ( - cudf.Index([None, None, None], name="hello"), + lambda: cudf.Index([None, None, None], name="hello"), "Index([<NA>, <NA>, <NA>], dtype='object', name='hello')", ), ( - cudf.Index([None, None, None], dtype="float", name="hello"), + lambda: cudf.Index( + [None, None, None], dtype="float", name="hello" + ), "Index([<NA>, <NA>, <NA>], dtype='float64', name='hello')", ), ( - cudf.Index([None], dtype="float64", name="hello"), + lambda: cudf.Index([None], dtype="float64", name="hello"), "Index([<NA>], dtype='float64', name='hello')", ), ( - cudf.Index([None], dtype="int8", name="hello"), + lambda: cudf.Index([None], dtype="int8", name="hello"), "Index([<NA>], dtype='int8', name='hello')", ), ( - cudf.Index([None] * 50, dtype="object"), + lambda: cudf.Index([None] * 50, dtype="object"), "Index([<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, " "<NA>, <NA>, <NA>,\n <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, " "<NA>, <NA>, <NA>, <NA>, <NA>,\n <NA>, <NA>, <NA>, <NA>, " "<NA>,\n <NA>, <NA>],\n dtype='object')", ), ( - cudf.Index([None] * 20, dtype="uint32"), + lambda: cudf.Index([None] * 20, dtype="uint32"), "Index([<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, " "<NA>,\n <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, " "<NA>,\n <NA>, <NA>],\n dtype='uint32')", ), ( - cudf.Index( [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" ), "Index([<NA>, 111, 22, 33, <NA>, 23, 34, 2343, <NA>], " "dtype='int16')", ), ( - cudf.Index([1, 2, 3, None], dtype="category"), + lambda: cudf.Index([1, 2, 3, None], dtype="category"), "CategoricalIndex([1, 2, 3, <NA>], categories=[1, 2, 3], " "ordered=False, dtype='category')", ), ( - cudf.Index([None, None], dtype="category"), + lambda: cudf.Index([None, None], dtype="category"), "CategoricalIndex([<NA>, <NA>], categories=[], ordered=False, " "dtype='category')", ), ( - cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ns]")), + lambda: cudf.Index( + np.array([10, 20, 30, None], dtype="datetime64[ns]") + ), "DatetimeIndex([1970-01-01 00:00:00.000000010, " "1970-01-01 00:00:00.000000020," "\n 1970-01-01 00:00:00.000000030, NaT],\n " "dtype='datetime64[ns]')", ), ( - cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")), + lambda: cudf.Index( + np.array([10, 20, 30, None], dtype="datetime64[s]") + ), "DatetimeIndex([1970-01-01 00:00:10, " "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" " NaT],\n dtype='datetime64[s]')", ), ( - cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")), + lambda: cudf.Index( + np.array([10, 20, 30, None], dtype="datetime64[us]") + ), "DatetimeIndex([1970-01-01 00:00:00.000010, " "1970-01-01 00:00:00.000020,\n " "1970-01-01 00:00:00.000030, NaT],\n " "dtype='datetime64[us]')", ), ( - cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")), + lambda: cudf.Index( + np.array([10, 20, 30, None], dtype="datetime64[ms]") + ), "DatetimeIndex([1970-01-01 00:00:00.010, " "1970-01-01 00:00:00.020,\n "
"1970-01-01 00:00:00.030, NaT],\n " "dtype='datetime64[ms]')", ), ( - cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), + lambda: cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), "DatetimeIndex([NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, " "NaT, NaT], dtype='datetime64[ms]')", ), ], ) def test_generic_index_null(index, expected_repr): + index = index() actual_repr = repr(index) assert expected_repr == actual_repr @@ -598,7 +604,7 @@ def test_timedelta_series_s_us_repr(data, dtype): "ser, expected_repr", [ ( - cudf.Series([], dtype="timedelta64[ns]"), + lambda: cudf.Series([], dtype="timedelta64[ns]"), textwrap.dedent( """ Series([], dtype: timedelta64[ns]) @@ -606,7 +612,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series([], dtype="timedelta64[ms]"), + lambda: cudf.Series([], dtype="timedelta64[ms]"), textwrap.dedent( """ Series([], dtype: timedelta64[ms]) @@ -614,7 +620,9 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), + lambda: cudf.Series( + [1000000, 200000, 3000000], dtype="timedelta64[ns]" + ), textwrap.dedent( """ 0 0 days 00:00:00.001000 @@ -625,7 +633,9 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ms]"), + lambda: cudf.Series( + [1000000, 200000, 3000000], dtype="timedelta64[ms]" + ), textwrap.dedent( """ 0 0 days 00:16:40 @@ -636,7 +646,9 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series([1000000, 200000, None], dtype="timedelta64[ns]"), + lambda: cudf.Series( + [1000000, 200000, None], dtype="timedelta64[ns]" + ), textwrap.dedent( """ 0 0 days 00:00:00.001000000 @@ -647,7 +659,9 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series([1000000, 200000, None], dtype="timedelta64[ms]"), + lambda: cudf.Series( + [1000000, 200000, None], dtype="timedelta64[ms]" + ), textwrap.dedent( """ 0 0 days 00:16:40 @@ -658,7 +672,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + lambda: cudf.Series( [None, None, None, None, None], dtype="timedelta64[ns]" ), textwrap.dedent( @@ -673,7 +687,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + lambda: cudf.Series( [None, None, None, None, None], dtype="timedelta64[ms]" ), textwrap.dedent( @@ -688,7 +702,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + lambda: cudf.Series( [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ns]" ), textwrap.dedent( @@ -704,7 +718,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + lambda: cudf.Series( [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ms]" ), textwrap.dedent( @@ -720,7 +734,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + lambda: cudf.Series( [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], dtype="timedelta64[ns]", ), @@ -738,7 +752,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + lambda: cudf.Series( [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], dtype="timedelta64[ms]", ), @@ -756,7 +770,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + lambda: cudf.Series( [ 13645765432432, 134736784, @@ -782,7 +796,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + lambda: cudf.Series( [ 13645765432432, 134736784, @@ -808,7 +822,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + 
lambda: cudf.Series( [ 13645765432432, 134736784, @@ -835,7 +849,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ), ), ( - cudf.Series( + lambda: cudf.Series( [ 13645765432432, 134736784, @@ -866,7 +880,7 @@ def test_timedelta_series_s_us_repr(data, dtype): ) def test_timedelta_series_ns_ms_repr(ser, expected_repr): expected = expected_repr - actual = repr(ser) + actual = repr(ser()) assert expected.split() == actual.split() @@ -875,7 +889,7 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): "df,expected_repr", [ ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": cudf.Series( [1000000, 200000, 3000000], dtype="timedelta64[s]" @@ -892,7 +906,7 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": cudf.Series( [ @@ -923,7 +937,7 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": cudf.Series( [ @@ -954,7 +968,7 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": cudf.Series( [1, 2, 3, 4, 5, 6, 7], @@ -987,7 +1001,7 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": cudf.Series( ["a", "f", "q", "e", "w", "e", "t"], @@ -1022,7 +1036,7 @@ def test_timedelta_series_ns_ms_repr(ser, expected_repr): ], ) def test_timedelta_dataframe_repr(df, expected_repr): - actual_repr = repr(df) + actual_repr = repr(df()) assert actual_repr.split() == expected_repr.split() @@ -1031,20 +1045,22 @@ def test_timedelta_dataframe_repr(df, expected_repr): "index, expected_repr", [ ( - cudf.Index([1000000, 200000, 3000000], dtype="timedelta64[ms]"), + lambda: cudf.Index( + [1000000, 200000, 3000000], dtype="timedelta64[ms]" + ), "TimedeltaIndex(['0 days 00:16:40', " "'0 days 00:03:20', '0 days 00:50:00'], " "dtype='timedelta64[ms]')", ), ( - cudf.Index( + lambda: cudf.Index( [None, None, None, None, None], dtype="timedelta64[us]" ), "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " "dtype='timedelta64[us]')", ), ( - cudf.Index( + lambda: cudf.Index( [ 136457654, None, @@ -1063,7 +1079,7 @@ def test_timedelta_dataframe_repr(df, expected_repr): " dtype='timedelta64[us]')", ), ( - cudf.Index( + lambda: cudf.Index( [ 136457654, None, @@ -1083,7 +1099,7 @@ def test_timedelta_dataframe_repr(df, expected_repr): ], ) def test_timedelta_index_repr(index, expected_repr): - actual_repr = repr(index) + actual_repr = repr(index()) assert actual_repr.split() == expected_repr.split() @@ -1111,18 +1127,17 @@ def test_timedelta_index_repr(index, expected_repr): ) @pytest.mark.parametrize("max_seq_items", [None, 1, 2, 5, 10, 100]) def test_multiindex_repr(pmi, max_seq_items): - pd.set_option("display.max_seq_items", max_seq_items) - gmi = cudf.from_pandas(pmi) + with pd.option_context("display.max_seq_items", max_seq_items): + gmi = cudf.from_pandas(pmi) - assert repr(gmi) == repr(pmi) - pd.reset_option("display.max_seq_items") + assert repr(gmi) == repr(pmi) @pytest.mark.parametrize( "gdi, expected_repr", [ ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": [None, 1, 2, 3], "b": ["abc", None, "xyz", None], @@ -1142,7 +1157,7 @@ def test_multiindex_repr(pmi, max_seq_items): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": cudf.Series([None, np.nan, 2, 3], nan_as_null=False), "b": ["abc", None, "xyz", None], @@ -1162,7 +1177,7 @@ def test_multiindex_repr(pmi, max_seq_items): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": 
cudf.Series([None, 1, 2, 3], dtype="datetime64[ns]"), "b": ["abc", None, "xyz", None], @@ -1182,7 +1197,7 @@ def test_multiindex_repr(pmi, max_seq_items): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": cudf.Series([None, 1, 2, 3], dtype="datetime64[ns]"), "b": ["abc", None, "xyz", None], @@ -1202,7 +1217,7 @@ def test_multiindex_repr(pmi, max_seq_items): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": ["abc", None, "xyz", None], "b": cudf.Series([None, 1, 2, 3], dtype="timedelta64[ns]"), @@ -1222,7 +1237,7 @@ def test_multiindex_repr(pmi, max_seq_items): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": ["abc", None, "xyz", None], "b": cudf.Series([None, 1, 2, 3], dtype="timedelta64[ns]"), @@ -1242,7 +1257,7 @@ def test_multiindex_repr(pmi, max_seq_items): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": [None, None, None, None], "b": cudf.Series( @@ -1264,7 +1279,7 @@ def test_multiindex_repr(pmi, max_seq_items): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": [1, 2, None, 3, 5], "b": [ @@ -1294,7 +1309,7 @@ def test_multiindex_repr(pmi, max_seq_items): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": [1, 2, None, 3, 5], "b": [ @@ -1324,7 +1339,7 @@ def test_multiindex_repr(pmi, max_seq_items): ), ), ( - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": ["(abc", "2", None, "3", "5"], "b": [ @@ -1356,7 +1371,7 @@ def test_multiindex_repr(pmi, max_seq_items): ], ) def test_multiindex_null_repr(gdi, expected_repr): - actual_repr = repr(gdi) + actual_repr = repr(gdi()) assert actual_repr.split() == expected_repr.split() From e5fd4e638b26f645726c23eba86aa0de1fea1c87 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:56:13 -0700 Subject: [PATCH 060/366] Increase alignment requirement for parquet bloom filter to 256 (#19573) Closes #19539. Related to https://github.com/rapidsai/rmm/issues/2002 This PR increases the alignment requirement for parquet bloom filters from `32` to `256` (== the CUDA allocation alignment) to avoid allocating invalidly sized buffers and/or corrupting data when an aligned memory-resource adapter is in use. Doing so doesn't change anything in practice, since 256-byte-aligned addresses are also 32-byte aligned.
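For intuition, that compatibility argument can be sanity-checked on the host with a small sketch (plain Python, illustrative only; `align_up` is a hypothetical helper, not cudf or rmm code):

```python
CUDA_ALLOCATION_ALIGNMENT = 256  # assumed to mirror rmm::CUDA_ALLOCATION_ALIGNMENT


def align_up(size: int, alignment: int) -> int:
    """Round size up to the next multiple of a power-of-two alignment."""
    assert alignment & (alignment - 1) == 0, "alignment must be a power of 2"
    return (size + alignment - 1) & ~(alignment - 1)


# Every size padded to a 256-byte boundary is also 32-byte aligned, so the
# stricter requirement cannot break code that assumed the old alignment.
for size in (1, 31, 32, 200, 257, 4096):
    padded = align_up(size, CUDA_ALLOCATION_ALIGNMENT)
    assert padded % 256 == 0 and padded % 32 == 0
```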
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19573 --- cpp/src/io/parquet/bloom_filter_reader.cu | 10 +++-- .../io/experimental/hybrid_scan_test.cpp | 3 +- .../parquet/bloom_filter_alignment.parquet | Bin 0 -> 1805 bytes python/cudf/cudf/tests/test_parquet.py | 41 ++++++++++++++++++ 4 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 python/cudf/cudf/tests/data/parquet/bloom_filter_alignment.parquet diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index f222365de18..d2d7fcac959 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -402,10 +402,12 @@ size_t aggregate_reader_metadata::get_bloom_filter_alignment() const // Required alignment: // https://github.com/NVIDIA/cuCollections/blob/deab5799f3e4226cb8a49acf2199c03b14941ee4/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh#L55-L67 using policy_type = cuco::arrow_filter_policy; - return alignof(cuco::bloom_filter_ref, - cuco::thread_scope_thread, - policy_type>::filter_block_type); + auto constexpr alignment = alignof(cuco::bloom_filter_ref, + cuco::thread_scope_thread, + policy_type>::filter_block_type); + static_assert((alignment & (alignment - 1)) == 0, "Alignment must be a power of 2"); + return std::max(alignment, rmm::CUDA_ALLOCATION_ALIGNMENT); } std::vector aggregate_reader_metadata::read_bloom_filters( diff --git a/cpp/tests/io/experimental/hybrid_scan_test.cpp b/cpp/tests/io/experimental/hybrid_scan_test.cpp index 6ec572d1e49..5cb787f451e 100644 --- a/cpp/tests/io/experimental/hybrid_scan_test.cpp +++ b/cpp/tests/io/experimental/hybrid_scan_test.cpp @@ -31,9 +31,10 @@ #include #include +#include #include -auto constexpr bloom_filter_alignment = 32; +auto constexpr bloom_filter_alignment = rmm::CUDA_ALLOCATION_ALIGNMENT; namespace { diff --git a/python/cudf/cudf/tests/data/parquet/bloom_filter_alignment.parquet b/python/cudf/cudf/tests/data/parquet/bloom_filter_alignment.parquet new file mode 100644 index 0000000000000000000000000000000000000000..26441e59257a3a79546a061950e0b7fdcb0fad31 GIT binary patch literal 1805 zcmb`IZ%i9y9LJw4Z7F{WE%a{fEb{QL>9($e0;3wHl-07aQaa`$nl0>l_t2j0_3-Y> zpT#A(kTKjeOy4N42$%&;G`?Jni;`?=jM)pLaanv}w#W=7rpd@MjWhn9GIS7MEM`qV z-(H`4?)&>ae}2!!Ljx_Wk$uO&&Ksg^ghdFQXY%J5_%#4UfDteOW}q0b03|>vPzIC( zR-gi?1Z;pEr~;}12S9x}U>o2BT)=kV3BV2108aw700!!SdY}Pd5j$JNCVY^cjWx57 z##hWNO)kQiLtVH$%)1XP_@cKu-WU(&Egf3wWTJ;pcBdQ|i1g6}`>!!m_MHN*d zT1`oxs%u!5G!2ZrOw6JX&D75%+;~jVQ<%qr5ON&EDSnhN``UV>VI7lkA;o8tgk$zu z&$VTwG%5HCN=8u!wj+nEh$O?t+K^4=)5H~Xq5LyrysGk9eS+?*n1VH;j!FW-8AafA z36hnvSf0SVN_d)bM9F1Eh@Bw1PE^R8@p+*Kk2PIU$*xEPa*%PIWW~DUR;gwWa+q|Z zj_bH)7z5_B6Ih0BWi{`24of<%j2Bb|ZfMd;Q;j#OXgZDcvXWrJlesLP(^HCc5@vHG zr^3KALTg3LjbtRqjj&uJttkm6Nz$0BMrESJR#MrOl++BZJ4jfA^aHJasJb>P?Ns(s z>7Wv%@>Gr0*xEv`0wDez3bwXA^mJ?c|JXr4Ow>9zmDMEUhU(n_`J1lQAAM!#jqb^{ zJGUz?Gz9Br-uPH}`xoZ^p#6qdn0~hF)qvZwwCCpa#Yul`srL-mcwE5Xvmv5E^rsG#bJAJ=BT{XC9HT^s@T)Fu9rG=H{esuTz+(Lfa zl^wO_na`@v4o-a;{N2fnZ}p3}n=jw{6?v9RU-~vC^_*Vg`IJ2{yYk+fOX1KF%lE5| z_b+@j_1dM2bD`XiyNHO9%~`~cNN@JiD#|SbGO*-bo^H>ILkH;7APeM zAAW>xH2Npih3MSb1}@^UAg?zJ%M|X$qeRtcy^pr|Tm8-0o5)ES5qGo*?Lu>hAc{NC Lb(pd^ya@jWHUP16 literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index d1e82a552ad..94eb2c794a5 100644 --- 
a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -4523,6 +4523,47 @@ def test_parquet_bloom_filters( ) +@pytest.fixture(params=["cuda", "pool", "cuda_async"]) +def memory_resource(request): + import rmm + + current_mr = rmm.mr.get_current_device_resource() + + kind = request.param + if kind == "cuda": + mr = rmm.mr.CudaMemoryResource() + elif kind == "pool": + base = rmm.mr.CudaMemoryResource() + free, _ = rmm.mr.available_device_memory() + size = int(round(free * 0.5 / 256) * 256) + mr = rmm.mr.PoolMemoryResource(base, size, size) + elif kind == "cuda_async": + mr = rmm.mr.CudaAsyncMemoryResource() + + rmm.mr.set_current_device_resource(mr) + + try: + yield mr + finally: + rmm.mr.set_current_device_resource(current_mr) + + +@pytest.mark.parametrize("columns", [["r_reason_desc"], None]) +def test_parquet_bloom_filters_alignment(datadir, columns, memory_resource): + fname = datadir / "bloom_filter_alignment.parquet" + filters = [("r_reason_desc", "==", "Did not like the color")] + + # Read expected table using pyarrow + expected = pq.read_table(fname, columns=columns, filters=filters) + + # Read with cudf using the memory resource from fixture + read = cudf.read_parquet( + fname, columns=columns, filters=filters + ).to_arrow() + + assert_eq(expected, read) + + def test_parquet_reader_unsupported_compression(datadir): fname = datadir / "hadoop_lz4_compressed.parquet" From f9b180c50cbfd7dab999a4ef60772d7eb11f94a3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Aug 2025 12:39:13 -0700 Subject: [PATCH 061/366] Remove cudf/_fuzz_testing directory (#19510) Discussed offline, `_fuzz_testing` contained a set of scripts that were once used for testing major IO refactors. It appears it hasn't been used in quite some time, and aspirationally we would like to move towards using a library like `hypothesis` for continual fuzz testing (xref https://github.com/rapidsai/cudf/issues/1612).
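A property-based test in that style might look like the following minimal sketch (illustrative only, not part of this PR; assumes `hypothesis` is installed, and `test_csv_roundtrip` is a hypothetical name):

```python
from io import StringIO

from hypothesis import given, settings, strategies as st

import cudf


@given(
    data=st.lists(
        st.integers(min_value=-(2**53), max_value=2**53),
        min_size=1,
        max_size=50,
    )
)
@settings(deadline=None)  # the first GPU call can exceed hypothesis' default deadline
def test_csv_roundtrip(data):
    # Property: writing a frame to CSV and reading it back preserves the values.
    df = cudf.DataFrame({"a": data})
    result = cudf.read_csv(StringIO(df.to_csv(index=False)))
    assert result["a"].to_arrow().to_pylist() == data
```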
Marking as non-breaking, as this was "private" functionality in the `cudf` namespace. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19510 --- python/cudf/cudf/_fuzz_testing/__init__.py | 0 python/cudf/cudf/_fuzz_testing/avro.py | 117 -------- python/cudf/cudf/_fuzz_testing/csv.py | 203 -------------- python/cudf/cudf/_fuzz_testing/fuzzer.py | 114 -------- python/cudf/cudf/_fuzz_testing/io.py | 110 -------- python/cudf/cudf/_fuzz_testing/json.py | 193 ------------- python/cudf/cudf/_fuzz_testing/main.py | 45 --- python/cudf/cudf/_fuzz_testing/orc.py | 199 -------------- python/cudf/cudf/_fuzz_testing/parquet.py | 169 ------------ .../_fuzz_testing/tests/fuzz_test_avro.py | 38 --- .../cudf/_fuzz_testing/tests/fuzz_test_csv.py | 132 --------- .../_fuzz_testing/tests/fuzz_test_json.py | 94 ------- .../cudf/_fuzz_testing/tests/fuzz_test_orc.py | 98 ------- .../_fuzz_testing/tests/fuzz_test_parquet.py | 106 -------- .../cudf/cudf/_fuzz_testing/tests/readme.md | 100 ------- python/cudf/cudf/_fuzz_testing/utils.py | 257 ------------------ 16 files changed, 1975 deletions(-) delete mode 100644 python/cudf/cudf/_fuzz_testing/__init__.py delete mode 100644 python/cudf/cudf/_fuzz_testing/avro.py delete mode 100644 python/cudf/cudf/_fuzz_testing/csv.py delete mode 100644 python/cudf/cudf/_fuzz_testing/fuzzer.py delete mode 100644 python/cudf/cudf/_fuzz_testing/io.py delete mode 100644 python/cudf/cudf/_fuzz_testing/json.py delete mode 100644 python/cudf/cudf/_fuzz_testing/main.py delete mode 100644 python/cudf/cudf/_fuzz_testing/orc.py delete mode 100644 python/cudf/cudf/_fuzz_testing/parquet.py delete mode 100644 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_avro.py delete mode 100644 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py delete mode 100644 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py delete mode 100644 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py delete mode 100644 python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py delete mode 100644 python/cudf/cudf/_fuzz_testing/tests/readme.md delete mode 100644 python/cudf/cudf/_fuzz_testing/utils.py diff --git a/python/cudf/cudf/_fuzz_testing/__init__.py b/python/cudf/cudf/_fuzz_testing/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py deleted file mode 100644 index 172193aa672..00000000000 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION.
- -import copy -import io -import logging -import random - -import numpy as np - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pandas_to_avro, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class AvroReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._df = None - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - - {"category"} - # No unsigned support in avro: - # https://avro.apache.org/docs/current/spec.html - - cudf.utils.dtypes.UNSIGNED_TYPES - # TODO: Remove DATETIME_TYPES once - # following bug is fixed: - # https://github.com/rapidsai/cudf/issues/6482 - - cudf.utils.dtypes.DATETIME_TYPES - # TODO: Remove DURATION_TYPES once - # following bug is fixed: - # https://github.com/rapidsai/cudf/issues/6604 - - cudf.utils.dtypes.TIMEDELTA_TYPES - ) - seed = random.randint(0, 2**32 - 1) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list, seed - ) - self._current_params["dtypes_meta"] = dtypes_meta - - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_cols"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - self._df = df - logging.info(f"Shape of DataFrame generated: {table.shape}") - - file_obj = io.BytesIO() - pandas_to_avro(df, file_io_obj=file_obj) - file_obj.seek(0) - buf = file_obj.read() - self._current_buffer = copy.copy(buf) - return (df, buf) - - def write_data(self, file_name): - if self._current_buffer is not None: - with open(file_name + "_crash.avro", "wb") as crash_dataset: - crash_dataset.write(self._current_buffer) - - def set_rand_params(self, params): - params_dict = {} - rng = np.random.default_rng(seed=None) - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "columns": - col_size = self._rand(len(self._df.columns)) - params_dict[param] = list( - np.unique(rng.choice(self._df.columns, col_size)) - ) - elif param in ("skiprows", "num_rows"): - params_dict[param] = rng.choice( - [None, self._rand(len(self._df))] - ) - else: - params_dict[param] = rng.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py deleted file mode 100644 index fa3ed40ce91..00000000000 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import logging -import random - -import numpy as np - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg -from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class CSVReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - seed = random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list, seed - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - - logging.info(f"Shape of DataFrame generated: {df.shape}") - self._current_buffer = df - return df.to_csv() - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_csv(file_name + "_crash.csv") - - def set_rand_params(self, params): - params_dict = {} - rng = np.random.default_rng(seed=None) - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "usecols": - col_size = self._rand(len(self._df.columns)) - col_val = rng.choice( - [ - None, - np.unique(rng.choice(self._df.columns, col_size)), - ] - ) - params_dict[param] = ( - col_val if col_val is None else list(col_val) - ) - elif param == "dtype": - dtype_val = rng.choice([None, self._df.dtypes.to_dict()]) - if dtype_val is not None: - dtype_val = { - col_name: "category" - if isinstance(dtype, cudf.CategoricalDtype) - else pandas_dtypes_to_np_dtypes[dtype] - for col_name, dtype in dtype_val.items() - } - params_dict[param] = dtype_val - elif param == "header": - header_val = rng.choice( - ["infer", rng.integers(low=0, high=len(self._df))] - ) - params_dict[param] = header_val - elif param == "skiprows": - params_dict[param] = rng.integers( - low=0, high=len(self._df) - ) - elif param == "skipfooter": - params_dict[param] = rng.integers( - low=0, high=len(self._df) - ) - elif param == "nrows": - nrows_val = rng.choice( - [None, rng.integers(low=0, high=len(self._df))] - ) - params_dict[param] = nrows_val - else: - params_dict[param] = rng.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) - - -class CSVWriter(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - 
max_lists_nesting_depth=max_lists_nesting_depth, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - seed = random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list, seed - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - - logging.info(f"Shape of DataFrame generated: {df.shape}") - self._current_buffer = df - return df - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_csv(file_name + "_crash.csv") - - def set_rand_params(self, params): - params_dict = {} - rng = np.random.default_rng(seed=None) - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "columns": - col_size = self._rand(len(self._current_buffer.columns)) - params_dict[param] = list( - np.unique( - rng.choice(self._current_buffer.columns, col_size) - ) - ) - elif param == "chunksize": - params_dict[param] = rng.choice( - [ - None, - rng.integers( - low=1, high=max(1, len(self._current_buffer)) - ), - ] - ) - else: - params_dict[param] = rng.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py deleted file mode 100644 index 4b080937a17..00000000000 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import datetime -import json -import logging -import os -import sys -import traceback - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class Fuzzer: - def __init__( - self, - target, - data_handler_class, - dirs=None, - crash_reports_dir=None, - regression=False, - max_rows_size=100_000, - max_cols_size=1000, - runs=-1, - max_string_length=None, - params=None, - write_data_on_failure=True, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - self._target = target - self._dirs = [] if dirs is None else dirs - self._crash_dir = crash_reports_dir - self._data_handler = data_handler_class( - dirs=self._dirs, - max_rows=max_rows_size, - max_columns=max_cols_size, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._total_executions = 0 - self._regression = regression - self._start_time = None - self.runs = runs - self.params = params - self.write_data_on_failure = write_data_on_failure - - def log_stats(self): - end_time = datetime.datetime.now() - total_time_taken = end_time - self._start_time - - logging.info(f"Run-Time elapsed (hh:mm:ss.ms) {total_time_taken}") - - def write_crash(self, error): - error_file_name = str(datetime.datetime.now()) - if self._crash_dir: - crash_path = os.path.join( - self._crash_dir, - error_file_name + "_crash.json", - ) - crash_log_path = os.path.join( - self._crash_dir, - error_file_name + "_crash.log", - ) - else: - crash_path = error_file_name + "_crash.json" - crash_log_path = error_file_name + "_crash.log" - - with open(crash_path, "w") as f: - json.dump( - self._data_handler.current_params, f, sort_keys=True, indent=4 - ) - - logging.info(f"Crash params was written to {crash_path}") - - with open(crash_log_path, "w") as f: - f.write(str(error)) - logging.info(f"Crash exception was written to {crash_log_path}") - - if self.write_data_on_failure: - self._data_handler.write_data(error_file_name) - - def start(self): - while True: - logging.info(f"Running test {self._total_executions}") - file_name = self._data_handler.generate_input() - try: - self._start_time = datetime.datetime.now() - if self.params is None: - self._target(file_name) - else: - self._data_handler.set_rand_params(self.params) - kwargs = self._data_handler._current_params["test_kwargs"] - logging.info(f"Parameters passed: {kwargs!s}") - self._target(file_name, **kwargs) - except KeyboardInterrupt: - logging.info( - f"Keyboard Interrupt encountered, stopping after " - f"{self.runs} runs." - ) - sys.exit(0) - except Exception as e: - logging.exception(e) - self.write_crash(traceback.format_exc()) - self.log_stats() - if self.runs != -1 and self._total_executions >= self.runs: - logging.info(f"Completed {self.runs}, stopping now.") - break - - self._total_executions += 1 diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py deleted file mode 100644 index a4b8e18d8b4..00000000000 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import copy -import json -import logging -import os -import random -import sys - -import numpy as np - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class IOFuzz: - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - max_structs_nesting_depth=None, - max_struct_null_frequency=None, - max_struct_types_at_each_level=None, - ): - dirs = [] if dirs is None else dirs - self._inputs = [] - self._max_rows = max_rows - self._max_columns = max_columns - self._max_string_length = max_string_length - self._max_lists_length = max_lists_length - self._max_lists_nesting_depth = max_lists_nesting_depth - self._max_structs_nesting_depth = max_structs_nesting_depth - self._max_struct_null_frequency = max_struct_null_frequency - self._max_struct_types_at_each_level = max_struct_types_at_each_level - - for i, path in enumerate(dirs): - if i == 0 and not os.path.exists(path): - raise FileNotFoundError(f"No {path} exists") - - if os.path.isfile(path) and path.endswith("_crash.json"): - self._load_params(path) - else: - for i in os.listdir(path): - file_name = os.path.join(path, i) - if os.path.isfile(file_name) and file_name.endswith( - "_crash.json" - ): - self._load_params(file_name) - self._regression = bool(self._inputs) - self._idx = 0 - self._current_params = {} - self._current_buffer = None - - def _load_params(self, path): - with open(path) as f: - params = json.load(f) - self._inputs.append(params) - - @staticmethod - def _rand(n): - return random.randrange(0, n + 1) - - def generate_input(self): - raise NotImplementedError("Must be implemented by inherited class") - - @property - def current_params(self): - return self._current_params - - def get_next_regression_params(self): - if self._idx >= len(self._inputs): - logging.info( - "Reached the end of all crash.json files to run..Exiting.." - ) - sys.exit(0) - param = self._inputs[self._idx] - dtypes_meta = param["dtypes_meta"] - num_rows = param["num_rows"] - num_cols = param["num_columns"] - seed = param["seed"] - random.seed(seed) - self._idx += 1 - self._current_params = copy.copy(param) - return dtypes_meta, num_rows, num_cols, seed - - def set_rand_params(self, params): - rng = np.random.default_rng(seed=None) - params_dict = { - param: rng.choice(values) for param, values in params.items() - } - self._current_params["test_kwargs"] = self.process_kwargs( - params_dict=params_dict - ) - - def process_kwargs(self, params_dict): - return { - key: bool(value) - if isinstance(value, np.bool_) - else str(value) - if isinstance(value, np.dtype) - else value - for key, value in params_dict.items() - } diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py deleted file mode 100644 index 45d2c8d8cf0..00000000000 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import logging -import random -from collections import abc - -import numpy as np - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg -from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -def _get_dtype_param_value(dtype_val): - if dtype_val is not None and isinstance(dtype_val, abc.Mapping): - processed_dtypes = {} - for col_name, dtype in dtype_val.items(): - if isinstance(dtype, cudf.CategoricalDtype): - processed_dtypes[col_name] = "category" - else: - processed_dtypes[col_name] = str( - pandas_dtypes_to_np_dtypes.get(dtype, dtype) - ) - return processed_dtypes - return dtype_val - - -class JSONReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - seed = random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - # https://github.com/pandas-dev/pandas/issues/20599 - - {"uint64"} - # TODO: Remove DATETIME_TYPES after this is fixed: - # https://github.com/rapidsai/cudf/issues/6586 - - set(cudf.utils.dtypes.DATETIME_TYPES) - ) - # TODO: Uncomment following after following - # issue is fixed: - # https://github.com/rapidsai/cudf/issues/7086 - # dtypes_list.extend(["list"]) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list, seed - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - self._current_buffer = df - logging.info(f"Shape of DataFrame generated: {df.shape}") - - return df.to_json(orient="records", lines=True) - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_json( - file_name + "_crash_json.json", orient="records", lines=True - ) - - def set_rand_params(self, params): - params_dict = {} - rng = np.random.default_rng(seed=None) - for param, values in params.items(): - if param == "dtype" and values == ALL_POSSIBLE_VALUES: - dtype_val = rng.choice( - [True, self._current_buffer.dtypes.to_dict()] - ) - params_dict[param] = _get_dtype_param_value(dtype_val) - else: - params_dict[param] = rng.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) - - -class JSONWriter(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = 
self.get_next_regression_params() - else: - seed = random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - # https://github.com/pandas-dev/pandas/issues/20599 - - {"uint64"} - # TODO: Remove DATETIME_TYPES after this is fixed: - # https://github.com/rapidsai/cudf/issues/6586 - - set(cudf.utils.dtypes.DATETIME_TYPES) - ) - # TODO: Uncomment following after following - # issue is fixed: - # https://github.com/rapidsai/cudf/issues/7086 - # dtypes_list.extend(["list"]) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list, seed - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - - logging.info(f"Shape of DataFrame generated: {df.shape}") - self._current_buffer = df - return df - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_json( - file_name + "_crash_json.json", lines=True, orient="records" - ) - - def set_rand_params(self, params): - params_dict = {} - rng = np.random.default_rng(seed=None) - for param, values in params.items(): - if param == "dtype" and values == ALL_POSSIBLE_VALUES: - dtype_val = rng.choice( - [True, self._current_buffer.dtypes.to_dict()] - ) - params_dict[param] = _get_dtype_param_value(dtype_val) - else: - params_dict[param] = rng.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/main.py b/python/cudf/cudf/_fuzz_testing/main.py deleted file mode 100644 index 54e49b63e41..00000000000 --- a/python/cudf/cudf/_fuzz_testing/main.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. - -from cudf._fuzz_testing import fuzzer - - -class PythonFuzz: - def __init__(self, func, params=None, data_handle=None, **kwargs): - self.function = func - self.data_handler_class = data_handle - self.fuzz_worker = fuzzer.Fuzzer( - target=self.function, - data_handler_class=self.data_handler_class, - dirs=kwargs.get("dir", None), - crash_reports_dir=kwargs.get("crash_reports_dir", None), - regression=kwargs.get("regression", False), - max_rows_size=kwargs.get("max_rows_size", 100_000), - max_cols_size=kwargs.get("max_cols_size", 1000), - runs=kwargs.get("runs", -1), - max_string_length=kwargs.get("max_string_length", None), - params=params, - write_data_on_failure=kwargs.get("write_data_on_failure", True), - max_lists_length=kwargs.get("max_lists_length", None), - max_lists_nesting_depth=kwargs.get( - "max_lists_nesting_depth", None - ), - ) - - def __call__(self, *args, **kwargs): - self.fuzz_worker.start() - - -# wrap PythonFuzz to allow for deferred calling -def pythonfuzz(function=None, data_handle=None, params=None, **kwargs): - if function: - return PythonFuzz(function, params, **kwargs) - else: - - def wrapper(function): - return PythonFuzz(function, params, data_handle, **kwargs) - - return wrapper - - -if __name__ == "__main__": - PythonFuzz(None) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py deleted file mode 100644 index 4d9e4abb09e..00000000000 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import copy -import io -import logging -import random - -import numpy as np -import pyarrow as pa - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class OrcReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._df = None - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - - {"category"} - # Following dtypes are not supported by orc - # https://orc.apache.org/specification/ORCv0/ - - cudf.utils.dtypes.TIMEDELTA_TYPES - - cudf.utils.dtypes.UNSIGNED_TYPES - - {"datetime64[ns]"} - ) - seed = random.randint(0, 2**32 - 1) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list, seed - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_cols"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - logging.info(f"Shape of DataFrame generated: {table.shape}") - self._df = df - file_obj = io.BytesIO() - pa.orc.write_table(table, file_obj, stripe_size=self._rand(len(df))) - file_obj.seek(0) - buf = file_obj.read() - self._current_buffer = copy.copy(buf) - return (df, buf) - - def write_data(self, file_name): - if self._current_buffer is not None: - with open(file_name + "_crash.orc", "wb") as crash_dataset: - crash_dataset.write(self._current_buffer) - - def set_rand_params(self, params): - params_dict = {} - rng = np.random.default_rng(seed=None) - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "columns": - col_size = self._rand(len(self._df.columns)) - params_dict[param] = list( - np.unique(rng.choice(self._df.columns, col_size)) - ) - elif param == "stripes": - f = io.BytesIO(self._current_buffer) - orcFile = pa.orc.ORCFile(f) - stripes = list(range(orcFile.nstripes)) - params_dict[param] = rng.choice( - [ - None, - list( - map( - int, - np.unique( - rng.choice(stripes, orcFile.nstripes) - ), - ) - ), - ] - ) - elif param == "use_index": - params_dict[param] = rng.choice([True, False]) - elif param in ("skiprows", "num_rows"): - params_dict[param] = rng.choice( - [None, self._rand(len(self._df))] - ) - else: - if not isinstance(values, list): - raise TypeError("values must be of type list") - params_dict[param] = rng.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) - - -class OrcWriter(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - 
max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._df = None - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - # TODO: Remove "bool" from below - # list after following issue is fixed: - # https://github.com/rapidsai/cudf/issues/6763 - - {"category", "bool"} - # Following dtypes are not supported by orc - # https://orc.apache.org/specification/ORCv0/ - - cudf.utils.dtypes.TIMEDELTA_TYPES - - cudf.utils.dtypes.UNSIGNED_TYPES - # TODO: Remove `DATETIME_TYPES` once - # following bug is fixed: - # https://github.com/rapidsai/cudf/issues/7355 - - cudf.utils.dtypes.DATETIME_TYPES - ) - seed = random.randint(0, 2**32 - 1) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list, seed - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_cols"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - logging.info(f"Shape of DataFrame generated: {table.shape}") - self._df = df - return df - - def write_data(self, file_name): - # Due to the lack of really fast reference writer we are dumping - # the dataframe to a parquet file - if self._df is not None: - self._df.to_parquet(file_name + "_crash.parquet") diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py deleted file mode 100644 index bd3df1b0847..00000000000 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import logging -import random - -import numpy as np - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class ParquetReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._df = None - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - - {"category", "datetime64[ns]"} - - cudf.utils.dtypes.TIMEDELTA_TYPES - # TODO: Remove uint32 below after this bug is fixed - # https://github.com/pandas-dev/pandas/issues/37327 - - {"uint32"} - | {"list", "decimal64"} - ) - seed = random.randint(0, 2**32 - 1) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list, seed - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_cols"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - logging.info(f"Shape of DataFrame generated: {table.shape}") - - # TODO: Change this to write into - # a BytesIO object once below issue is fixed - # https://issues.apache.org/jira/browse/ARROW-10123 - - # file = io.BytesIO() - - df.to_parquet("temp_file") - # file.seek(0) - # self._current_buffer = copy.copy(file.read()) - # return self._current_buffer - self._df = df - return "temp_file" - - def write_data(self, file_name): - if self._current_buffer is not None: - with open(file_name + "_crash.parquet", "wb") as crash_dataset: - crash_dataset.write(self._current_buffer) - - def set_rand_params(self, params): - params_dict = {} - rng = np.random.default_rng(seed=None) - for param, values in params.items(): - if param == "columns" and values == ALL_POSSIBLE_VALUES: - col_size = self._rand(len(self._df.columns)) - params_dict[param] = list( - np.unique(rng.choice(self._df.columns, col_size)) - ) - else: - params_dict[param] = rng.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) - - -class ParquetWriter(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - seed = random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - - {"category", "timedelta64[ns]", "datetime64[ns]"} - # TODO: Remove uint32 below after 
this bug is fixed - # https://github.com/pandas-dev/pandas/issues/37327 - - {"uint32"} - | {"list", "decimal64"} - ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list, seed - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - - logging.info(f"Shape of DataFrame generated: {df.shape}") - self._current_buffer = df - return df - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_parquet(file_name + "_crash.parquet") diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_avro.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_avro.py deleted file mode 100644 index 5a90aec5828..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_avro.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import sys - -import cudf -from cudf._fuzz_testing.avro import AvroReader -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - compare_dataframe, - run_test, -) - - -@pythonfuzz( - data_handle=AvroReader, - params={ - "columns": ALL_POSSIBLE_VALUES, - "skiprows": ALL_POSSIBLE_VALUES, - "num_rows": ALL_POSSIBLE_VALUES, - }, -) -def avro_reader_test(input_tuple, columns, skiprows, num_rows): - pdf, parquet_buffer = input_tuple - expected_pdf = pdf[skiprows:] - if num_rows is not None: - expected_pdf = expected_pdf.head(num_rows) - if skiprows is not None or num_rows is not None: - expected_pdf = expected_pdf.reset_index(drop=True) - - gdf = cudf.read_avro( - parquet_buffer, columns=columns, skiprows=skiprows, num_rows=num_rows - ) - compare_dataframe(expected_pdf, gdf) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py deleted file mode 100644 index d90f3ea1aca..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import sys -from io import StringIO - -import pandas as pd - -import cudf -from cudf._fuzz_testing.csv import CSVReader, CSVWriter -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - compare_content, - run_test, -) -from cudf.testing import assert_eq - - -@pythonfuzz(data_handle=CSVReader) -def csv_reader_test(csv_buffer): - pdf = pd.read_csv(StringIO(csv_buffer)) - gdf = cudf.read_csv(StringIO(csv_buffer)) - - assert_eq(gdf, pdf) - - -@pythonfuzz(data_handle=CSVWriter) -def csv_writer_test(pdf): - gdf = cudf.from_pandas(pdf) - - pd_buffer = pdf.to_csv() - gd_buffer = gdf.to_csv() - - compare_content(pd_buffer, gd_buffer) - actual = cudf.read_csv(StringIO(gd_buffer)) - expected = pd.read_csv(StringIO(pd_buffer)) - assert_eq(actual, expected) - - -@pythonfuzz( - data_handle=CSVWriter, - params={ - "sep": list([",", "|", "\t", "\r", "~"]), - "header": [True, False], - "na_rep": [ - "", - "", - "NA", - "_NA_", - "__", - "<<<<>>>>>", - "--<>--", - "-+><+-", - ], - "columns": ALL_POSSIBLE_VALUES, - "index": [True, False], - "lineterminator": ["\n", "\r", "\r\n"], - "chunksize": ALL_POSSIBLE_VALUES, - }, -) -def csv_writer_test_params( - pdf, sep, header, na_rep, columns, index, lineterminator, chunksize -): - gdf = cudf.from_pandas(pdf) - - pd_buffer = pdf.to_csv( - sep=sep, - header=header, - na_rep=na_rep, - columns=columns, - index=index, - lineterminator=lineterminator, - chunksize=chunksize, - ) - gd_buffer = gdf.to_csv( - sep=sep, - header=header, - na_rep=na_rep, - columns=columns, - index=index, - lineterminator=lineterminator, - chunksize=chunksize, - ) - - # TODO: Uncomment once this issue is fixed - # https://github.com/rapidsai/cudf/issues/6418 - # compare_content(pd_buffer, gd_buffer) - - actual = cudf.read_csv( - StringIO(gd_buffer), - delimiter=sep, - na_values=na_rep, - lineterminator=lineterminator, - ) - expected = pd.read_csv( - StringIO(pd_buffer), - delimiter=sep, - na_values=na_rep, - lineterminator=lineterminator, - ) - if not header: - # TODO: Remove renaming columns once the following bug is fixed: - # https://github.com/rapidsai/cudf/issues/6418 - actual.columns = expected.columns - - assert_eq(actual, expected) - - -@pythonfuzz( - data_handle=CSVReader, - params={ - "dtype": ALL_POSSIBLE_VALUES, - "usecols": ALL_POSSIBLE_VALUES, - "header": ALL_POSSIBLE_VALUES, - "skiprows": ALL_POSSIBLE_VALUES, - "skipfooter": ALL_POSSIBLE_VALUES, - "nrows": ALL_POSSIBLE_VALUES, - }, -) -def csv_reader_test_params(csv_buffer, dtype, header, skiprows): - pdf = pd.read_csv( - StringIO(csv_buffer), dtype=dtype, header=header, skiprows=skiprows - ) - gdf = cudf.read_csv( - StringIO(csv_buffer), dtype=dtype, header=header, skiprows=skiprows - ) - - assert_eq(gdf, pdf) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py deleted file mode 100644 index 69e9437be93..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
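The removed `fuzz_test_csv.py` above round-trips frames through `to_csv`/`read_csv`; a minimal standalone sketch of the same pattern, using only public cudf APIs:

```python
# CSV round-trip sketch mirroring the removed fuzz tests: write with a custom
# separator and NA representation, then read back with matching options.
from io import StringIO

import cudf

gdf = cudf.DataFrame({"a": [1, None, 3]})
buf = gdf.to_csv(sep="|", na_rep="NA", index=False)
back = cudf.read_csv(StringIO(buf), sep="|", na_values=["NA"])
```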
- -import io -import sys - -import pandas as pd - -import cudf -from cudf._fuzz_testing.json import JSONReader, JSONWriter -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test -from cudf.testing import assert_eq - - -@pythonfuzz(data_handle=JSONReader) -def json_reader_test(json_buffer): - pdf = pd.read_json(io.StringIO(json_buffer), orient="records", lines=True) - # Difference in behaviour with pandas - # cudf reads column as strings only. - pdf.columns = pdf.columns.astype("str") - gdf = cudf.read_json(io.StringIO(json_buffer), engine="cudf", lines=True) - - assert_eq(gdf, pdf) - - -@pythonfuzz(data_handle=JSONReader, params={"dtype": ALL_POSSIBLE_VALUES}) -def json_reader_test_params(json_buffer, dtype): - pdf = pd.read_json(json_buffer, dtype=dtype, orient="records", lines=True) - pdf.columns = pdf.columns.astype("str") - - gdf = cudf.read_json(json_buffer, dtype=dtype, engine="cudf", lines=True) - - assert_eq(gdf, pdf) - - -@pythonfuzz(data_handle=JSONWriter) -def json_writer_test(pdf): - gdf = cudf.from_pandas(pdf) - - pdf_buffer = pdf.to_json(lines=True, orient="records") - gdf_buffer = gdf.to_json(lines=True, orient="records") - - # TODO: Uncomment once this is fixed: - # https://github.com/rapidsai/cudf/issues/6429 - # compare_content(pdf_buffer, gdf_buffer) - - actual = cudf.read_json( - gdf_buffer, engine="cudf", lines=True, orient="records" - ) - expected = pd.read_json(pdf_buffer, lines=True, orient="records") - expected.columns = expected.columns.astype("str") - assert_eq(actual, expected) - - -@pythonfuzz( - data_handle=JSONWriter, - params={ - "compression": ["gzip", "bz2", "zip", "xz", None], - "dtype": ALL_POSSIBLE_VALUES, - }, -) -def json_writer_test_params(pdf, compression, dtype): - gdf = cudf.from_pandas(pdf) - - pdf_buffer = pdf.to_json( - lines=True, orient="records", compression=compression - ) - gdf_buffer = gdf.to_json( - lines=True, orient="records", compression=compression - ) - - # TODO: Uncomment once this is fixed: - # https://github.com/rapidsai/cudf/issues/6429 - # compare_content(pdf_buffer, gdf_buffer) - - actual = cudf.read_json( - io.StringIO(gdf_buffer), - engine="cudf", - lines=True, - orient="records", - dtype=dtype, - ) - expected = pd.read_json( - io.StringIO(pdf_buffer), lines=True, orient="records", dtype=dtype - ) - - # Difference in behaviour with pandas - # cudf reads column as strings only. - expected.columns = expected.columns.astype("str") - assert_eq(actual, expected) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py deleted file mode 100644 index 977038d1fcb..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
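The removed `fuzz_test_json.py` above round-trips JSON-lines buffers through the cudf engine; a minimal sketch:

```python
# JSON-lines round-trip sketch mirroring the removed fuzz tests; note the
# tests cast pandas column labels to str because cudf reads them as strings.
import io

import cudf

gdf = cudf.DataFrame({"a": [1, 2], "b": ["x", "y"]})
buf = gdf.to_json(orient="records", lines=True)
back = cudf.read_json(
    io.StringIO(buf), engine="cudf", orient="records", lines=True
)
```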
- -import io -import sys - -import cudf -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.orc import OrcReader, OrcWriter -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - compare_dataframe, - orc_to_pandas, - run_test, -) - - -@pythonfuzz( - data_handle=OrcReader, - params={ - "columns": ALL_POSSIBLE_VALUES, - "skiprows": ALL_POSSIBLE_VALUES, - "num_rows": ALL_POSSIBLE_VALUES, - "use_index": ALL_POSSIBLE_VALUES, - }, -) -def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): - pdf, file_buffer = input_tuple - expected_pdf = pdf.iloc[skiprows:] - if num_rows is not None: - expected_pdf = expected_pdf.head(num_rows) - if skiprows is not None or num_rows is not None: - expected_pdf.reset_index(drop=True, inplace=True) - if columns is not None and len(columns) > 0: - # ORC reader picks columns if only - # there are any elements in `columns` - expected_pdf = expected_pdf[columns] - if use_index is False: - expected_pdf.reset_index(drop=True, inplace=True) - - gdf = cudf.read_orc( - io.BytesIO(file_buffer), - columns=columns, - skiprows=skiprows, - num_rows=num_rows, - use_index=use_index, - ) - - compare_dataframe(expected_pdf, gdf) - - -@pythonfuzz( - data_handle=OrcReader, - params={"columns": ALL_POSSIBLE_VALUES, "stripes": ALL_POSSIBLE_VALUES}, -) -def orc_reader_stripes_test(input_tuple, columns, stripes): - _, file_buffer = input_tuple - expected_pdf = orc_to_pandas( - file_io_obj=io.BytesIO(file_buffer), stripes=stripes - ) - - if columns is not None and len(columns) > 0: - # ORC reader picks columns if only - # there are any elements in `columns` - expected_pdf = expected_pdf[columns] - - gdf = cudf.read_orc( - io.BytesIO(file_buffer), columns=columns, stripes=stripes - ) - - compare_dataframe(expected_pdf, gdf) - - -@pythonfuzz( - data_handle=OrcWriter, - params={ - "compression": [None, "snappy"], - "enable_statistics": ["NONE", "STRIPE", "ROWGROUP"], - }, -) -def orc_writer_test(pdf, compression, enable_statistics): - file_to_strore = io.BytesIO() - - gdf = cudf.from_pandas(pdf) - - gdf.to_orc( - file_to_strore, - compression=compression, - enable_statistics=enable_statistics, - ) - file_to_strore.seek(0) - - actual_df = cudf.read_orc(file_to_strore) - - compare_dataframe(pdf, actual_df) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py deleted file mode 100644 index bbc19dce1a4..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
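Likewise, the removed `fuzz_test_orc.py` above round-trips through `to_orc`/`read_orc`; a minimal sketch (the old test's `enable_statistics` keyword is omitted here, since its spelling has varied across cudf releases):

```python
# ORC round-trip sketch mirroring the removed writer test.
import io

import cudf

gdf = cudf.DataFrame({"a": [1, 2, 3]})
buf = io.BytesIO()
gdf.to_orc(buf, compression="snappy")
buf.seek(0)
assert cudf.read_orc(buf)["a"].to_arrow().to_pylist() == [1, 2, 3]
```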
- -import sys - -import numpy as np -import pandas as pd - -import cudf -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.parquet import ParquetReader, ParquetWriter -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - compare_dataframe, - run_test, -) - - -@pythonfuzz(data_handle=ParquetReader) -def parquet_reader_test(parquet_buffer): - pdf = pd.read_parquet(parquet_buffer) - gdf = cudf.read_parquet(parquet_buffer) - - compare_dataframe(gdf, pdf) - - -@pythonfuzz( - data_handle=ParquetReader, - params={ - "columns": ALL_POSSIBLE_VALUES, - "use_pandas_metadata": [True, False], - }, -) -def parquet_reader_columns(parquet_buffer, columns, use_pandas_metadata): - pdf = pd.read_parquet( - parquet_buffer, - columns=columns, - use_pandas_metadata=use_pandas_metadata, - ) - - gdf = cudf.read_parquet( - parquet_buffer, - columns=columns, - use_pandas_metadata=use_pandas_metadata, - ) - - compare_dataframe(gdf, pdf) - - -@pythonfuzz(data_handle=ParquetWriter) -def parquet_writer_test(pdf): - pd_file_name = "cpu_pdf.parquet" - gd_file_name = "gpu_pdf.parquet" - - gdf = cudf.from_pandas(pdf) - - pdf.to_parquet(pd_file_name) - gdf.to_parquet(gd_file_name) - - actual = cudf.read_parquet(gd_file_name) - expected = pd.read_parquet(pd_file_name) - compare_dataframe(actual, expected) - - actual = cudf.read_parquet(pd_file_name) - expected = pd.read_parquet(gd_file_name) - compare_dataframe(actual, expected) - - -@pythonfuzz( - data_handle=ParquetWriter, - params={ - "row_group_size": np.random.default_rng(seed=0).integers( - 1, 10000, 100 - ), - "compression": ["snappy", None], - }, -) -def parquet_writer_test_rowgroup_index_compression( - pdf, compression, row_group_size -): - pd_file_name = "cpu_pdf.parquet" - gd_file_name = "gpu_pdf.parquet" - - gdf = cudf.from_pandas(pdf) - - pdf.to_parquet( - pd_file_name, - compression=compression, - row_group_size=row_group_size, - ) - gdf.to_parquet( - gd_file_name, - compression=compression, - row_group_size=row_group_size, - ) - - actual = cudf.read_parquet(gd_file_name) - expected = pd.read_parquet(pd_file_name) - compare_dataframe(actual, expected) - - actual = cudf.read_parquet(pd_file_name) - expected = pd.read_parquet(gd_file_name) - compare_dataframe(actual, expected, nullable=False) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/readme.md b/python/cudf/cudf/_fuzz_testing/tests/readme.md deleted file mode 100644 index f9ef1119a21..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/readme.md +++ /dev/null @@ -1,100 +0,0 @@ -# Fuzz Tests - -This directory contains all the Fuzz tests for cudf library. - - -## Steps to write a fuzz test - -1. Add a Data Handler class which actually generates the necessary random data according to your requirements. This class should be added in `cudf/cudf/testing/`. A sample data handler class is: `CSVWriter`: https://github.com/rapidsai/cudf/blob/branch-0.16/python/cudf/cudf/testing/csv.py -2. Data Handlers are registered by the `pythonfuzz` decorator. At runtime, the Fuzzer will continuously run registered fuzz tests. - -```python -from cudf.testing.csv import CSVWriter - -@pythonfuzz(data_handle=CSVWriter) -def csv_writer_test(data_from_generate_input): - ... - ... - ... - -if __name__ == "__main__": - ... - ... - -``` -## Steps to run fuzz tests - -1. 
To run a fuzz test, for example a test(method) is in `write_csv.py`: - -```bash -python write_csv.py your_function_name -``` - -To run a basic csv write test in `write_csv.py`: -```bash -python write_csv.py csv_writer_test -``` - -## Tips to run specific crash file/files - -Using the `pythonfuzz` decorator pass in `regression=True` with `dirs` having list of directories -```python -@pythonfuzz(data_handle=CSVWriter, regression=True, dir=["/cudf/python/cudf/cudf/_fuzz_testing"]) -``` - - -## Tips to run for varying parameter combinations - -In the `pythonfuzz` decorator you can pass in the function parameters you would like to pass to the -fuzz-test being written via `params` as a dictionary. The values in dictionary are sampled randomly -and passed to the `your_custom_fuzz_test`. - -If a parameter value depends the kind of input generated by the `data_handle`(in this case `CSVReader`), -then you can assign `ALL_POSSIBLE_VALUES` constant to it. This constant is used as an identifier by the -`data_handle` to generate random parameter values for that specific parameter purely based on data. -To perform this customization `set_rand_params` should be implemented as shown in the below example. -```python -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES -@pythonfuzz( - data_handle=CSVWriter, - params={ - "columns": ALL_POSSIBLE_VALUES, - "is_folder": [True, False, None], - "chunksize": ALL_POSSIBLE_VALUES, - }, -) -def your_custom_fuzz_test(data_from_data_handle, dtype, is_folder, header): - ... - ... - ... -``` - -A sample implementation of `set_rand_params` in a `data_handle` class: -``` -def set_rand_params(self, params): - params_dict = {} - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "columns": - col_size = self._rand(len(self._current_buffer.columns)) - params_dict[param] = list( - np.unique( - np.random.choice( - self._current_buffer.columns, col_size - ) - ) - ) - elif param == "chunksize": - params_dict[param] = np.random.choice( - [ - None, - np.random.randint( - low=1, high=max(1, len(self._current_buffer)) - ), - ] - ) - else: - params_dict[param] = np.random.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) -``` diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py deleted file mode 100644 index 4a8aea22184..00000000000 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
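One caveat in the removed readme above: its prose says `dirs` while its snippet passes `dir=`; the handler base class (`IOFuzz.__init__`, removed earlier in this patch) accepts `dirs`. A hypothetical replay sketch under that assumption:

```python
# Hypothetical regression-replay sketch based on the removed readme; the
# keyword is assumed to be `dirs` (a list of directories holding saved
# *_crash files), matching IOFuzz.__init__, and the path is a placeholder.
from cudf._fuzz_testing.csv import CSVWriter
from cudf._fuzz_testing.main import pythonfuzz


@pythonfuzz(
    data_handle=CSVWriter,
    regression=True,
    dirs=["/cudf/python/cudf/cudf/_fuzz_testing"],  # assumed crash-file dir
)
def csv_writer_regression_test(pdf):
    # Replayed crash inputs arrive here exactly like freshly generated ones.
    ...
```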
- -import random - -import fastavro -import numpy as np -import pandas as pd -import pyarrow as pa - -import cudf -from cudf.testing import assert_eq -from cudf.utils.dtypes import ( - pandas_dtypes_to_np_dtypes, - pyarrow_dtypes_to_pandas_dtypes, -) - -ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES" - -_PANDAS_TO_AVRO_SCHEMA_MAP = { - cudf.dtype("int8"): "int", - pd.Int8Dtype(): ["int", "null"], - pd.Int16Dtype(): ["int", "null"], - pd.Int32Dtype(): ["int", "null"], - pd.Int64Dtype(): ["long", "null"], - pd.Float32Dtype(): ["float", "null"], - pd.Float64Dtype(): ["double", "null"], - pd.BooleanDtype(): ["boolean", "null"], - pd.StringDtype(): ["string", "null"], - cudf.dtype("bool_"): "boolean", - cudf.dtype("int16"): "int", - cudf.dtype("int32"): "int", - cudf.dtype("int64"): "long", - cudf.dtype("O"): "string", - cudf.dtype("str"): "string", - cudf.dtype("float32"): "float", - cudf.dtype("float64"): "double", - cudf.dtype(" Date: Tue, 5 Aug 2025 13:58:15 -0700 Subject: [PATCH 062/366] Use more pytest fixtures and avoid GPU parameterization in test_csv/cuda_*/cut.py and more (#19463) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids pytest.mark.parametrize with GPU objects * Eliminate/reduce parameterizations of input size Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19463 --- python/cudf/cudf/tests/test_csv.py | 467 ++++++++---------- python/cudf/cudf/tests/test_cuda_apply.py | 36 +- .../cudf/cudf/tests/test_custom_accessor.py | 48 +- python/cudf/cudf/tests/test_cut.py | 102 ++-- python/cudf/cudf/tests/test_dataframe_copy.py | 48 +- 5 files changed, 289 insertions(+), 412 deletions(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index dbf00d7887e..9494d22a158 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -5,9 +5,7 @@ import os import re import shutil -from collections import OrderedDict from io import BytesIO, StringIO -from pathlib import Path import cupy as cp import numpy as np @@ -25,41 +23,36 @@ from cudf.testing._utils import assert_exceptions_equal, expect_warning_if -def make_numeric_dataframe(nrows, dtype): - df = pd.DataFrame() - df["col1"] = np.arange(nrows, dtype=dtype) - df["col2"] = np.arange(1, 1 + nrows, dtype=dtype) - return df - - -def make_datetime_dataframe(include_non_standard=False): - df = pd.DataFrame() - df["col1"] = np.array( - [ - "31/10/2010", - "05/03/2001", - "20/10/1994", - "18/10/1990", - "1/1/1970", - "2016-04-30T01:02:03.000", - "2038-01-19 03:14:07", - ] - ) - df["col2"] = np.array( - [ - "18/04/1995", - "14 / 07 / 1994", - "07/06/2006", - "16/09/2005", - "2/2/1970", - "2007-4-30 1:6:40.000PM", - "2038-01-19 03:14:08", - ] - ) - if include_non_standard: - # Last column contains non-standard date formats - df["col3"] = np.array( - [ +@pytest.fixture +def numeric_dataframe(): + return pd.DataFrame( + {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + ) + + +@pytest.fixture +def datetime_dataframe(): + return pd.DataFrame( + { + "col1": [ + "31/10/2010", + "05/03/2001", + "20/10/1994", + "18/10/1990", + "1/1/1970", + "2016-04-30T01:02:03.000", + "2038-01-19 03:14:07", + ], + "col2": [ + "18/04/1995", + "14 / 07 / 1994", + "07/06/2006", + "16/09/2005", + "2/2/1970", + "2007-4-30 1:6:40.000PM", + "2038-01-19 03:14:08", + ], + "col3": [ "1 Jan", "2 January 1994", "Feb 2002", @@ -67,38 +60,33 @@ def 
make_datetime_dataframe(include_non_standard=False): "1-1-1996", "15-May-2009", "21-Dec-3262", - ] - ) - return df - - -def make_numpy_mixed_dataframe(): - df = pd.DataFrame() - df["Integer"] = np.array([2345, 11987, 9027, 9027]) - df["Date"] = np.array( - ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"] + ], + } ) - df["Float"] = np.array([9.001, 8.343, 6, 2.781]) - df["Integer2"] = np.array([2345, 106, 2088, 789277]) - df["Category"] = np.array(["M", "F", "F", "F"]) - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df["Boolean"] = np.array([True, False, True, False]) - return df @pytest.fixture def pd_mixed_dataframe(): - return make_numpy_mixed_dataframe() + return pd.DataFrame( + { + "Integer": [2345, 11987, 9027, 9027], + "Date": ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"], + "Float": [9.001, 8.343, 6, 2.781], + "Integer2": [2345, 106, 2088, 789277], + "Category": ["M", "F", "F", "F"], + "String": ["Alpha", "Beta", "Gamma", "Delta"], + "Boolean": [True, False, True, False], + } + ) @pytest.fixture -def cudf_mixed_dataframe(): - return cudf.from_pandas(make_numpy_mixed_dataframe()) +def cudf_mixed_dataframe(pd_mixed_dataframe): + return cudf.from_pandas(pd_mixed_dataframe) -def make_all_numeric_dataframe(): - df = pd.DataFrame() - +@pytest.fixture +def gdf_np_dtypes(): gdf_dtypes = [ "float", "float32", @@ -134,29 +122,17 @@ def make_all_numeric_dataframe(): np.uint32, np.uint64, ] + return dict(zip(gdf_dtypes, np_dtypes)) - for i in range(len(gdf_dtypes)): - df[gdf_dtypes[i]] = np.arange(10, dtype=np_dtypes[i]) - return ( - df, - OrderedDict(zip(gdf_dtypes, gdf_dtypes)), - OrderedDict(zip(gdf_dtypes, np_dtypes)), - ) - - -def make_all_numeric_extremes_dataframe(): - # integers 0,+1,-1,min,max - # float 0.0, -0.0,+1,-1,min,max, nan, esp, espneg, tiny, [-ve values] - df, gdf_dtypes, pdf_dtypes = make_all_numeric_dataframe() - df = pd.DataFrame() - - for gdf_dtype in gdf_dtypes: - np_type = pdf_dtypes[gdf_dtype] +@pytest.fixture +def numeric_extremes_dataframe(gdf_np_dtypes): + data = {} + for typ, np_type in gdf_np_dtypes.items(): if np.dtype(np_type).kind in "iu": itype = np.iinfo(np_type) extremes = [0, +1, -1, itype.min, itype.max] - df[gdf_dtype] = np.array(extremes * 4).astype(np_type)[:20] + data[typ] = np.array(extremes * 4).astype(np_type)[:20] else: ftype = np.finfo(np_type) extremes = [ @@ -177,60 +153,21 @@ def make_all_numeric_extremes_dataframe(): -ftype.epsneg, -ftype.tiny, ] - df[gdf_dtype] = np.array(extremes * 4, dtype=np_type)[:20] - return ( - df, - gdf_dtypes, - pdf_dtypes, - ) - - -@pytest.fixture -def pandas_extreme_numeric_dataframe(): - return make_all_numeric_extremes_dataframe()[0] - - -@pytest.fixture -def cudf_extreme_numeric_dataframe(pandas_extreme_numeric_dataframe): - return cudf.from_pandas(pandas_extreme_numeric_dataframe) - - -@pytest.fixture -def path_or_buf(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_path_or_buf.csv") - df = make_numeric_dataframe(10, np.int32) - - df.to_csv(fname, index=False, header=False) - buffer = df.to_csv(index=False, header=False) - - def _make_path_or_buf(src): - if src == "filepath": - return str(fname) - if src == "pathobj": - return fname - if src == "bytes_io": - return BytesIO(buffer.encode()) - if src == "string_io": - return StringIO(buffer) - if src == "url": - return Path(fname).as_uri() - - raise ValueError("Invalid source type") + data[typ] = np.array(extremes * 4, dtype=np_type)[:20] + return pd.DataFrame(data) - yield _make_path_or_buf - -dtypes = 
[np.float64, np.float32, np.int64, np.int32, np.uint64, np.uint32] -dtypes_dict = {"1": np.float64, "2": np.float32, "3": np.int64, "4": np.int32} -nelem = [5, 25, 100] +@pytest.fixture( + params=[np.float64, np.float32, np.int64, np.int32, np.uint64, np.uint32] +) +def dtype(request): + return request.param -@pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("nelem", nelem) -def test_csv_reader_numeric_data(dtype, nelem, tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file1.csv") +def test_csv_reader_numeric_data(dtype, numeric_dataframe, tmp_path): + fname = tmp_path / "tmp_csvreader_file1.csv" - df = make_numeric_dataframe(nelem, dtype) + df = numeric_dataframe.astype(dtype) df.to_csv(fname, index=False, header=False) dtypes = [df[k].dtype for k in df.columns] @@ -241,8 +178,8 @@ def test_csv_reader_numeric_data(dtype, nelem, tmpdir): @pytest.mark.parametrize("parse_dates", [["date2"], [0], ["date1", 1, "bad"]]) -def test_csv_reader_datetime(parse_dates): - df = make_datetime_dataframe(include_non_standard=True) +def test_csv_reader_datetime(datetime_dataframe, parse_dates): + df = datetime_dataframe buffer = df.to_csv(index=False, header=False) gdf = read_csv( @@ -264,12 +201,14 @@ def test_csv_reader_datetime(parse_dates): assert_eq(gdf, pdf) -@pytest.mark.parametrize("pandas_arg", [{"delimiter": "|"}, {"sep": "|"}]) -@pytest.mark.parametrize("cudf_arg", [{"sep": "|"}, {"delimiter": "|"}]) +@pytest.mark.parametrize("p_arg", ["delimiter", "sep"]) +@pytest.mark.parametrize("c_arg", ["sep", "delimiter"]) def test_csv_reader_mixed_data_delimiter_sep( - tmpdir, pandas_arg, cudf_arg, pd_mixed_dataframe + tmp_path, p_arg, c_arg, pd_mixed_dataframe ): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file3.csv") + pandas_arg = {p_arg: "|"} + cudf_arg = {c_arg: "|"} + fname = tmp_path / "tmp_csvreader_file3.csv" pd_mixed_dataframe.to_csv(fname, sep="|", index=False, header=False) @@ -318,8 +257,8 @@ def test_csv_reader_mixed_data_delimiter_sep( @pytest.mark.parametrize("use_list", [False, True]) -def test_csv_reader_dtype_list(use_list): - df = make_numeric_dataframe(10, dtype=np.float32) +def test_csv_reader_dtype_list(numeric_dataframe, use_list): + df = numeric_dataframe.astype(np.float32) buffer = df.to_csv(index=False, header=False) # PANDAS doesn't list but cudf does (treated as implied ordered dict) @@ -335,31 +274,34 @@ def test_csv_reader_dtype_list(use_list): @pytest.mark.parametrize("use_names", [False, True]) -def test_csv_reader_dtype_dict(use_names): +def test_csv_reader_dtype_dict(use_names, gdf_np_dtypes): # Save with the column header if not explicitly specifying a list of names - df, gdf_dtypes, pdf_dtypes = make_all_numeric_dataframe() - buffer = df.to_csv(index=False, header=(not use_names)) + df = pd.DataFrame( + { + typ: np.zeros(3, dtype=np_type) + for typ, np_type in gdf_np_dtypes.items() + } + ) + buffer = df.to_csv(index=False, header=not use_names) dtypes = df.dtypes.to_dict() - gdf_names = list(gdf_dtypes.keys()) if use_names else None - pdf_names = list(pdf_dtypes.keys()) if use_names else None - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=gdf_names) - pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=pdf_names) + names = list(gdf_np_dtypes.keys()) if use_names else None + gdf = read_csv(StringIO(buffer), dtype=dtypes, names=names) + pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=names) assert_eq(gdf, pdf) @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") 
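Patch 062's conversion above replaces module-level dtype lists (and their per-test `parametrize` decorators) with a single parametrized fixture; a standalone illustration of that pytest pattern, with a hypothetical test not taken from the patch:

```python
import numpy as np
import pandas as pd
import pytest


# A parametrized fixture stands in for a module-level `dtypes` list: every
# test that requests `dtype` runs once per parameter, with no decorator of
# its own.
@pytest.fixture(params=[np.float64, np.float32, np.int64, np.int32])
def dtype(request):
    return request.param


def test_astype_roundtrip(dtype):  # hypothetical example test
    df = pd.DataFrame({"col1": [1, 2, 3]}).astype(dtype)
    assert df["col1"].dtype == dtype
```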
@pytest.mark.parametrize("use_names", [True, False]) -def test_csv_reader_dtype_extremes(use_names): +def test_csv_reader_dtype_extremes(use_names, numeric_extremes_dataframe): # Save with the column header if not explicitly specifying a list of names - df, gdf_dtypes, pdf_dtypes = make_all_numeric_extremes_dataframe() - buffer = df.to_csv(index=False, header=(not use_names)) + df = numeric_extremes_dataframe + buffer = df.to_csv(index=False, header=not use_names) dtypes = df.dtypes.to_dict() - gdf_names = list(gdf_dtypes.keys()) if use_names else None - pdf_names = list(pdf_dtypes.keys()) if use_names else None + names = df.columns.to_list() if use_names else None - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=gdf_names) - pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=pdf_names) + gdf = read_csv(StringIO(buffer), dtype=dtypes, names=names) + pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=names) assert_eq(gdf, pdf) @@ -368,8 +310,8 @@ def test_csv_reader_dtype_extremes(use_names): PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="https://github.com/pandas-dev/pandas/issues/52449", ) -def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file5.csv") +def test_csv_reader_skiprows_skipfooter(tmp_path, pd_mixed_dataframe): + fname = tmp_path / "tmp_csvreader_file5.csv" pd_mixed_dataframe.to_csv( fname, columns=["Integer", "Date", "Float"], index=False, header=False @@ -400,8 +342,8 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe): assert_eq(df_out, out, check_dtype=False) -def test_csv_reader_negative_vals(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file6.csv") +def test_csv_reader_negative_vals(tmp_path): + fname = tmp_path / "tmp_csvreader_file6.csv" names = ["0", "1", "2"] dtypes = ["float32", "float32", "float32"] @@ -425,8 +367,8 @@ def test_csv_reader_negative_vals(tmpdir): np.testing.assert_allclose(two, df["2"].to_numpy()) -def test_csv_reader_strings(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file7.csv") +def test_csv_reader_strings(tmp_path): + fname = tmp_path / "tmp_csvreader_file7.csv" names = ["text", "int"] dtypes = ["str", "int"] @@ -453,8 +395,8 @@ def test_csv_reader_strings(tmpdir): assert df["text"][3] == "d" -def test_csv_reader_strings_quotechars(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file8.csv") +def test_csv_reader_strings_quotechars(tmp_path): + fname = tmp_path / "tmp_csvreader_file8.csv" names = ["text", "int"] dtypes = ["str", "int"] @@ -481,8 +423,8 @@ def test_csv_reader_strings_quotechars(tmpdir): assert df["text"][3] == "f,,!.," -def test_csv_reader_usecols_int_char(tmpdir, pd_mixed_dataframe): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file10.csv") +def test_csv_reader_usecols_int_char(tmp_path, pd_mixed_dataframe): + fname = tmp_path / "tmp_csvreader_file10.csv" pd_mixed_dataframe.to_csv( fname, columns=["Integer", "Date", "Float", "Integer2"], @@ -520,8 +462,8 @@ def test_csv_reader_mangle_dupe_cols(tmpdir, buffer, mangle_dupe_cols): assert_eq(cu_df, pd_df) -def test_csv_reader_float_decimal(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv") +def test_csv_reader_float_decimal(tmp_path): + fname = tmp_path / "tmp_csvreader_file12.csv" names = ["basic_32", "basic_64", "round", "decimal_only", "precision"] dtypes = ["float32", "float64", "float64", "float32", "float64"] @@ -623,8 +565,8 @@ def test_csv_reader_NaN_values(): assert 
gdf.dtypes.iloc[0] == np.dtype("object") -def test_csv_reader_thousands(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file13.csv") +def test_csv_reader_thousands(tmp_path): + fname = tmp_path / "tmp_csvreader_file13.csv" names = dtypes = [ "float32", @@ -748,8 +690,8 @@ def test_csv_reader_compression( (["x", "y"], None, "True,1\nFalse,0", None, None), ], ) -def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file11.csv") +def test_csv_reader_bools(tmp_path, names, dtypes, data, trues, falses): + fname = tmp_path / "tmp_csvreader_file11.csv" lines = [",".join(names), data] @@ -852,8 +794,8 @@ def test_csv_reader_bools_NA(): assert_eq(df, expected) -def test_csv_quotednumbers(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv") +def test_csv_quotednumbers(tmp_path): + fname = tmp_path / "tmp_csvreader_file12.csv" names = ["integer", "decimal"] dtypes = ["int32", "float32"] @@ -881,8 +823,8 @@ def test_csv_quotednumbers(tmpdir): np.testing.assert_allclose(decimal_ref, df2["decimal"].to_numpy()) -def test_csv_reader_nrows(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file14.csv") +def test_csv_reader_nrows(tmp_path): + fname = tmp_path / "tmp_csvreader_file14.csv" names = ["int1", "int2"] dtypes = ["int32", "int32"] @@ -946,10 +888,9 @@ def test_csv_reader_nrows(tmpdir): read_csv(str(fname), nrows=read_rows, skipfooter=1) -def test_csv_reader_gzip_compression_strings(tmpdir): - fnamebase = tmpdir.mkdir("gdf_csv") - fname = fnamebase.join("tmp_csvreader_file15.csv") - fnamez = fnamebase.join("tmp_csvreader_file15.csv.gz") +def test_csv_reader_gzip_compression_strings(tmp_path): + fname = tmp_path / "tmp_csvreader_file15.csv" + fnamez = tmp_path / "tmp_csvreader_file15.csv.gz" names = ["text", "int"] dtypes = ["str", "int"] @@ -980,7 +921,7 @@ def test_csv_reader_gzip_compression_strings(tmpdir): assert df["text"][3] == "d" -@pytest.mark.parametrize("skip_rows", [0, 2, 4]) +@pytest.mark.parametrize("skip_rows", [0, 4]) @pytest.mark.parametrize("header_row", [0, 2]) def test_csv_reader_skiprows_header(skip_rows, header_row): names = ["float_point", "integer"] @@ -1074,16 +1015,27 @@ def test_csv_reader_filenotfound(tmpdir): @pytest.mark.parametrize( - "src", ["filepath", "pathobj", "bytes_io", "string_io", "url"] + "src", + [ + lambda path: str(path), + lambda path: path, + lambda path: BytesIO(path.read_bytes()), + lambda path: StringIO(path.read_text()), + lambda path: path.as_uri(), + ], + ids=["filepath", "pathlib.Path", "ByteIO", "StringIO", "url"], ) -def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src): - expect = pd.read_csv(path_or_buf("filepath")) - got = cudf.read_csv(path_or_buf(src)) +def test_csv_reader_filepath_or_buffer(tmp_path, src): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=np.int32) + csv_path = tmp_path / "tmp.csv" + df.to_csv(csv_path, index=False, header=False) + expect = pd.read_csv(csv_path) + got = cudf.read_csv(src(csv_path)) assert_eq(expect, got) -def test_small_zip(tmpdir): +def test_small_zip(tmp_path): df = pd.DataFrame( { "a": [1997] * 2, @@ -1092,14 +1044,14 @@ def test_small_zip(tmpdir): } ) - fname = tmpdir.join("small_zip_file.zip") + fname = tmp_path / "small_zip_file.zip" df.to_csv(fname, index=False) got = cudf.read_csv(fname) assert_eq(df, got) -def test_csv_reader_carriage_return(tmpdir): +def test_csv_reader_carriage_return(): rows = 100 names = ["int_row", "int_double_row"] buffer = 
",".join(names) + "\r\n" @@ -1145,8 +1097,8 @@ def test_csv_reader_tabs(): @pytest.mark.parametrize("segment_bytes", [10000, 19999, 30001, 36000]) -def test_csv_reader_byte_range(tmpdir, segment_bytes): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file16.csv") +def test_csv_reader_byte_range(tmp_path, segment_bytes): + fname = tmp_path / "tmp_csvreader_file16.csv" names = ["int1", "int2"] @@ -1173,8 +1125,8 @@ def test_csv_reader_byte_range(tmpdir, segment_bytes): assert list(df["int2"]) == list(ref_df["int2"]) -def test_csv_reader_byte_range_type_corner_case(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") +def test_csv_reader_byte_range_type_corner_case(tmp_path): + fname = tmp_path / "tmp_csvreader_file17.csv" cudf.datasets.timeseries( start="2000-01-01", @@ -1396,7 +1348,7 @@ def test_csv_reader_repeated_column_name(): assert_eq(pdf.columns, gdf.columns) -def test_csv_reader_bools_false_positives(tmpdir): +def test_csv_reader_bools_false_positives(): # values that are equal to ["True", "TRUE", "False", "FALSE"] # when using ints to detect bool values items = [3977, 4329, 24015, 27567] @@ -1408,8 +1360,8 @@ def test_csv_reader_bools_false_positives(tmpdir): np.testing.assert_array_equal(items, df["0"].to_numpy()) -def test_csv_reader_aligned_byte_range(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file19.csv") +def test_csv_reader_aligned_byte_range(tmp_path): + fname = tmp_path / "tmp_csvreader_file19.csv" nelem = 1000 input_df = pd.DataFrame( @@ -1560,8 +1512,8 @@ def test_csv_blank_first_row(lineterminator): @pytest.mark.parametrize("contents", ["", "\n"]) -def test_csv_empty_file(tmpdir, contents): - fname = tmpdir.mkdir("gdf_csv").join("test_csv_empty_file.csv") +def test_csv_empty_file(tmp_path, contents): + fname = tmp_path / "test_csv_empty_file.csv" with open(fname, "w") as f: f.write(contents) @@ -1580,7 +1532,7 @@ def test_csv_empty_file(tmpdir, contents): @pytest.mark.parametrize("contents", ["", "\n"]) -def test_csv_empty_buffer(tmpdir, contents): +def test_csv_empty_buffer(contents): col_names = ["col1", "col2", "col3", "col4"] in_dtypes = ["int", "str", "float", "short"] out_dtypes = ["int64", "object", "float64", "int16"] @@ -1613,11 +1565,11 @@ def test_csv_reader_partial_dtype(dtype): assert all(names_df.dtypes == ["int16", "int64"]) -def test_csv_writer_file_handle(tmpdir): +def test_csv_writer_file_handle(tmp_path): df = pd.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) gdf = cudf.from_pandas(df) - gdf_df_fname = tmpdir.join("gdf_df_1.csv") + gdf_df_fname = tmp_path / "gdf_df_1.csv" with open(gdf_df_fname, "w") as f: gdf.to_csv(path_or_buf=f, index=False) assert os.path.exists(gdf_df_fname) @@ -1626,11 +1578,11 @@ def test_csv_writer_file_handle(tmpdir): assert_eq(gdf, gdf2) -def test_csv_writer_file_append(tmpdir): +def test_csv_writer_file_append(tmp_path): gdf1 = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) gdf2 = cudf.DataFrame({"a": [4, 5, 6], "b": ["foo", "bar", "baz"]}) - gdf_df_fname = tmpdir.join("gdf_df_append.csv") + gdf_df_fname = tmp_path / "gdf_df_append.csv" with open(gdf_df_fname, "w") as f: gdf1.to_csv(f, index=False) with open(gdf_df_fname, "a") as f: @@ -1641,7 +1593,7 @@ def test_csv_writer_file_append(tmpdir): assert_eq(result, expected, check_index_type=True) -def test_csv_writer_buffer(tmpdir): +def test_csv_writer_buffer(): gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) buffer = BytesIO() @@ -1651,13 +1603,11 @@ def 
test_csv_writer_buffer(tmpdir): assert_eq(result, gdf) -@pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("nelem", nelem) -def test_csv_writer_numeric_data(dtype, nelem, tmpdir): - pdf_df_fname = tmpdir.join("pdf_df_1.csv") - gdf_df_fname = tmpdir.join("gdf_df_1.csv") +def test_csv_writer_numeric_data(dtype, numeric_dataframe, tmp_path): + pdf_df_fname = tmp_path / "pdf_df_1.csv" + gdf_df_fname = tmp_path / "gdf_df_1.csv" - df = make_numeric_dataframe(nelem, dtype) + df = numeric_dataframe.astype(dtype) gdf = cudf.from_pandas(df) df.to_csv(path_or_buf=pdf_df_fname, index=False, lineterminator="\n") gdf.to_csv(path_or_buf=gdf_df_fname, index=False) @@ -1670,11 +1620,11 @@ def test_csv_writer_numeric_data(dtype, nelem, tmpdir): assert_eq(expect, got) -def test_csv_writer_datetime_data(tmpdir): - pdf_df_fname = tmpdir.join("pdf_df_2.csv") - gdf_df_fname = tmpdir.join("gdf_df_2.csv") +def test_csv_writer_datetime_data(datetime_dataframe, tmp_path): + pdf_df_fname = tmp_path / "pdf_df_2.csv" + gdf_df_fname = tmp_path / "gdf_df_2.csv" - df = make_datetime_dataframe() + df = datetime_dataframe gdf = cudf.from_pandas(df) df.to_csv(path_or_buf=pdf_df_fname, index=False, lineterminator="\n") gdf.to_csv(path_or_buf=gdf_df_fname, index=False) @@ -1727,15 +1677,14 @@ def test_csv_writer_multichar_terminator(lineterminator, cudf_mixed_dataframe): None, ], ) -@pytest.mark.parametrize( - "header", [True, False, np.bool_(True), np.bool_(False)] -) -@pytest.mark.parametrize( - "index", [True, False, np.bool_(True), np.bool_(False)] -) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("bool_box", [bool, np.bool_]) def test_csv_writer_column_and_header_options( - columns, header, index, pd_mixed_dataframe + columns, header, index, bool_box, pd_mixed_dataframe ): + header = bool_box(header) + index = bool_box(index) pdf = pd_mixed_dataframe df = cudf.from_pandas(pdf) @@ -1755,14 +1704,13 @@ def test_csv_writer_column_and_header_options( def test_csv_writer_empty_columns_parameter(cudf_mixed_dataframe): - df = cudf_mixed_dataframe - write_str = df.to_csv(columns=[], index=False) + write_str = cudf_mixed_dataframe.to_csv(columns=[], index=False) assert_eq(write_str, "\n") -def test_csv_writer_multiindex(tmpdir): - pdf_df_fname = tmpdir.join("pdf_df_3.csv") - gdf_df_fname = tmpdir.join("gdf_df_3.csv") +def test_csv_writer_multiindex(tmp_path): + pdf_df_fname = tmp_path / "pdf_df_3.csv" + gdf_df_fname = tmp_path / "gdf_df_3.csv" rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( @@ -1786,10 +1734,9 @@ def test_csv_writer_multiindex(tmpdir): assert_eq(expect, got) -@pytest.mark.parametrize("chunksize", [None, 9, 1000]) -@pytest.mark.parametrize("dtype", dtypes) -def test_csv_writer_chunksize(chunksize, dtype): - cu_df = cudf.from_pandas(make_numeric_dataframe(100, dtype)) +@pytest.mark.parametrize("chunksize", [None, 2, 1000]) +def test_csv_writer_chunksize(chunksize, numeric_dataframe, dtype): + cu_df = cudf.from_pandas(numeric_dataframe.astype(dtype)) buffer = BytesIO() cu_df.to_csv(buffer, chunksize=chunksize, index=False) @@ -1799,18 +1746,15 @@ def test_csv_writer_chunksize(chunksize, dtype): @pytest.mark.parametrize( - "df", + "data", [ - cudf.DataFrame({"vals": [1, 2, 3]}), - cudf.DataFrame( - {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]} - ), - cudf.DataFrame( - {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]} - ), + {"vals": [1, 2, 3]}, + {"vals1": [1, 2, 3], "vals2": ["hello", 
"rapids", "cudf"]}, + {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}, ], ) -def test_to_csv_empty_filename(df): +def test_to_csv_empty_filename(data): + df = cudf.DataFrame(data) pdf = df.to_pandas() actual = df.to_csv() @@ -1820,18 +1764,15 @@ def test_to_csv_empty_filename(df): @pytest.mark.parametrize( - "df", + "data", [ - cudf.DataFrame({"vals": [1, 2, 3]}), - cudf.DataFrame( - {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]} - ), - cudf.DataFrame( - {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]} - ), + {"vals": [1, 2, 3]}, + {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]}, + {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}, ], ) -def test_to_csv_StringIO(df): +def test_to_csv_StringIO(data): + df = cudf.DataFrame(data) cudf_io = StringIO() pandas_io = StringIO() @@ -1846,8 +1787,8 @@ def test_to_csv_StringIO(df): assert cudf_io.read() == pandas_io.read() -def test_csv_writer_empty_dataframe(tmpdir): - df_fname = tmpdir.join("gdf_df_5.csv") +def test_csv_writer_empty_dataframe(tmp_path): + df_fname = tmp_path / "gdf_df_5.csv" gdf = cudf.DataFrame({"float_point": [], "integer": []}) gdf["float_point"] = gdf["float_point"].astype("float") gdf["integer"] = gdf["integer"].astype("int") @@ -1860,12 +1801,12 @@ def test_csv_writer_empty_dataframe(tmpdir): assert all(df.dtypes == ["object", "object"]) -def test_csv_write_chunksize_corner_case(tmpdir): +def test_csv_write_chunksize_corner_case(tmp_path): # With this num of rows and chunksize # libcudf splits table such a way that it # will end up creating an empty table slice # which caused the issue 5588. - df_fname = tmpdir.join("gdf_df_17.csv") + df_fname = tmp_path / "gdf_df_17.csv" df = cudf.DataFrame({"a": np.arange(10_000)}) df.to_csv(df_fname, chunksize=1000, index=False) got = cudf.read_csv(df_fname) @@ -1881,35 +1822,30 @@ def test_csv_write_no_caller_manipulation(): @pytest.mark.parametrize( - "df", + "pdf", [ - cudf.DataFrame({"a": [1, 2, 3], "": [10, 20, 40]}), - cudf.DataFrame({"": [10, 20, 40], "a": [1, 2, 3]}), - cudf.DataFrame( + pd.DataFrame({"a": [1, 2, 3], "": [10, 20, 40]}), + pd.DataFrame({"": [10, 20, 40], "a": [1, 2, 3]}), + pd.DataFrame( {"a": [1, 2, 3], "": [10, 20, 40]}, - index=cudf.Index(["a", "z", "v"], name="custom name"), + index=pd.Index(["a", "z", "v"], name="custom name"), ), ], ) @pytest.mark.parametrize("index", [True, False]) @pytest.mark.parametrize("columns", [["a"], [""], None]) -def test_csv_write_empty_column_name(df, index, columns): - pdf = df.to_pandas() +def test_csv_write_empty_column_name(pdf, index, columns): + df = cudf.DataFrame.from_pandas(pdf) expected = pdf.to_csv(index=index, columns=columns) actual = df.to_csv(index=index, columns=columns) assert expected == actual -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame(), - cudf.DataFrame(index=cudf.Index([], name="index name")), - ], -) +@pytest.mark.parametrize("idx", [None, pd.Index([], name="index name")]) @pytest.mark.parametrize("index", [True, False]) -def test_csv_write_empty_dataframe(df, index): +def test_csv_write_empty_dataframe(idx, index): + df = cudf.DataFrame(index=idx) pdf = df.to_pandas() expected = pdf.to_csv(index=index) @@ -2016,13 +1952,13 @@ def test_csv_reader_datetime_dtypes(dtype): @pytest.mark.parametrize( "df", [ - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": cudf.Series([1, 2, 3, 1, 2], dtype="category"), "b": cudf.Series(["a", "c", "a", "b", "a"], dtype="category"), } ), - cudf.DataFrame( + lambda: cudf.DataFrame( { "a": 
cudf.Series([1.1, 2, 3, 1.1, 2], dtype="category"), "b": cudf.Series( @@ -2030,7 +1966,7 @@ def test_csv_reader_datetime_dtypes(dtype): ), } ), - cudf.DataFrame( + lambda: cudf.DataFrame( { "b": cudf.Series( [1.1, 2, 3, 1.1, 2], @@ -2044,6 +1980,7 @@ def test_csv_reader_datetime_dtypes(dtype): ], ) def test_csv_writer_category(df): + df = df() pdf = df.to_pandas() expected = pdf.to_csv() @@ -2190,12 +2127,12 @@ def test_default_integer_bitwidth_partial( @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") def test_default_integer_bitwidth_extremes( - cudf_extreme_numeric_dataframe, default_integer_bitwidth + numeric_extremes_dataframe, default_integer_bitwidth ): # Test that integer columns in csv are _inferred_ as user specified # bitwidth buf = BytesIO() - cudf_extreme_numeric_dataframe.to_csv(buf) + cudf.DataFrame.from_pandas(numeric_extremes_dataframe).to_csv(buf) buf.seek(0) read = cudf.read_csv(buf) @@ -2264,10 +2201,10 @@ def test_column_selection_plus_column_names(usecols, names): ) -def test_read_compressed_BOM(tmpdir): +def test_read_compressed_BOM(tmp_path): buffer = 'int, string\n1, "a"\n2, "b"\n3, "c"\n' - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file20.gz") + fname = tmp_path / "tmp_csvreader_file20.gz" with gzip.open(fname, "wt", encoding="utf-8") as f: f.write(codecs.BOM_UTF8.decode("utf-8")) f.write(buffer) diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index d22d33e2957..80c794cb0f4 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -67,8 +67,9 @@ def test_dataframe_apply_rows(dtype, has_nulls, pessimistic): assert_eq(df_expected, df_actual) -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) -def test_df_apply_rows(nelem): +def test_df_apply_rows(): + nelem = 20 + def kernel(in1, in2, in3, out1, out2, extra1, extra2): for i, (x, y, z) in enumerate(zip(in1, in2, in3)): out1[i] = extra2 * x - extra1 * y @@ -100,9 +101,10 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): np.testing.assert_array_almost_equal(got_out2, expect_out2) -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) -@pytest.mark.parametrize("chunksize", [1, 2, 3, 4, 23]) -def test_df_apply_chunks(nelem, chunksize): +@pytest.mark.parametrize("chunksize", [1, 4, 23]) +def test_df_apply_chunks(chunksize): + nelem = 20 + def kernel(in1, in2, in3, out1, out2, extra1, extra2): for i, (x, y, z) in enumerate(zip(in1, in2, in3)): out1[i] = extra2 * x - extra1 * y + z @@ -134,8 +136,9 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) -@pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) -def test_df_apply_custom_chunks(nelem): +def test_df_apply_custom_chunks(): + nelem = 20 + def kernel(in1, in2, in3, out1, out2, extra1, extra2): for i, (x, y, z) in enumerate(zip(in1, in2, in3)): out1[i] = extra2 * x - extra1 * y + z @@ -172,10 +175,11 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) -@pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) @pytest.mark.parametrize("blkct", [None, 1, 8]) -@pytest.mark.parametrize("tpb", [1, 8, 64]) -def test_df_apply_custom_chunks_blkct_tpb(nelem, blkct, tpb): +@pytest.mark.parametrize("tpb", [1, 8]) +def test_df_apply_custom_chunks_blkct_tpb(blkct, tpb): + nelem = 20 + def kernel(in1, in2, in3, out1, out2, extra1, extra2): for i in range(cuda.threadIdx.x, 
in1.size, cuda.blockDim.x): x = in1[i] @@ -220,8 +224,9 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 1000, 5000]) -def test_df_apply_rows_incols_mapping(nelem): +def test_df_apply_rows_incols_mapping(): + nelem = 20 + def kernel(x, y, z, out1, out2, extra1, extra2): for i, (a, b, c) in enumerate(zip(x, y, z)): out1[i] = extra2 * a - extra1 * b @@ -250,9 +255,10 @@ def kernel(x, y, z, out1, out2, extra1, extra2): assert_eq(outdf[["out1", "out2"]], expected_out) -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) -@pytest.mark.parametrize("chunksize", [1, 2, 3, 4, 23]) -def test_df_apply_chunks_incols_mapping(nelem, chunksize): +@pytest.mark.parametrize("chunksize", [1, 4, 23]) +def test_df_apply_chunks_incols_mapping(chunksize): + nelem = 20 + def kernel(q, p, r, out1, out2, extra1, extra2): for i, (a, b, c) in enumerate(zip(q, p, r)): out1[i] = extra2 * a - extra1 * b + c diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py index 278e63f3e8b..f45b85e9c8f 100644 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ b/python/cudf/cudf/tests/test_custom_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import pandas as pd import pytest @@ -28,26 +28,20 @@ def bounding_box(self): return (min_x, min_y, max_x, max_y) -@pytest.mark.parametrize( - "gdf", [cudf.datasets.randomdata(nrows=6, dtypes={"x": int, "y": int})] -) -def test_dataframe_accessor(gdf): +def test_dataframe_accessor(): + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) pdf = gdf.to_pandas() assert_eq(gdf.point.bounding_box, pdf.point.bounding_box) -@pytest.mark.parametrize( - "gdf1", [cudf.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] -) -@pytest.mark.parametrize( - "gdf2", [cudf.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] -) -def test_dataframe_accessor_idendity(gdf1, gdf2): +def test_dataframe_accessor_identity(): """Test for accessor identities - An object should hold persistent reference to the same accessor - Different objects should hold difference instances of the accessor """ + gdf1 = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + gdf2 = gdf1.copy() assert gdf1.point is gdf1.point assert gdf1.point is not gdf2.point @@ -65,28 +59,18 @@ def __getitem__(self, i): return self._obj[2 * i - 1] -@pytest.mark.parametrize("gidx", [cudf.Index(list(range(0, 50)))]) -def test_index_accessor(gidx): - pidx = gidx.to_pandas() +@pytest.mark.parametrize("klass", [cudf.Index, cudf.Series]) +def test_index_series_accessor(klass): + obj = klass([1, 2, 3]) + pobj = obj.to_pandas() + assert_eq(obj.odd[1], pobj.odd[1]) - for i in range(1, 10): - assert_eq(gidx.odd[i], pidx.odd[i]) - -@pytest.mark.parametrize("gs", [cudf.Series(list(range(1, 50)))]) -def test_series_accessor(gs): - ps = gs.to_pandas() - - for i in range(1, 10): - assert_eq(gs.odd[i], ps.odd[i]) - - -@pytest.mark.parametrize( - "gdf", [cudf.datasets.randomdata(nrows=6, dtypes={"x": int, "y": int})] -) -@pytest.mark.parametrize("gidx", [cudf.Index(list(range(1, 50)))]) -@pytest.mark.parametrize("gs", [cudf.Series(list(range(1, 50)))]) -def test_accessor_space_separate(gdf, gidx, gs): +def test_accessor_space_separate(): + data = [1, 2, 3] + gdf = cudf.DataFrame(data) + gidx = cudf.Index(data) + gs = cudf.Series(data) assert not id(gdf._accessors) == id(gidx._accessors) 
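The accessor tests above rely on a registration step defined earlier in `test_custom_accessor.py`; a sketch of cudf's pandas-style extension hook, reusing the tests' `point` accessor name:

```python
# Registering a "point" DataFrame accessor via cudf's extension API; the
# bounding-box body is a sketch consistent with the tests above.
import cudf


@cudf.api.extensions.register_dataframe_accessor("point")
class PointsAccessor:
    def __init__(self, obj):
        self._obj = obj

    @property
    def bounding_box(self):
        xs, ys = self._obj["x"], self._obj["y"]
        return (xs.min(), ys.min(), xs.max(), ys.max())


df = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
assert df.point.bounding_box == (1, 4, 3, 6)
```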
assert not id(gidx._accessors) == id(gs._accessors) assert not id(gdf._accessors) == id(gs._accessors) diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py index 3f31da035aa..3fc05599976 100644 --- a/python/cudf/cudf/tests/test_cut.py +++ b/python/cudf/cudf/tests/test_cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. """ Test related to Cut @@ -12,19 +12,16 @@ from cudf.testing import assert_eq -@pytest.mark.parametrize( - "x", [[1, 7, 5, 4, 6, 3], [1, 7], np.array([1, 7, 5, 4, 6, 3])] -) +@pytest.mark.parametrize("box", [list, np.array]) @pytest.mark.parametrize("bins", [1, 2, 3]) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize( - "ordered", [True] -) # if ordered is False we need labels -@pytest.mark.parametrize("precision", [1, 2, 3]) -def test_cut_basic(x, bins, right, include_lowest, ordered, precision): +@pytest.mark.parametrize("precision", [1, 3]) +def test_cut_basic(box, bins, right, include_lowest, precision): # will test optional labels, retbins and duplicates separately # they need more specific parameters to work + x = box([1, 7, 5, 4, 6, 3]) + ordered = True pcat = pd.cut( x=x, bins=bins, @@ -46,20 +43,16 @@ def test_cut_basic(x, bins, right, include_lowest, ordered, precision): assert_eq(pindex, gindex) -@pytest.mark.parametrize("x", [[1, 7, 5, 4, 6, 3]]) -@pytest.mark.parametrize("bins", [3]) # labels must be the same len as bins @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) +@pytest.mark.parametrize("ordered", [True, False]) +@pytest.mark.parametrize("precision", [1, 3]) @pytest.mark.parametrize( - "ordered", [True, False] -) # labels must be unique if ordered=True -@pytest.mark.parametrize("precision", [1, 2, 3]) -@pytest.mark.parametrize( - "labels", [["bad", "medium", "good"], ["A", "B", "C"], [1, 2, 3], False] + "labels", [["bad", "medium", "good"], [1, 2, 3], False] ) -def test_cut_labels( - x, bins, right, include_lowest, ordered, precision, labels -): +def test_cut_labels(right, include_lowest, ordered, precision, labels): + x = [1, 7, 5, 4, 6, 3] + bins = 3 pcat = pd.cut( x=x, bins=bins, @@ -83,20 +76,14 @@ def test_cut_labels( assert_eq(pindex, gindex) -@pytest.mark.parametrize("x", [[1, 7, 5, 4, 6, 3]]) -@pytest.mark.parametrize("bins", [3]) # labels must be the same len as bins @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize( - "ordered", [False] -) # labels must be unique if ordered=True -@pytest.mark.parametrize("precision", [1, 2, 3]) -@pytest.mark.parametrize( - "labels", [["bad", "good", "good"], ["B", "A", "B"], [1, 2, 2], False] -) -def test_cut_labels_non_unique( - x, bins, right, include_lowest, ordered, precision, labels -): +@pytest.mark.parametrize("precision", [1, 3]) +@pytest.mark.parametrize("labels", [["bad", "good", "good"], [1, 2, 2], False]) +def test_cut_labels_non_unique(right, include_lowest, precision, labels): + x = [1, 7, 5, 4, 6, 3] + bins = 3 + ordered = False pcat = pd.cut( x=x, bins=bins, @@ -134,8 +121,8 @@ def test_cut_labels_non_unique( [1, 2, 3, [1, 2, 3], [0, 2, 4, 6, 10]], ) @pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("precision", [3]) -def test_cut_right(x, bins, right, precision): +def test_cut_right(x, bins, right): + precision = 3 pcat = pd.cut( x=x, bins=bins, @@ 
-168,12 +155,10 @@ def test_cut_right(x, bins, right, precision): ) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize("ordered", [True]) -@pytest.mark.parametrize("precision", [1, 2, 3]) -@pytest.mark.parametrize("duplicates", ["drop"]) -def test_cut_drop_duplicates( - x, bins, right, precision, duplicates, ordered, include_lowest -): +@pytest.mark.parametrize("precision", [1, 3]) +def test_cut_drop_duplicates(x, bins, right, precision, include_lowest): + ordered = True + duplicates = "drop" pcat = pd.cut( x=x, bins=bins, @@ -212,13 +197,12 @@ def test_cut_drop_duplicates( ) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize("ordered", [True]) -@pytest.mark.parametrize("precision", [1, 2, 3]) -@pytest.mark.parametrize("duplicates", ["raises"]) -def test_cut_drop_duplicates_raises( - x, bins, right, precision, duplicates, ordered, include_lowest -): - with pytest.raises(ValueError) as excgd: +@pytest.mark.parametrize("precision", [1, 3]) +def test_cut_drop_duplicates_raises(x, bins, right, precision, include_lowest): + ordered = True + duplicates = "raise" + msg = "Bin edges must be unique" + with pytest.raises(ValueError, match=msg): cut( x=x, bins=bins, @@ -228,7 +212,7 @@ def test_cut_drop_duplicates_raises( include_lowest=include_lowest, ordered=ordered, ) - with pytest.raises(ValueError) as excpd: + with pytest.raises(ValueError, match=msg): pd.cut( x=x, bins=bins, @@ -239,8 +223,6 @@ def test_cut_drop_duplicates_raises( ordered=ordered, ) - assert_eq(str(excgd.value), str(excpd.value)) - @pytest.mark.parametrize( "x", @@ -252,14 +234,11 @@ def test_cut_drop_duplicates_raises( np.array([2, 4, 6, 8, 10]), ], ) -@pytest.mark.parametrize( - "bins", - [pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])], -) @pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("precision", [1, 2, 3]) +@pytest.mark.parametrize("precision", [1, 3]) @pytest.mark.parametrize("duplicates", ["drop", "raise"]) -def test_cut_intervalindex_bin(x, bins, right, precision, duplicates): +def test_cut_intervalindex_bin(x, right, precision, duplicates): + bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) pcat = pd.cut( x=x, bins=bins, @@ -279,16 +258,13 @@ def test_cut_intervalindex_bin(x, bins, right, precision, duplicates): assert_eq(pindex, gindex) -@pytest.mark.parametrize( - "x", - [pd.Series(np.array([2, 4, 6, 8, 10]), index=["a", "b", "c", "d", "e"])], -) -@pytest.mark.parametrize("bins", [1, 2, 3]) +@pytest.mark.parametrize("bins", [1, 3]) @pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize("ordered", [True]) -@pytest.mark.parametrize("precision", [3]) -def test_cut_series(x, bins, right, include_lowest, ordered, precision): +def test_cut_series(bins, right, include_lowest): + x = pd.Series(np.array([2, 4, 6, 8, 10]), index=["a", "b", "c", "d", "e"]) + precision = 3 + ordered = True pcat = pd.cut( x=x, bins=bins, diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 3aedbf8365b..53257dc8f29 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from copy import copy, deepcopy import cupy as cp @@ -23,55 +23,29 @@ @pytest.mark.parametrize( - "copy_parameters", + "fn", [ - {"fn": lambda x: x.copy(), "expected_equality": False}, - {"fn": lambda x: x.copy(deep=True), "expected_equality": False}, - {"fn": lambda x: copy(x), "expected_equality": False}, - {"fn": lambda x: deepcopy(x), "expected_equality": False}, - ], -) -def test_dataframe_deep_copy(copy_parameters): - pdf = pd.DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"] - ) - gdf = DataFrame.from_pandas(pdf) - copy_pdf = copy_parameters["fn"](pdf) - copy_gdf = copy_parameters["fn"](gdf) - copy_pdf["b"] = [0, 0, 0] - copy_gdf["b"] = [0, 0, 0] - pdf_is_equal = np.array_equal(pdf["b"].values, copy_pdf["b"].values) - gdf_is_equal = np.array_equal( - gdf["b"].to_numpy(), copy_gdf["b"].to_numpy() - ) - assert pdf_is_equal == copy_parameters["expected_equality"] - assert gdf_is_equal == copy_parameters["expected_equality"] - - -@pytest.mark.parametrize( - "copy_parameters", - [ - {"fn": lambda x: x.copy(), "expected_equality": False}, - {"fn": lambda x: x.copy(deep=True), "expected_equality": False}, - {"fn": lambda x: copy(x), "expected_equality": False}, - {"fn": lambda x: deepcopy(x), "expected_equality": False}, + lambda x: x.copy(), + lambda x: x.copy(deep=True), + lambda x: copy(x), + lambda x: deepcopy(x), ], ) -def test_dataframe_deep_copy_and_insert(copy_parameters): +def test_dataframe_deep_copy(fn): pdf = pd.DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"] ) gdf = DataFrame.from_pandas(pdf) - copy_pdf = copy_parameters["fn"](pdf) - copy_gdf = copy_parameters["fn"](gdf) + copy_pdf = fn(pdf) + copy_gdf = fn(gdf) copy_pdf["b"] = [0, 0, 0] copy_gdf["b"] = [0, 0, 0] pdf_is_equal = np.array_equal(pdf["b"].values, copy_pdf["b"].values) gdf_is_equal = np.array_equal( gdf["b"].to_numpy(), copy_gdf["b"].to_numpy() ) - assert pdf_is_equal == copy_parameters["expected_equality"] - assert gdf_is_equal == copy_parameters["expected_equality"] + assert not pdf_is_equal + assert not gdf_is_equal """ From afc0d5d9beeb1041cfdf02c168e8528c778969cf Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 5 Aug 2025 16:17:37 -0500 Subject: [PATCH 063/366] Support `cudf-polars` `str.zfill` (#19081) Closes https://github.com/rapidsai/cudf/issues/19035 Closes https://github.com/rapidsai/cudf/issues/16480 I believe this needs https://github.com/pola-rs/polars/pull/22985 to pass the one remaining failing test and the column overload described here https://github.com/rapidsai/cudf/issues/19035#issuecomment-2937332033 Authors: - https://github.com/brandon-b-miller - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19081 --- .../cudf_polars/dsl/expressions/string.py | 70 ++++++++++++ .../cudf_polars/cudf_polars/testing/plugin.py | 1 + .../tests/expressions/test_stringfunction.py | 100 ++++++++++++++++++ 3 files changed, 171 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 9a2cff2c37f..d47828b4175 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -11,6 +11,7 @@ from typing import TYPE_CHECKING, Any, ClassVar from polars.exceptions import InvalidOperationError +from polars.polars import dtype_str_repr import pylibcudf as plc 
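Before reading the implementation, it helps to fix the expected semantics. Recent polars releases appear to follow Python's own `str.zfill` convention (left-pad with `"0"`, keep a leading sign in front, never truncate), which is what the `POLARS_VERSION_LT_130` xfails further down allude to for signed input. The reference behavior, in plain Python and with nothing cudf-specific assumed:

```python
# Python's str.zfill is the reference point the polars expression mirrors.
assert "7".zfill(4) == "0007"
assert "-1".zfill(4) == "-001"        # the sign stays ahead of the padding
assert "abcdef".zfill(4) == "abcdef"  # width below the string length: no-op
```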
@@ -137,6 +138,7 @@ def from_polars(cls, obj: pl_expr.StringFunction) -> Self: Name.Reverse, Name.Tail, Name.Titlecase, + Name.ZFill, } __slots__ = ("_regex_program", "name", "options") _non_child = ("dtype", "name", "options") @@ -264,6 +266,17 @@ def _validate_input(self) -> None: raise NotImplementedError( "strip operations only support scalar patterns" ) + elif self.name is StringFunction.Name.ZFill: + if isinstance(self.children[1], Literal): + _, width = self.children + assert isinstance(width, Literal) + if width.value is not None and width.value < 0: + dtypestr = dtype_str_repr(width.dtype.polars) + raise InvalidOperationError( + f"conversion from `{dtypestr}` to `u64` " + f"failed in column 'literal' for 1 out of " + f"1 values: [{width.value}]" + ) from None @staticmethod def _create_regex_program( @@ -322,6 +335,63 @@ def do_evaluate( ), dtype=self.dtype, ) + elif self.name is StringFunction.Name.ZFill: + # TODO: expensive validation + # polars pads based on bytes, libcudf by visual width + # only pass chars if the visual width matches the byte length + column = self.children[0].evaluate(df, context=context) + col_len_bytes = plc.strings.attributes.count_bytes(column.obj) + col_len_chars = plc.strings.attributes.count_characters(column.obj) + equal = plc.binaryop.binary_operation( + col_len_bytes, + col_len_chars, + plc.binaryop.BinaryOperator.NULL_EQUALS, + plc.DataType(plc.TypeId.BOOL8), + ) + if not plc.reduce.reduce( + equal, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ).to_py(): + raise InvalidOperationError( + "zfill only supports ascii strings with no unicode characters" + ) + if isinstance(self.children[1], Literal): + width = self.children[1] + assert isinstance(width, Literal) + if width.value is None: + return Column( + plc.Column.from_scalar( + plc.Scalar.from_py(None, self.dtype.plc), + column.size, + ), + self.dtype, + ) + return Column( + plc.strings.padding.zfill(column.obj, width.value), self.dtype + ) + else: + col_width = self.children[1].evaluate(df, context=context) + assert isinstance(col_width, Column) + all_gt_0 = plc.binaryop.binary_operation( + col_width.obj, + plc.Scalar.from_py(0, plc.DataType(plc.TypeId.INT64)), + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.DataType(plc.TypeId.BOOL8), + ) + + if not plc.reduce.reduce( + all_gt_0, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ).to_py(): + raise InvalidOperationError("fill conversion failed.") + + return Column( + plc.strings.padding.zfill_by_widths(column.obj, col_width.obj), + self.dtype, + ) + elif self.name is StringFunction.Name.Contains: child, arg = self.children column = child.evaluate(df, context=context) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index e205a7299d3..8491b2fa400 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -144,6 +144,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", + "tests/unit/operations/namespaces/string/test_pad.py::test_str_zfill_unicode_not_respected": "polars doesn't add zeros for unicode characters.", 
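The ASCII gate in `do_evaluate` above deserves a comment: polars pads based on byte length while libcudf's `zfill` pads by visual (character) width, so the two only agree when every string's UTF-8 byte length equals its character count, i.e. pure ASCII. That is exactly what comparing `count_bytes` against `count_characters` checks. The invariant in plain Python, using the same strings as `test_string_zfill_forbidden_chars` below:

```python
# ASCII is precisely the case where UTF-8 byte length == character count.
for s in ["345", "Café", "東京"]:
    print(s, len(s.encode("utf-8")), len(s))
# 345    3 3  -> widths agree, zfill allowed
# Café   5 4  -> multi-byte 'é', rejected
# 東京   6 2  -> multi-byte CJK, rejected
```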
"tests/unit/operations/test_rolling.py::test_rolling_group_by_empty_groups_by_take_6330": "Ordering difference, might be polars bug", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 6515e487e7e..9923f031300 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -504,6 +504,106 @@ def test_string_join(ldf, ignore_nulls, delimiter): assert_gpu_result_equal(q) +@pytest.mark.parametrize( + "fill", + [ + 0, + 1, + 2, + 5, + 999, + -1, + None, + ], +) +@pytest.mark.parametrize( + "input_strings", + [ + ["1", "0"], + ["123", "45"], + ["", "0"], + ["abc", "def"], + ], +) +def test_string_zfill(fill, input_strings): + ldf = pl.LazyFrame({"a": input_strings}) + q = ldf.select(pl.col("a").str.zfill(fill)) + + if fill is not None and fill < 0: + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.ComputeError, + ) + else: + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "fill", + [ + 5 + if not POLARS_VERSION_LT_130 + else pytest.param(5, marks=pytest.mark.xfail(reason="fixed in Polars 1.30")), + 999 + if not POLARS_VERSION_LT_130 + else pytest.param(999, marks=pytest.mark.xfail(reason="fixed in Polars 1.30")), + ], +) +def test_string_zfill_pl_129(fill): + ldf = pl.LazyFrame({"a": ["-1", "+2"]}) + q = ldf.select(pl.col("a").str.zfill(fill)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "fill", + [ + 0, + 1, + 2, + 5 + if not POLARS_VERSION_LT_130 + else pytest.param(5, marks=pytest.mark.xfail(reason="fixed in Polars 1.30")), + 999 + if not POLARS_VERSION_LT_130 + else pytest.param(999, marks=pytest.mark.xfail(reason="fixed in Polars 1.30")), + -1, + pytest.param(None, marks=pytest.mark.xfail(reason="None dtype")), + ], +) +def test_string_zfill_column(fill): + ldf = pl.DataFrame( + { + "input_strings": ["1", "0", "123", "45", "", "0", "-1", "+2", "abc", "def"], + "fill": [fill] * 10, + } + ).lazy() + q = ldf.select(pl.col("input_strings").str.zfill(pl.col("fill"))) + if fill is not None and fill < 0: + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError + if not POLARS_VERSION_LT_130 + else pl.exceptions.ComputeError, + ) + else: + assert_gpu_result_equal(q) + + +def test_string_zfill_forbidden_chars(): + ldf = pl.LazyFrame({"a": ["Café", "345", "東京", None]}) + q = ldf.select(pl.col("a").str.zfill(3)) + assert_collect_raises( + q, + polars_except=(), + cudf_except=pl.exceptions.InvalidOperationError + if not POLARS_VERSION_LT_130 + else pl.exceptions.ComputeError, + ) + + @pytest.mark.parametrize( "width", [ From 25a24203bd93b47a72be079afccc39105068eb3e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Aug 2025 17:22:07 -0700 Subject: [PATCH 064/366] Use more pytest fixtures and avoid GPU parameterization in test_dropna/factorize.py and more (#19449) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids pytest.mark.parametrize with GPU objects * Eliminate/reduce 
parameterizations of input size Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19449 --- python/cudf/cudf/tests/test_dlpack.py | 22 ++++++----------- python/cudf/cudf/tests/test_doctests.py | 24 ++++++------------ python/cudf/cudf/tests/test_dropna.py | 30 +++++++++++++---------- python/cudf/cudf/tests/test_duplicates.py | 8 +++--- python/cudf/cudf/tests/test_factorize.py | 6 ++--- python/cudf/cudf/tests/test_feather.py | 27 +++++++------------- python/cudf/cudf/tests/test_gcs.py | 5 ++-- 7 files changed, 48 insertions(+), 74 deletions(-) diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index ffb33870323..44c819cbd68 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -6,27 +6,19 @@ import cupy import numpy as np import pytest -from packaging import version import cudf from cudf.testing import assert_eq -nelems = [0, 3, 10] +nelems = [0, 10] dtype = [np.uint16, np.int32, np.float64] nulls = ["some", "none"] params_1d = itertools.product(nelems, dtype, nulls) -ncols = [0, 1, 2] +ncols = [0, 2] params_2d = itertools.product(ncols, nelems, dtype, nulls) -if version.parse(cupy.__version__) < version.parse("10"): - # fromDlpack deprecated in cupy version 10, replaced by from_dlpack - cupy_from_dlpack = cupy.fromDlpack -else: - cupy_from_dlpack = cupy.from_dlpack - - def data_size_expectation_builder(data, nan_null_param=False): if nan_null_param and np.isnan(data).any(): return pytest.raises((ValueError,)) @@ -117,7 +109,7 @@ def test_to_dlpack_cupy_1d(data_1d): cudf_host_array = gs.to_numpy(na_value=np.nan) dlt = gs.to_dlpack() - cupy_array = cupy_from_dlpack(dlt) + cupy_array = cupy.from_dlpack(dlt) cupy_host_array = cupy_array.get() assert_eq(cudf_host_array, cupy_host_array) @@ -131,7 +123,7 @@ def test_to_dlpack_cupy_2d(data_2d): cudf_host_array = np.array(gdf.to_pandas()).flatten() dlt = gdf.to_dlpack() - cupy_array = cupy_from_dlpack(dlt) + cupy_array = cupy.from_dlpack(dlt) cupy_host_array = cupy_array.get().flatten() assert_eq(cudf_host_array, cupy_host_array) @@ -167,7 +159,7 @@ def test_to_dlpack_cupy_2d_null(data_2d): cudf_host_array = np.array(gdf.to_pandas()).flatten() dlt = gdf.to_dlpack() - cupy_array = cupy_from_dlpack(dlt) + cupy_array = cupy.from_dlpack(dlt) cupy_host_array = cupy_array.get().flatten() assert_eq(cudf_host_array, cupy_host_array) @@ -181,7 +173,7 @@ def test_to_dlpack_cupy_1d_null(data_1d): cudf_host_array = gs.to_numpy(na_value=np.nan) dlt = gs.to_dlpack() - cupy_array = cupy_from_dlpack(dlt) + cupy_array = cupy.from_dlpack(dlt) cupy_host_array = cupy_array.get() assert_eq(cudf_host_array, cupy_host_array) @@ -193,7 +185,7 @@ def test_to_dlpack_mixed_dtypes(): cudf_host_array = df.to_numpy() dlt = df.to_dlpack() - cupy_array = cupy_from_dlpack(dlt) + cupy_array = cupy.from_dlpack(dlt) cupy_host_array = cupy_array.get() assert_eq(cudf_host_array, cupy_host_array) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 5d3d18cbe95..b196b5314ac 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,10 +1,9 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
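One consequence of the `test_dlpack.py` hunks above: with CuPy < 10 no longer supported, the `fromDlpack` compatibility shim is gone and the interchange is a single call. A minimal sketch of the round trip these tests exercise (`to_dlpack` is the cudf method already used throughout this file):

```python
import cupy as cp
import cudf

gs = cudf.Series([1.0, 2.0, 3.0])
arr = cp.from_dlpack(gs.to_dlpack())  # hand the device buffer to CuPy via DLPack
assert float(arr.sum()) == 6.0
```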
import contextlib import doctest import inspect import io import itertools -import os import numpy as np import pytest @@ -34,9 +33,6 @@ def _find_doctests_in_obj(obj, finder=None, criteria=None): ---------- obj : module or class The object to search for docstring examples. - finder : doctest.DocTestFinder, optional - The DocTestFinder object to use. If not provided, a DocTestFinder is - constructed. criteria : callable, optional Callable indicating whether to recurse over members of the provided object. If not provided, names not defined in the object's ``__all__`` @@ -74,16 +70,7 @@ def _find_doctests_in_obj(obj, finder=None, criteria=None): class TestDoctests: @pytest.fixture(autouse=True) - def chdir_to_tmp_path(cls, tmp_path): - # Some doctests generate files, so this fixture runs the tests in a - # temporary directory. - original_directory = os.getcwd() - os.chdir(tmp_path) - yield - os.chdir(original_directory) - - @pytest.fixture(autouse=True) - def prinoptions(cls): + def printoptions(cls): # TODO: NumPy now prints scalars as `np.int8(1)`, etc. this should # be adapted evantually. if version.parse(np.__version__) >= version.parse("2.0"): @@ -94,18 +81,21 @@ def prinoptions(cls): @pytest.mark.parametrize( "docstring", - itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]), + itertools.chain.from_iterable( + _find_doctests_in_obj(mod) for mod in tests + ), ids=lambda docstring: docstring.name, ) @pytest.mark.skipif( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="Doctests not expected to pass on older versions of pandas", ) - def test_docstring(self, docstring): + def test_docstring(self, docstring, monkeypatch, tmp_path): # We ignore differences in whitespace in the doctest output, and enable # the use of an ellipsis "..." to match any string in the doctest # output. An ellipsis is useful for, e.g., memory addresses or # imprecise floating point values. + monkeypatch.chdir(tmp_path) optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE runner = doctest.DocTestRunner(optionflags=optionflags) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index eeac78dbebc..1f927d03e95 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -1,7 +1,8 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
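The doctest fixture rewrite above swaps a hand-rolled `os.chdir` save/restore for pytest's `monkeypatch.chdir`, which undoes the directory change automatically even when the test fails. The same pattern in isolation (the test name here is illustrative, not from the patch):

```python
def test_doctest_that_writes_files(tmp_path, monkeypatch):
    monkeypatch.chdir(tmp_path)  # reverted automatically at test teardown
    with open("artifact.txt", "w") as f:  # lands in tmp_path, not the repo
        f.write("ok")
```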
import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -84,22 +85,26 @@ def test_dropna_dataframe(data, how, axis, inplace): "data", [ { - "a": cudf.Series([None, None, None], dtype="float64"), - "b": cudf.Series([1, 2, None]), + "a": pa.array([None, None, None], type=pa.float64()), + "b": [1, 2, None], }, { - "a": cudf.Series([np.nan, np.nan, np.nan], dtype="float64"), - "b": cudf.Series([1, 2, None]), + "a": pa.array([np.nan, np.nan, np.nan]), + "b": [1, 2, None], }, - cudf.Series([None, None, None], dtype="object"), + {"a": pa.array([None, None, None], type=pa.string())}, ], ) @pytest.mark.parametrize("axis", [0, 1]) def test_dropna_with_all_nulls(how, data, axis): - gdf = cudf.DataFrame({"a": data}) + gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - assert_eq(pdf.dropna(axis=axis, how=how), gdf.dropna(axis=axis, how=how)) + assert_eq( + pdf.dropna(axis=axis, how=how), + gdf.dropna(axis=axis, how=how), + check_dtype=False, + ) def test_dropna_nan_as_null(): @@ -208,12 +213,12 @@ def test_dropna_thresh_cols(thresh, subset, inplace): [ { "key": [1, 2, 10], - "val": cudf.Series([np.nan, 3, 1], nan_as_null=False), + "val": pa.array([np.nan, 3.0, 1.0]), "abc": [np.nan, None, 1], }, { "key": [None, 2, 1], - "val": cudf.Series([3, np.nan, 0.1], nan_as_null=True), + "val": pa.array([3.0, None, 0.1]), "abc": [None, 1, None], }, ], @@ -250,10 +255,9 @@ def test_dropna_index(data, dtype): assert_eq(expect, got) -@pytest.mark.parametrize("data", [[[1, None, 2], [None, None, 2]]]) @pytest.mark.parametrize("how", ["all", "any"]) -def test_dropna_multiindex(data, how): - pi = pd.MultiIndex.from_arrays(data) +def test_dropna_multiindex(how): + pi = pd.MultiIndex.from_arrays([[1, None, 2], [None, None, 2]]) gi = cudf.from_pandas(pi) expect = pi.dropna(how) diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 67dd7a8388b..c9967b83235 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
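Why `test_dropna.py` now builds inputs with `pa.array`: pyarrow keeps the null/NaN distinction explicit, whereas routing data through cudf or pandas constructors first can fold NaN into null (or not) depending on `nan_as_null`. The distinction is directly observable in pyarrow:

```python
import pyarrow as pa

a = pa.array([1.0, None, float("nan")])
print(a.null_count)  # 1 -- only None is a null; NaN stays an ordinary float value
```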
import itertools import random @@ -156,7 +156,7 @@ def test_drop_duplicates(): assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) -@pytest.mark.skip(reason="cudf does not support duplicate column names yet") +@pytest.mark.xfail(reason="cudf does not support duplicate column names yet") def test_drop_duplicates_with_duplicate_column_names(): df = pd.DataFrame( [[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"] @@ -267,8 +267,8 @@ def test_drop_duplicates_empty(df): assert_eq(result, df) -@pytest.mark.parametrize("num_columns", [3, 4, 5]) -def test_dataframe_drop_duplicates_numeric_method(num_columns): +def test_dataframe_drop_duplicates_numeric_method(): + num_columns = 3 comb = list(itertools.permutations(range(num_columns), num_columns)) shuf = list(comb) random.Random(num_columns).shuffle(shuf) diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index be81e92835f..d16e725088f 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -10,8 +10,7 @@ from cudf.testing import assert_eq -@pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) -def test_factorize_series_obj(ncats, nelem): +def test_factorize_series_obj(): df = DataFrame() rng = np.random.default_rng(seed=0) @@ -31,8 +30,7 @@ def test_factorize_series_obj(ncats, nelem): np.testing.assert_array_equal(uvals.get(), handcoded) -@pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) -def test_factorize_index_obj(ncats, nelem): +def test_factorize_index_obj(): df = DataFrame() rng = np.random.default_rng(seed=0) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 6a9dd4c4a66..04c4d55afe6 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
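Note the `skip` to `xfail` flip for `test_drop_duplicates_with_duplicate_column_names` above. Unlike `skip`, `xfail` still executes the test, so the suite will report an XPASS the day cudf gains support for duplicate column names, instead of never exercising the code path:

```python
import pytest


@pytest.mark.xfail(reason="cudf does not support duplicate column names yet")
def test_duplicate_column_names():
    ...  # executed: reported XFAIL while failing, XPASS once support lands
```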
import os from string import ascii_letters @@ -13,7 +13,7 @@ from cudf.testing._utils import NUMERIC_TYPES -@pytest.fixture(params=[0, 1, 10, 100]) +@pytest.fixture(params=[0, 10]) def pdf(request): rng = np.random.default_rng(seed=0) types = [*NUMERIC_TYPES, "bool"] @@ -41,25 +41,15 @@ def pdf(request): return test_pdf -@pytest.fixture -def gdf(pdf): - return cudf.DataFrame.from_pandas(pdf) - - -@pytest.fixture -def feather_file(tmp_path_factory, pdf): - fname = tmp_path_factory.mktemp("feather") / "test.feather" - pdf.to_feather(fname) - return fname - - @pytest.mark.filterwarnings("ignore:Using CPU") @pytest.mark.filterwarnings("ignore:Strings are not yet supported") @pytest.mark.parametrize( "columns", [["col_int8"], ["col_category"], ["col_int32", "col_float32"], None], ) -def test_feather_reader(feather_file, columns): +def test_feather_reader(pdf, columns, tmp_path): + feather_file = tmp_path / "test.feather" + pdf.to_feather(feather_file) expect = pa.feather.read_table(feather_file, columns=columns).to_pandas() got = ( cudf.read_feather(feather_file, columns=columns) @@ -71,9 +61,10 @@ def test_feather_reader(feather_file, columns): @pytest.mark.filterwarnings("ignore:Using CPU") -def test_feather_writer(tmpdir, pdf, gdf): - pdf_fname = tmpdir.join("pdf.feather") - gdf_fname = tmpdir.join("gdf.feather") +def test_feather_writer(tmp_path, pdf): + gdf = cudf.DataFrame.from_pandas(pdf) + pdf_fname = tmp_path / "pdf.feather" + gdf_fname = tmp_path / "gdf.feather" pdf.to_feather(pdf_fname) gdf.to_feather(gdf_fname) diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index 82ecd356bbf..b9b0161e6c0 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
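The feather tests above fold their `gdf` and `feather_file` fixtures into the test bodies and move from `tmpdir` to the pathlib-based `tmp_path`. The core round trip they exercise, sketched minimally (file name illustrative):

```python
import pandas as pd
import cudf


def test_feather_roundtrip(tmp_path):
    fname = tmp_path / "roundtrip.feather"
    pd.DataFrame({"a": [1, 2, 3]}).to_feather(fname)
    gdf = cudf.read_feather(fname)
    assert gdf["a"].sum() == 6
```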
import io import os @@ -12,7 +12,6 @@ gcsfs = pytest.importorskip("gcsfs") -TEST_PROJECT = "cudf-gcs-test-project" TEST_BUCKET = "cudf-gcs-test-bucket" @@ -27,7 +26,7 @@ def pdf(scope="module"): return df -def test_read_csv(pdf, monkeypatch, tmpdir): +def test_read_csv(pdf, monkeypatch): # Write to buffer fpath = TEST_BUCKET + "test_csv_reader.csv" buffer = pdf.to_csv(index=False) From e08b8a3b37f85720d27d8c02426652920ab964cc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 5 Aug 2025 17:22:15 -0700 Subject: [PATCH 065/366] Move test_unaops/test_unique/test_transform.py to new cudf classic test directory structure (#19477) Towards https://github.com/rapidsai/cudf/issues/9999 * Adding more shared fixtures in `conftest.py` where applicable * Further simplify tests Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19477 --- python/cudf/cudf/tests/conftest.py | 20 +++++ .../{ => general_functions}/test_unique.py | 78 +++++++++---------- .../test_apply.py} | 10 +-- .../cudf/cudf/tests/series/test_reductions.py | 17 ++++ .../{test_unaops.py => series/test_unary.py} | 32 ++------ 5 files changed, 83 insertions(+), 74 deletions(-) rename python/cudf/cudf/tests/{ => general_functions}/test_unique.py (56%) rename python/cudf/cudf/tests/{test_transform.py => series/test_apply.py} (65%) create mode 100644 python/cudf/cudf/tests/series/test_reductions.py rename python/cudf/cudf/tests/{test_unaops.py => series/test_unary.py} (50%) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 008867211f4..b3b48d19538 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -192,6 +192,26 @@ def set_decomp_env_vars(monkeypatch, request): yield +@pytest.fixture( + params=[ + "min", + "max", + "sum", + "product", + "quantile", + "all", + "any", + "std", + "var", + "median", + "kurtosis", + "skew", + ] +) +def reduction_methods(request): + return request.param + + signed_integer_types = ["int8", "int16", "int32", "int64"] unsigned_integer_types = ["uint8", "uint16", "uint32", "uint64"] float_types = ["float32", "float64"] diff --git a/python/cudf/cudf/tests/test_unique.py b/python/cudf/cudf/tests/general_functions/test_unique.py similarity index 56% rename from python/cudf/cudf/tests/test_unique.py rename to python/cudf/cudf/tests/general_functions/test_unique.py index 74e00d7c389..51da8f06dd3 100644 --- a/python/cudf/cudf/tests/test_unique.py +++ b/python/cudf/cudf/tests/general_functions/test_unique.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. 
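The new `reduction_methods` fixture in `conftest.py` above parametrizes by method *name* rather than by GPU object, which keeps test collection cheap and the consuming tests short. A sketch of how such a test consumes it (compare `test_series_pandas_methods` further down); for the float aggregations a tolerance is used since exact equality is not guaranteed:

```python
import cudf
import pytest


def test_reduction_matches_pandas(reduction_methods):
    sr = cudf.Series([1.0, 2.0, 3.0, 4.0, 5.0])
    got = getattr(sr, reduction_methods)()
    expected = getattr(sr.to_pandas(), reduction_methods)()
    assert got == pytest.approx(expected)
```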
import cupy as cp import numpy as np @@ -9,35 +9,23 @@ from cudf.testing import assert_eq -@pytest.fixture -def df(): - rng = np.random.default_rng(seed=0) - arr = rng.integers(2, size=10, dtype=np.int64) - return pd.DataFrame( - { - "foo": arr, - "bar": [pd.Timestamp(x) for x in arr], - } - ) - - -@pytest.fixture(params=["foo", "bar"]) -def series_test_vals(request, df): - actual = cudf.unique(cudf.Series.from_pandas(df[request.param])) - expected = pd.unique(df[request.param]) - return actual, expected - - -def test_unique_series_obj(series_test_vals): - actual, expected = series_test_vals - +@pytest.mark.parametrize( + "data", + [ + [1, 1, 2], + [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(2)], + ], +) +def test_unique_series_obj(data): + actual = cudf.unique(cudf.Series(data)) + expected = pd.unique(pd.Series(data)) assert isinstance(expected, np.ndarray) assert isinstance(actual, cudf.Series) assert_eq(actual, pd.Series(expected, name=actual.name)) @pytest.mark.parametrize( - "index", + "cudf_index,pandas_index", [ (cudf.Index, pd.Index), (cudf.MultiIndex, pd.MultiIndex), @@ -45,21 +33,26 @@ def test_unique_series_obj(series_test_vals): (cudf.CategoricalIndex, pd.CategoricalIndex), ], ) -@pytest.mark.parametrize("col", ["foo", "bar"]) -def test_unique_index_obj(index, col, df): - df = cudf.DataFrame.from_pandas(df) - if index[0] == cudf.MultiIndex: - df.index = cudf.MultiIndex.from_arrays([df[col], df[col]]) +@pytest.mark.parametrize( + "data", + [ + [1, 1, 2], + [pd.Timestamp(1), pd.Timestamp(1), pd.Timestamp(2)], + ], +) +def test_unique_index_obj(cudf_index, pandas_index, data): + if cudf_index == cudf.MultiIndex: + idx = cudf_index.from_arrays([data, data]) else: - df.index = index[0](df[col]) - actual = cudf.unique(df.index) - expected = pd.unique(df.index.to_pandas()) + idx = cudf_index(data) + actual = cudf.unique(idx) + expected = pd.unique(idx.to_pandas()) isinstance(expected, np.ndarray) - assert isinstance(actual, index[0]) + assert isinstance(actual, cudf_index) - if index[0] == cudf.MultiIndex: - expect = index[1].from_arrays( + if cudf_index == cudf.MultiIndex: + expect = pandas_index.from_arrays( [ [x[0] for x in expected], [x[1] for x in expected], @@ -68,12 +61,13 @@ def test_unique_index_obj(index, col, df): ) assert_eq(actual, expect) else: - assert_eq(actual, index[1](expected, name=actual.name)) + assert_eq(actual, cudf_index(expected, name=actual.name)) -def test_unique_cupy_ndarray(df): - arr = np.asarray(df["foo"]) - garr = cp.asarray(df["foo"]) +def test_unique_cupy_ndarray(): + ser = pd.Series(pd.Series([1, 1, 2])) + arr = np.asarray(ser) + garr = cp.asarray(ser) expected = pd.unique(arr) actual = cudf.unique(garr) @@ -102,7 +96,8 @@ def test_category_dtype_unique(data): assert_eq(actual, pd.Series(expected)) -def test_unique_fails_value_error(df): +def test_unique_fails_value_error(): + df = pd.DataFrame({"foo": [1, 2, 3]}) with pytest.raises( ValueError, match="Must pass cudf.Series, cudf.Index, or cupy.ndarray object", @@ -111,8 +106,9 @@ def test_unique_fails_value_error(df): def test_unique_fails_not_implemented_error(): + ser = cudf.Series(["foo", "foo"], dtype="category") with cudf.option_context("mode.pandas_compatible", True): with pytest.raises( NotImplementedError, match="cudf.Categorical is not implemented" ): - cudf.unique(cudf.Series(["foo", "foo"], dtype="category")) + cudf.unique(ser) diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/series/test_apply.py similarity index 65% rename from 
python/cudf/cudf/tests/test_transform.py rename to python/cudf/cudf/tests/series/test_apply.py index 870bfad8a85..af957aeee6c 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/series/test_apply.py @@ -1,18 +1,16 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import numpy as np import pytest from cudf import Series -from cudf.testing._utils import NUMERIC_TYPES def _generic_function(a): return a**3 -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) @pytest.mark.parametrize( "udf,testfunc", [ @@ -20,10 +18,10 @@ def _generic_function(a): (lambda x: x in [1, 2, 3, 4], lambda ser: np.isin(ser, [1, 2, 3, 4])), ], ) -def test_apply_python_lambda(dtype, udf, testfunc): - size = 500 +def test_apply_python_lambda(numeric_types_as_str, udf, testfunc): + size = 50 rng = np.random.default_rng(seed=0) - lhs_arr = rng.random(size).astype(dtype) + lhs_arr = rng.random(size).astype(numeric_types_as_str) lhs_ser = Series(lhs_arr) out_ser = lhs_ser.apply(udf) diff --git a/python/cudf/cudf/tests/series/test_reductions.py b/python/cudf/cudf/tests/series/test_reductions.py new file mode 100644 index 00000000000..1c924444c1e --- /dev/null +++ b/python/cudf/cudf/tests/series/test_reductions.py @@ -0,0 +1,17 @@ +# Copyright (c) 2019-2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +from cudf import Series + + +@pytest.mark.parametrize("data", [[], [1, 2, 3]]) +def test_series_pandas_methods(data, reduction_methods): + arr = np.array(data) + sr = Series(arr) + psr = pd.Series(arr) + np.testing.assert_equal( + getattr(sr, reduction_methods)(), getattr(psr, reduction_methods)() + ) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/series/test_unary.py similarity index 50% rename from python/cudf/cudf/tests/test_unaops.py rename to python/cudf/cudf/tests/series/test_unary.py index 182e9147307..c1973b26425 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/series/test_unary.py @@ -3,26 +3,21 @@ from decimal import Decimal import numpy as np -import pandas as pd -import pytest from cudf import Series -from cudf.testing import _utils as utils, assert_eq +from cudf.testing import assert_eq -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) -def test_series_abs(dtype): +def test_series_abs(numeric_types_as_str): rng = np.random.default_rng(seed=0) - arr = (rng.random(1000) * 100).astype(dtype) + arr = (rng.random(100) * 100).astype(numeric_types_as_str) sr = Series(arr) np.testing.assert_equal(sr.abs().to_numpy(), np.abs(arr)) np.testing.assert_equal(abs(sr).to_numpy(), abs(arr)) -@pytest.mark.parametrize("dtype", utils.INTEGER_TYPES) -def test_series_invert(dtype): - rng = np.random.default_rng(seed=0) - arr = (rng.random(1000) * 100).astype(dtype) +def test_series_invert(integer_types_as_str): + arr = np.array([0, 1, 2], dtype=integer_types_as_str) sr = Series(arr) np.testing.assert_equal((~sr).to_numpy(), np.invert(arr)) np.testing.assert_equal((~sr).to_numpy(), ~arr) @@ -35,23 +30,6 @@ def test_series_neg(): np.testing.assert_equal((-sr).to_numpy(), -arr) -@pytest.mark.parametrize("mth", ["min", "max", "sum", "product"]) -def test_series_pandas_methods(mth): - rng = np.random.default_rng(seed=0) - arr = (1 + rng.random(5) * 100).astype(np.int64) - sr = Series(arr) - psr = pd.Series(arr) - np.testing.assert_equal(getattr(sr, mth)(), getattr(psr, mth)()) - - -@pytest.mark.parametrize("mth", ["min", "max", "sum", "product", "quantile"]) -def 
test_series_pandas_methods_empty(mth): - arr = np.array([]) - sr = Series(arr) - psr = pd.Series(arr) - np.testing.assert_equal(getattr(sr, mth)(), getattr(psr, mth)()) - - def test_series_bool_neg(): sr = Series([True, False, True, None, False, None, True, True]) psr = sr.to_pandas(nullable=True) From fd862c07548f009266a5bcb624d7de084c23c5c0 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Tue, 5 Aug 2025 19:21:57 -0700 Subject: [PATCH 066/366] Upgrade `gcc-toolset` for Java/JNI build to version 14 (#19500) As RAPIDS is moving to gcc-14, we should upgrade the rest of the build system (Java/JNI) to match it. This only changes in the build and test for Java/JNI, no implementation change is needed. Build for `spark-rapids-jni` will not pass until https://github.com/NVIDIA/spark-rapids-jni/pull/3568 is merged. But merging this should not be blocked by it. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Bradley Dice (https://github.com/bdice) - Peixin (https://github.com/pxLi) URL: https://github.com/rapidsai/cudf/pull/19500 --- .github/workflows/spark-rapids-jni.yaml | 2 +- java/ci/Dockerfile.rocky | 4 ++-- java/ci/README.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/spark-rapids-jni.yaml b/.github/workflows/spark-rapids-jni.yaml index 832c749874c..f317c7cf531 100644 --- a/.github/workflows/spark-rapids-jni.yaml +++ b/.github/workflows/spark-rapids-jni.yaml @@ -19,4 +19,4 @@ jobs: - name: "Build spark-rapids-jni" run: | mkdir target - CMAKE_CUDA_ARCHITECTURES=90 LIBCUDF_DEPENDENCY_MODE=latest USE_GDS=on scl enable gcc-toolset-11 build/buildcpp.sh + CMAKE_CUDA_ARCHITECTURES=90 LIBCUDF_DEPENDENCY_MODE=latest USE_GDS=on scl enable gcc-toolset-14 build/buildcpp.sh diff --git a/java/ci/Dockerfile.rocky b/java/ci/Dockerfile.rocky index 28f9361ce8a..e7cf319663d 100644 --- a/java/ci/Dockerfile.rocky +++ b/java/ci/Dockerfile.rocky @@ -26,9 +26,9 @@ ARG TARGETPLATFORM=linux/amd64 # multi-platform build with: docker buildx build --platform linux/arm64,linux/amd64 on either amd64 or arm64 host # check available official arm-based docker images at https://hub.docker.com/r/nvidia/cuda/tags (OS/ARCH) FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE -ARG TOOLSET_VERSION=11 +ARG TOOLSET_VERSION=14 ### Install basic requirements -RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build boost-devel +RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-11 gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build boost-devel ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins RUN mkdir /usr/local/rapids /rapids && chmod 777 /usr/local/rapids /rapids diff --git a/java/ci/README.md b/java/ci/README.md index 5597cb22109..22a2dd618f6 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -42,7 +42,7 @@ git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.10 ```bash cd cudf export WORKSPACE=`pwd` -scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" +scl enable gcc-toolset-14 "java/ci/build-in-docker.sh" ``` ### The output From 3e9e6297f082e3a9cadc9ec83e5f224fb719bee4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 5 Aug 2025 17:23:34 -1000 Subject: [PATCH 067/366] Add streams to all modules with three or fewer functions (#19600) Contributes to https://github.com/rapidsai/cudf/issues/15163 Authors: - 
Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19600 --- cpp/include/cudf/partitioning.hpp | 2 +- python/pylibcudf/pylibcudf/binaryop.pxd | 6 ++- python/pylibcudf/pylibcudf/binaryop.pyi | 3 ++ python/pylibcudf/pylibcudf/binaryop.pyx | 20 ++++++--- .../pylibcudf/pylibcudf/libcudf/binaryop.pxd | 16 ++++--- .../pylibcudf/libcudf/partitioning.pxd | 22 ++++++--- .../pylibcudf/pylibcudf/libcudf/quantiles.pxd | 5 ++- python/pylibcudf/pylibcudf/libcudf/reduce.pxd | 15 ++++--- python/pylibcudf/pylibcudf/libcudf/search.pxd | 6 ++- python/pylibcudf/pylibcudf/partitioning.pxd | 17 +++++-- python/pylibcudf/pylibcudf/partitioning.pyi | 17 +++++-- python/pylibcudf/pylibcudf/partitioning.pyx | 45 ++++++++++++++----- python/pylibcudf/pylibcudf/quantiles.pxd | 7 ++- python/pylibcudf/pylibcudf/quantiles.pyi | 4 ++ python/pylibcudf/pylibcudf/quantiles.pyx | 22 +++++++-- python/pylibcudf/pylibcudf/reduce.pxd | 9 ++-- python/pylibcudf/pylibcudf/reduce.pyi | 20 +++++++-- python/pylibcudf/pylibcudf/reduce.pyx | 44 ++++++++++++++---- python/pylibcudf/pylibcudf/search.pxd | 8 +++- python/pylibcudf/pylibcudf/search.pyi | 8 +++- python/pylibcudf/pylibcudf/search.pyx | 30 +++++++++++-- 21 files changed, 255 insertions(+), 71 deletions(-) diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index b6bb4914f4c..e6defdb37be 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -37,7 +37,7 @@ namespace CUDF_EXPORT cudf { /** * @brief Identifies the hash function to be used in hash partitioning */ -enum class hash_id { +enum class hash_id : int32_t { HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed HASH_MURMUR3 ///< Murmur3 hash function }; diff --git a/python/pylibcudf/pylibcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/binaryop.pxd index 06625e9e2db..69398c555a8 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pxd +++ b/python/pylibcudf/pylibcudf/binaryop.pxd @@ -1,7 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool from pylibcudf.libcudf.binaryop cimport binary_operator +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .scalar cimport Scalar @@ -21,7 +22,8 @@ cpdef Column binary_operation( LeftBinaryOperand lhs, RightBinaryOperand rhs, binary_operator op, - DataType output_type + DataType output_type, + Stream stream=* ) cpdef bool is_supported_operation( diff --git a/python/pylibcudf/pylibcudf/binaryop.pyi b/python/pylibcudf/pylibcudf/binaryop.pyi index f745e6c6854..08b0d58b7c8 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyi +++ b/python/pylibcudf/pylibcudf/binaryop.pyi @@ -2,6 +2,8 @@ from enum import IntEnum +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.types import DataType @@ -48,6 +50,7 @@ def binary_operation( rhs: Column | Scalar, op: BinaryOperator, output_type: DataType, + stream: Stream | None = None, ) -> Column: ... 
def is_supported_operation( out: DataType, lhs: DataType, rhs: DataType, op: BinaryOperator diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index c6827431646..1bbafbfe227 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -12,9 +12,12 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.binaryop import \ binary_operator as BinaryOperator # no-cython-lint +from rmm.pylibrmm.stream cimport Stream + from .column cimport Column from .scalar cimport Scalar from .types cimport DataType +from .utils cimport _get_stream __all__ = ["BinaryOperator", "binary_operation", "is_supported_operation"] @@ -22,7 +25,8 @@ cpdef Column binary_operation( LeftBinaryOperand lhs, RightBinaryOperand rhs, binary_operator op, - DataType output_type + DataType output_type, + Stream stream=None ): """Perform a binary operation between a column and another column or scalar. @@ -43,6 +47,8 @@ cpdef Column binary_operation( The operation to perform. output_type : DataType The data type to use for the output. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -50,6 +56,7 @@ cpdef Column binary_operation( The result of the binary operation """ cdef unique_ptr[column] result + stream = _get_stream(stream) if LeftBinaryOperand is Column and RightBinaryOperand is Column: with nogil: @@ -57,7 +64,8 @@ cpdef Column binary_operation( lhs.view(), rhs.view(), op, - output_type.c_obj + output_type.c_obj, + stream.view() ) elif LeftBinaryOperand is Column and RightBinaryOperand is Scalar: with nogil: @@ -65,7 +73,8 @@ cpdef Column binary_operation( lhs.view(), dereference(rhs.c_obj), op, - output_type.c_obj + output_type.c_obj, + stream.view() ) elif LeftBinaryOperand is Scalar and RightBinaryOperand is Column: with nogil: @@ -73,12 +82,13 @@ cpdef Column binary_operation( dereference(lhs.c_obj), rhs.view(), op, - output_type.c_obj + output_type.c_obj, + stream.view() ) else: raise ValueError(f"Invalid arguments {lhs} and {rhs}") - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef bool is_supported_operation( diff --git a/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd index 607f7c2fa60..d27daad00bd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
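With the plumbing above, `binary_operation` takes an optional RMM stream and falls back to a default via `_get_stream` when none is given. A usage sketch; `Column.from_arrow` is assumed here as the pyarrow interop constructor and is not part of this patch:

```python
import pyarrow as pa
import pylibcudf as plc
from rmm.pylibrmm.stream import Stream

stream = Stream()  # a fresh, non-default CUDA stream
lhs = plc.Column.from_arrow(pa.array([1, 2, 3]))    # assumed interop helper
rhs = plc.Column.from_arrow(pa.array([10, 20, 30]))
out = plc.binaryop.binary_operation(
    lhs,
    rhs,
    plc.binaryop.BinaryOperator.ADD,
    plc.DataType(plc.TypeId.INT64),
    stream=stream,
)
```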
from libc.stdint cimport int32_t from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -9,6 +9,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport data_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: cpdef enum class binary_operator(int32_t): @@ -52,28 +54,32 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: const scalar& lhs, const column_view& rhs, binary_operator op, - data_type output_type + data_type output_type, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const scalar& rhs, binary_operator op, - data_type output_type + data_type output_type, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, binary_operator op, - data_type output_type + data_type output_type, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, const string& op, - data_type output_type + data_type output_type, + cuda_stream_view stream ) except +libcudf_exception_handler cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd index 04566b6e40a..2ca060fd7df 100644 --- a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd @@ -1,14 +1,21 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
cimport pylibcudf.libcudf.types as libcudf_types -from libc.stdint cimport uint32_t +from libc.stdint cimport int32_t, uint32_t from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.hash cimport DEFAULT_HASH_SEED from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + +cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: + cpdef enum class hash_id(int32_t): + HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" + HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: @@ -16,19 +23,24 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: hash_partition "cudf::hash_partition" ( const table_view& input, const vector[libcudf_types.size_type]& columns_to_hash, - int num_partitions + int num_partitions, + hash_id hash_function, + uint32_t seed, + cuda_stream_view stream ) except +libcudf_exception_handler cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] \ partition "cudf::partition" ( const table_view& t, const column_view& partition_map, - int num_partitions + int num_partitions, + cuda_stream_view stream ) except +libcudf_exception_handler cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] \ round_robin_partition "cudf::round_robin_partition" ( const table_view& input, int num_partitions, - int start_partition + int start_partition, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd index 8f60302e776..58e70d37492 100644 --- a/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -14,6 +14,7 @@ from pylibcudf.libcudf.types cimport ( order_info, sorted, ) +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/quantiles.hpp" namespace "cudf" nogil: @@ -24,6 +25,7 @@ cdef extern from "cudf/quantiles.hpp" namespace "cudf" nogil: interpolation interp, column_view ordered_indices, bool exact, + cuda_stream_view stream, ) except +libcudf_exception_handler cdef unique_ptr[table] quantiles ( @@ -33,4 +35,5 @@ cdef extern from "cudf/quantiles.hpp" namespace "cudf" nogil: sorted is_input_sorted, vector[order] column_order, vector[null_order] null_precedence, + cuda_stream_view stream, ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/reduce.pxd b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd index ad79187b733..f740e7a7acf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/reduce.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport pair @@ -7,14 +7,16 @@ from pylibcudf.libcudf.aggregation cimport reduce_aggregation, scan_aggregation from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar -from pylibcudf.libcudf.types cimport data_type +from pylibcudf.libcudf.types cimport data_type, null_policy +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: cdef unique_ptr[scalar] cpp_reduce "cudf::reduce" ( column_view col, const reduce_aggregation& agg, - data_type type + data_type type, + cuda_stream_view stream ) except +libcudf_exception_handler cpdef enum class scan_type(bool): @@ -24,10 +26,13 @@ cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: cdef unique_ptr[column] cpp_scan "cudf::scan" ( column_view col, const scan_aggregation& agg, - scan_type inclusive + scan_type inclusive, + null_policy null_handling, + cuda_stream_view stream ) except +libcudf_exception_handler cdef pair[unique_ptr[scalar], unique_ptr[scalar]] cpp_minmax "cudf::minmax" ( - column_view col + column_view col, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/search.pxd b/python/pylibcudf/pylibcudf/libcudf/search.pxd index 5ec06858baa..f7264985b9e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/search.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/search.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. cimport pylibcudf.libcudf.types as libcudf_types from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -6,6 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/search.hpp" namespace "cudf" nogil: @@ -15,6 +16,7 @@ cdef extern from "cudf/search.hpp" namespace "cudf" nogil: table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, + cuda_stream_view stream, ) except +libcudf_exception_handler cdef unique_ptr[column] upper_bound( @@ -22,9 +24,11 @@ cdef extern from "cudf/search.hpp" namespace "cudf" nogil: table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, + cuda_stream_view stream, ) except +libcudf_exception_handler cdef unique_ptr[column] contains( column_view haystack, column_view needles, + cuda_stream_view stream, ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/partitioning.pxd index aad60149fc4..42e40fca776 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/partitioning.pxd @@ -1,4 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
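Note the extra `null_policy` argument now declared for `cpp_scan`: the Python `scan` wrapper further below passes `null_policy.EXCLUDE`, so nulls are skipped rather than poisoning the running aggregate, while the null position itself stays null. A small illustration at the cudf level, assuming default `skipna` semantics:

```python
import cudf

s = cudf.Series([1, None, 2])
print(s.cumsum().to_arrow())  # [1, null, 3] -- the null is excluded, not propagated
```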
+ +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table @@ -7,13 +9,20 @@ from .table cimport Table cpdef tuple[Table, list] hash_partition( Table input, list columns_to_hash, - int num_partitions + int num_partitions, + Stream stream = * ) -cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions) +cpdef tuple[Table, list] partition( + Table t, + Column partition_map, + int num_partitions, + Stream stream = *, +) cpdef tuple[Table, list] round_robin_partition( Table input, int num_partitions, - int start_partition=* + int start_partition=*, + Stream stream = * ) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyi b/python/pylibcudf/pylibcudf/partitioning.pyi index 48a2ade23f1..b37ae246b41 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyi +++ b/python/pylibcudf/pylibcudf/partitioning.pyi @@ -1,14 +1,25 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.table import Table def hash_partition( - input: Table, columns_to_hash: list[int], num_partitions: int + input: Table, + columns_to_hash: list[int], + num_partitions: int, + stream: Stream | None = None, ) -> tuple[Table, list[int]]: ... def partition( - t: Table, partition_map: Column, num_partitions: int + t: Table, + partition_map: Column, + num_partitions: int, + stream: Stream | None = None, ) -> tuple[Table, list[int]]: ... def round_robin_partition( - input: Table, num_partitions: int, start_partition: int = 0 + input: Table, + num_partitions: int, + start_partition: int = 0, + stream: Stream | None = None, ) -> tuple[Table, list[int]]: ... diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 1dacabceb06..c3fb7c1b579 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. cimport pylibcudf.libcudf.types as libcudf_types from libcpp.memory cimport unique_ptr @@ -7,9 +7,11 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from pylibcudf.libcudf cimport partitioning as cpp_partitioning from pylibcudf.libcudf.table.table cimport table +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table +from .utils cimport _get_stream __all__ = [ "hash_partition", @@ -20,7 +22,8 @@ __all__ = [ cpdef tuple[Table, list] hash_partition( Table input, list columns_to_hash, - int num_partitions + int num_partitions, + Stream stream=None ): """ Partitions rows from the input table into multiple output tables. @@ -35,6 +38,8 @@ cpdef tuple[Table, list] hash_partition( Indices of input columns to hash num_partitions : int The number of partitions to use + stream : Stream | None + CUDA stream on which to perform the operation. 
Returns ------- @@ -45,16 +50,26 @@ cpdef tuple[Table, list] hash_partition( cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash cdef int c_num_partitions = num_partitions + stream = _get_stream(stream) + with nogil: c_result = cpp_partitioning.hash_partition( input.view(), c_columns_to_hash, - c_num_partitions + c_num_partitions, + cpp_partitioning.hash_id.HASH_MURMUR3, + cpp_partitioning.DEFAULT_HASH_SEED, + stream.view() ) - return Table.from_libcudf(move(c_result.first)), list(c_result.second) + return Table.from_libcudf(move(c_result.first), stream), list(c_result.second) -cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions): +cpdef tuple[Table, list] partition( + Table t, + Column partition_map, + int num_partitions, + Stream stream=None, +): """ Partitions rows of `t` according to the mapping specified by `partition_map`. @@ -69,6 +84,8 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit in `t` to it's partition. num_partitions : int The total number of partitions + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -78,20 +95,24 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result cdef int c_num_partitions = num_partitions + stream = _get_stream(stream) + with nogil: c_result = cpp_partitioning.partition( t.view(), partition_map.view(), - c_num_partitions + c_num_partitions, + stream.view() ) - return Table.from_libcudf(move(c_result.first)), list(c_result.second) + return Table.from_libcudf(move(c_result.first), stream), list(c_result.second) cpdef tuple[Table, list] round_robin_partition( Table input, int num_partitions, - int start_partition=0 + int start_partition=0, + Stream stream=None ): """ Round-robin partition. @@ -106,6 +127,8 @@ cpdef tuple[Table, list] round_robin_partition( Number of partitions for the table start_partition : int, default 0 Index of the 1st partition + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -117,9 +140,11 @@ cpdef tuple[Table, list] round_robin_partition( cdef int c_num_partitions = num_partitions cdef int c_start_partition = start_partition + stream = _get_stream(stream) + with nogil: c_result = cpp_partitioning.round_robin_partition( - input.view(), c_num_partitions, c_start_partition + input.view(), c_num_partitions, c_start_partition, stream.view() ) - return Table.from_libcudf(move(c_result.first)), list(c_result.second) + return Table.from_libcudf(move(c_result.first), stream), list(c_result.second) diff --git a/python/pylibcudf/pylibcudf/quantiles.pxd b/python/pylibcudf/pylibcudf/quantiles.pxd index fbc1dfb30a6..6794c874933 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pxd +++ b/python/pylibcudf/pylibcudf/quantiles.pxd @@ -1,6 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
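All three partitioners above now build their result tables on the caller-supplied stream. A usage sketch for `hash_partition`; as before, `Column.from_arrow` is an assumed interop constructor, not part of this patch:

```python
import pyarrow as pa
import pylibcudf as plc
from rmm.pylibrmm.stream import Stream

stream = Stream()
keys = plc.Column.from_arrow(pa.array([1, 2, 3, 4]))  # assumed interop helper
parts, offsets = plc.partitioning.hash_partition(
    plc.Table([keys]), [0], 2, stream=stream
)
# `parts` holds every row, grouped by partition; `offsets` lists each
# partition's starting row, so len(offsets) == 2 here.
```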
from libcpp.vector cimport vector from pylibcudf.libcudf.types cimport interpolation, sorted +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table @@ -11,7 +12,8 @@ cpdef Column quantile( vector[double] q, interpolation interp = *, Column ordered_indices = *, - bint exact = * + bint exact = *, + Stream stream = * ) cpdef Table quantiles( @@ -21,4 +23,5 @@ cpdef Table quantiles( sorted is_input_sorted = *, list column_order = *, list null_precedence = *, + Stream stream = * ) diff --git a/python/pylibcudf/pylibcudf/quantiles.pyi b/python/pylibcudf/pylibcudf/quantiles.pyi index dca6eed013a..d5fb0e5b8b1 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyi +++ b/python/pylibcudf/pylibcudf/quantiles.pyi @@ -2,6 +2,8 @@ from collections.abc import Sequence +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.table import Table from pylibcudf.types import Interpolation, NullOrder, Order, Sorted @@ -12,6 +14,7 @@ def quantile( interp: Interpolation = Interpolation.LINEAR, ordered_indices: Column | None = None, exact: bool = True, + stream: Stream | None = None, ) -> Column: ... def quantiles( input: Table, @@ -20,4 +23,5 @@ def quantiles( is_input_sorted: Sorted = Sorted.NO, column_order: list[Order] | None = None, null_precedence: list[NullOrder] | None = None, + stream: Stream | None = None, ) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 634218586ac..8eb78750d33 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -12,10 +12,12 @@ from pylibcudf.libcudf.quantiles cimport ( ) from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport null_order, order, sorted +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table from .types cimport interpolation +from .utils cimport _get_stream __all__ = ["quantile", "quantiles"] @@ -24,7 +26,8 @@ cpdef Column quantile( vector[double] q, interpolation interp = interpolation.LINEAR, Column ordered_indices = None, - bool exact=True + bool exact=True, + Stream stream=None ): """Computes quantiles with interpolation. @@ -49,6 +52,8 @@ cpdef Column quantile( Values not indexed by this column will be ignored. exact: bool, default True Returns doubles if True. Otherwise, returns same type as input + stream : Stream | None + CUDA stream on which to perform the operation. For details, see :cpp:func:`quantile`. @@ -66,6 +71,8 @@ cpdef Column quantile( else: ordered_indices_view = ordered_indices.view() + stream = _get_stream(stream) + with nogil: c_result = cpp_quantile( input.view(), @@ -73,9 +80,10 @@ cpdef Column quantile( interp, ordered_indices_view, exact, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Table quantiles( @@ -85,6 +93,7 @@ cpdef Table quantiles( sorted is_input_sorted = sorted.NO, list column_order = None, list null_precedence = None, + Stream stream=None ): """Computes row quantiles with interpolation. @@ -121,6 +130,8 @@ cpdef Table quantiles( all other elements. Ignored if `is_input_sorted` is `Sorted.YES` + stream : Stream | None + CUDA stream on which to perform the operation. For details, see :cpp:func:`quantiles`. 
@@ -139,6 +150,8 @@ cpdef Table quantiles( if null_precedence is not None: null_precedence_vec = null_precedence + stream = _get_stream(stream) + with nogil: c_result = cpp_quantiles( input.view(), @@ -147,6 +160,7 @@ cpdef Table quantiles( is_input_sorted, column_order_vec, null_precedence_vec, + stream.view(), ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/reduce.pxd b/python/pylibcudf/pylibcudf/reduce.pxd index 047f08297e4..30176a6602e 100644 --- a/python/pylibcudf/pylibcudf/reduce.pxd +++ b/python/pylibcudf/pylibcudf/reduce.pxd @@ -1,6 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.libcudf.reduce cimport scan_type +from rmm.pylibrmm.stream cimport Stream from .aggregation cimport Aggregation from .column cimport Column @@ -8,8 +9,8 @@ from .scalar cimport Scalar from .types cimport DataType -cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type) +cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type, Stream stream = *) -cpdef Column scan(Column col, Aggregation agg, scan_type inclusive) +cpdef Column scan(Column col, Aggregation agg, scan_type inclusive, Stream stream = *) -cpdef tuple minmax(Column col) +cpdef tuple minmax(Column col, Stream stream = *) diff --git a/python/pylibcudf/pylibcudf/reduce.pyi b/python/pylibcudf/pylibcudf/reduce.pyi index a09949b7b30..dd7d103b210 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyi +++ b/python/pylibcudf/pylibcudf/reduce.pyi @@ -2,6 +2,8 @@ from enum import IntEnum +from rmm.pylibrmm.stream import Stream + from pylibcudf.aggregation import Aggregation from pylibcudf.column import Column from pylibcudf.scalar import Scalar @@ -11,6 +13,18 @@ class ScanType(IntEnum): INCLUSIVE = ... EXCLUSIVE = ... -def reduce(col: Column, agg: Aggregation, data_type: DataType) -> Scalar: ... -def scan(col: Column, agg: Aggregation, inclusive: ScanType) -> Column: ... -def minmax(col: Column) -> tuple[Scalar, Scalar]: ... +def reduce( + col: Column, + agg: Aggregation, + data_type: DataType, + stream: Stream | None = None, +) -> Scalar: ... +def scan( + col: Column, + agg: Aggregation, + inclusive: ScanType, + stream: Stream | None = None, +) -> Column: ... +def minmax( + col: Column, stream: Stream | None = None +) -> tuple[Scalar, Scalar]: ... diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index 1fa10dcd376..da9b0a20134 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -8,17 +8,25 @@ from pylibcudf.libcudf.aggregation cimport reduce_aggregation, scan_aggregation from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.reduce cimport scan_type from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.types cimport null_policy +from rmm.pylibrmm.stream cimport Stream from .aggregation cimport Aggregation from .column cimport Column from .scalar cimport Scalar from .types cimport DataType +from .utils cimport _get_stream from pylibcudf.libcudf.reduce import scan_type as ScanType # no-cython-lint __all__ = ["ScanType", "minmax", "reduce", "scan"] -cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): +cpdef Scalar reduce( + Column col, + Aggregation agg, + DataType data_type, + Stream stream=None +): """Perform a reduction on a column For details, see ``cudf::reduce`` documentation. 
@@ -31,6 +39,8 @@ cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): The aggregation to perform. data_type : DataType The data type of the result. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -39,16 +49,20 @@ cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): """ cdef unique_ptr[scalar] result cdef const reduce_aggregation *c_agg = agg.view_underlying_as_reduce() + + stream = _get_stream(stream) + with nogil: result = cpp_reduce.cpp_reduce( col.view(), dereference(c_agg), - data_type.c_obj + data_type.c_obj, + stream.view() ) - return Scalar.from_libcudf(move(result)) + return Scalar.from_libcudf(move(result), stream) -cpdef Column scan(Column col, Aggregation agg, scan_type inclusive): +cpdef Column scan(Column col, Aggregation agg, scan_type inclusive, Stream stream=None): """Perform a scan on a column For details, see ``cudf::scan`` documentation. @@ -61,6 +75,8 @@ cpdef Column scan(Column col, Aggregation agg, scan_type inclusive): The aggregation to perform. inclusive : scan_type The type of scan to perform. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -69,16 +85,21 @@ cpdef Column scan(Column col, Aggregation agg, scan_type inclusive): """ cdef unique_ptr[column] result cdef const scan_aggregation *c_agg = agg.view_underlying_as_scan() + + stream = _get_stream(stream) + with nogil: result = cpp_reduce.cpp_scan( col.view(), dereference(c_agg), inclusive, + null_policy.EXCLUDE, + stream.view(), ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) -cpdef tuple minmax(Column col): +cpdef tuple minmax(Column col, Stream stream=None): """Compute the minimum and maximum of a column For details, see ``cudf::minmax`` documentation. @@ -87,6 +108,8 @@ cpdef tuple minmax(Column col): ---------- col : Column The column to compute the minimum and maximum of. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -95,12 +118,15 @@ cpdef tuple minmax(Column col): being the maximum. """ cdef pair[unique_ptr[scalar], unique_ptr[scalar]] result + + stream = _get_stream(stream) + with nogil: - result = cpp_reduce.cpp_minmax(col.view()) + result = cpp_reduce.cpp_minmax(col.view(), stream.view()) return ( - Scalar.from_libcudf(move(result.first)), - Scalar.from_libcudf(move(result.second)), + Scalar.from_libcudf(move(result.first), stream), + Scalar.from_libcudf(move(result.second), stream), ) ScanType.__str__ = ScanType.__repr__ diff --git a/python/pylibcudf/pylibcudf/search.pxd b/python/pylibcudf/pylibcudf/search.pxd index 0faf18b108f..4499a7fe131 100644 --- a/python/pylibcudf/pylibcudf/search.pxd +++ b/python/pylibcudf/pylibcudf/search.pxd @@ -1,4 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+ +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table @@ -9,6 +11,7 @@ cpdef Column lower_bound( Table needles, list column_order, list null_precedence, + Stream stream = * ) cpdef Column upper_bound( @@ -16,6 +19,7 @@ cpdef Column upper_bound( Table needles, list column_order, list null_precedence, + Stream stream = * ) -cpdef Column contains(Column haystack, Column needles) +cpdef Column contains(Column haystack, Column needles, Stream stream = *) diff --git a/python/pylibcudf/pylibcudf/search.pyi b/python/pylibcudf/pylibcudf/search.pyi index 7f292b129b2..3eec007a40c 100644 --- a/python/pylibcudf/pylibcudf/search.pyi +++ b/python/pylibcudf/pylibcudf/search.pyi @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.table import Table from pylibcudf.types import NullOrder, Order @@ -9,11 +11,15 @@ def lower_bound( needles: Table, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Column: ... def upper_bound( haystack: Table, needles: Table, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, +) -> Column: ... +def contains( + haystack: Column, needles: Column, stream: Stream | None = None ) -> Column: ... -def contains(haystack: Column, needles: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 32a133213d8..e3263348048 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -6,9 +6,11 @@ from libcpp.vector cimport vector from pylibcudf.libcudf cimport search as cpp_search from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.types cimport null_order, order +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table +from .utils cimport _get_stream __all__ = ["contains", "lower_bound", "upper_bound"] @@ -17,6 +19,7 @@ cpdef Column lower_bound( Table needles, list column_order, list null_precedence, + Stream stream=None ): """Find smallest indices in haystack where needles may be inserted to retain order. @@ -32,6 +35,8 @@ cpdef Column lower_bound( Whether each column should be sorted in ascending or descending order. null_precedence : List[NullOrder] Whether nulls should come before or after non-nulls. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -41,14 +46,18 @@ cpdef Column lower_bound( cdef unique_ptr[column] c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_search.lower_bound( haystack.view(), needles.view(), c_orders, c_null_precedence, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column upper_bound( @@ -56,6 +65,7 @@ cpdef Column upper_bound( Table needles, list column_order, list null_precedence, + Stream stream=None ): """Find largest indices in haystack where needles may be inserted to retain order. @@ -71,6 +81,8 @@ cpdef Column upper_bound( Whether each column should be sorted in ascending or descending order. null_precedence : List[NullOrder] Whether nulls should come before or after non-nulls. + stream : Stream | None + CUDA stream on which to perform the operation. 
 Returns
 -------
@@ -80,17 +92,21 @@ cpdef Column upper_bound(
     cdef unique_ptr[column] c_result
     cdef vector[order] c_orders = column_order
     cdef vector[null_order] c_null_precedence = null_precedence
+
+    stream = _get_stream(stream)
+
     with nogil:
         c_result = cpp_search.upper_bound(
             haystack.view(),
             needles.view(),
             c_orders,
             c_null_precedence,
+            stream.view(),
         )
-    return Column.from_libcudf(move(c_result))
+    return Column.from_libcudf(move(c_result), stream)


-cpdef Column contains(Column haystack, Column needles):
+cpdef Column contains(Column haystack, Column needles, Stream stream=None):
     """Check whether needles are present in haystack.

     For details, see :cpp:func:`contains`.
@@ -101,6 +117,8 @@ cpdef Column contains(Column haystack, Column needles):
         The search space.
     needles : Column
         The values for which to search.
+    stream : Stream | None
+        CUDA stream on which to perform the operation.

     Returns
     -------
@@ -108,9 +126,13 @@ cpdef Column contains(Column haystack, Column needles):
         Boolean indicator for each needle.
     """
     cdef unique_ptr[column] c_result
+
+    stream = _get_stream(stream)
+
     with nogil:
         c_result = cpp_search.contains(
             haystack.view(),
             needles.view(),
+            stream.view(),
         )
-    return Column.from_libcudf(move(c_result))
+    return Column.from_libcudf(move(c_result), stream)

From 7492fc345e8866090609aef1d37bd193379bb6bf Mon Sep 17 00:00:00 2001
From: Peixin
Date: Wed, 6 Aug 2025 12:45:52 +0800
Subject: [PATCH 068/366] Update spark-rapids-jni action to use the PR's base.ref and fix the ccache version issue in the Dockerfile (#19603)

Follow-up of https://github.com/rapidsai/cudf/pull/19500

1. Provide env.sh as a common place for build environment variables
2. Update the ccache version so that compilation passes with gcc 14
3. Update the spark-rapids-jni workflow to check out the cudf PR's base ref instead of the repo's default branch

```
++ export 'sclCMD=scl enable gcc-toolset-14'
++ sclCMD='scl enable gcc-toolset-14'
+ CMAKE_CUDA_ARCHITECTURES=90
+ LIBCUDF_DEPENDENCY_MODE=latest
+ USE_GDS=on
+ scl enable gcc-toolset-14 build/buildcpp.sh
```

Authors:
  - Peixin (https://github.com/pxLi)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/19603
---
 .github/workflows/spark-rapids-jni.yaml |  3 ++-
 java/ci/Dockerfile.rocky                |  4 ++--
 java/ci/README.md                       |  3 ++-
 java/ci/env.sh                          | 19 +++++++++++++++++++
 4 files changed, 25 insertions(+), 4 deletions(-)
 create mode 100644 java/ci/env.sh

diff --git a/.github/workflows/spark-rapids-jni.yaml b/.github/workflows/spark-rapids-jni.yaml
index f317c7cf531..f4c168aff1a 100644
--- a/.github/workflows/spark-rapids-jni.yaml
+++ b/.github/workflows/spark-rapids-jni.yaml
@@ -13,10 +13,11 @@ jobs:
         with:
           repository: NVIDIA/spark-rapids-jni
           submodules: recursive
+          ref: ${{ github.event.pull_request.base.ref }}
       - uses: actions/checkout@v4
         with:
           path: thirdparty/cudf
       - name: "Build spark-rapids-jni"
         run: |
           mkdir target
-          CMAKE_CUDA_ARCHITECTURES=90 LIBCUDF_DEPENDENCY_MODE=latest USE_GDS=on scl enable gcc-toolset-14 build/buildcpp.sh
+          source build/env.sh && CMAKE_CUDA_ARCHITECTURES=90 LIBCUDF_DEPENDENCY_MODE=latest USE_GDS=on ${sclCMD} build/buildcpp.sh
diff --git a/java/ci/Dockerfile.rocky b/java/ci/Dockerfile.rocky
index e7cf319663d..d6eff8125fa 100644
--- a/java/ci/Dockerfile.rocky
+++ b/java/ci/Dockerfile.rocky
@@ -27,13 +27,14 @@ ARG TARGETPLATFORM=linux/amd64
 # check available official arm-based docker images at https://hub.docker.com/r/nvidia/cuda/tags (OS/ARCH)
 FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE
 ARG TOOLSET_VERSION=14
+ARG CMAKE_VERSION=3.30.7
+ARG CCACHE_VERSION=4.11.2
 ### Install basic requirements
 RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-11 gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build boost-devel
 ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins
 RUN mkdir /usr/local/rapids /rapids && chmod 777 /usr/local/rapids /rapids
 # 3.22.3+: CUDA architecture 'native' support + flexible CMAKE_<LANG>_*_LAUNCHER for ccache
-ARG CMAKE_VERSION=3.30.7
 # default x86_64 from x86 build, aarch64 cmake for arm build
 ARG CMAKE_ARCH=x86_64
 RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \
@@ -42,7 +43,6 @@ RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/down
 ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}/bin:$PATH
 # ccache for interactive builds
-ARG CCACHE_VERSION=4.6
 RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \
     tar zxf ccache-${CCACHE_VERSION}.tar.gz && \
     rm ccache-${CCACHE_VERSION}.tar.gz && \
diff --git a/java/ci/README.md b/java/ci/README.md
index 22a2dd618f6..90112d838f3 100644
--- a/java/ci/README.md
+++ b/java/ci/README.md
@@ -42,7 +42,8 @@ git clone --recursive https://github.com/rapidsai/cudf.git -b branch-25.10
 ```bash
 cd cudf
 export WORKSPACE=`pwd`
-scl enable gcc-toolset-14 "java/ci/build-in-docker.sh"
+source java/ci/env.sh
+${sclCMD} "java/ci/build-in-docker.sh"
 ```

 ### The output
diff --git a/java/ci/env.sh b/java/ci/env.sh
new file mode 100644
index 00000000000..1772437205d
--- /dev/null
+++ b/java/ci/env.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -ex
+export sclCMD=${sclCMD:-"scl enable gcc-toolset-14"}

From 8f26dc4b707a598ae24a64c061ad011c6ecea799 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 6 Aug 2025 10:47:53 -0400
Subject: [PATCH 069/366] Pin Narwhals to 1.47 (#19358)

Use Narwhals version 1.47 in CI

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/19358
---
 ci/test_narwhals.sh                           | 25 ++++++++++++++++---
 dependencies.yaml                             |  2 +-
 .../cudf/cudf/testing/narwhals_test_plugin.py |  2 ++
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/ci/test_narwhals.sh b/ci/test_narwhals.sh
index 6af3be24b3b..a6ade73e5f3 100755
--- a/ci/test_narwhals.sh
+++ b/ci/test_narwhals.sh
@@ -32,6 +32,9 @@ python -c "import narwhals; print(narwhals.show_versions())"
 # test_rolling_std_expr_lazy_grouped: xpassing in Narwhals
 # test_rolling_sum_expr_lazy_grouped: xpassing in Narwhals
 # test_rolling_var_expr_lazy_grouped: xpassing in Narwhals
+# test_offset_by_tz: xpassing in Narwhals
+# test_double_same_aggregation: xpassing in Narwhals
+# test_all_kind_of_aggs: xpassing in Narwhals
 TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF="not test_rolling_mean_expr_lazy_grouped[cudf-expected_a4-3-1-True] \
 and not test_rolling_mean_expr_lazy_grouped[cudf-expected_a5-4-1-True] \
 and not test_rolling_mean_expr_lazy_grouped[cudf-expected_a6-5-1-True] \
 and not test_rolling_sum_expr_lazy_grouped[cudf-expected_a4-3-1-True] \
 and not test_rolling_sum_expr_lazy_grouped[cudf-expected_a5-4-1-True] \
 and not test_rolling_sum_expr_lazy_grouped[cudf-expected_a6-5-1-True] \
 and not test_rolling_var_expr_lazy_grouped[cudf-expected_a4-3-1-True-1] \
 and not test_rolling_var_expr_lazy_grouped[cudf-expected_a5-4-1-True-1] \
 and not test_rolling_var_expr_lazy_grouped[cudf-expected_a6-5-1-True-0] \
-and not test_horizontal_slice_with_series"
+and not test_horizontal_slice_with_series \
+and not test_offset_by_tz \
+and not test_double_same_aggregation \
+and not test_all_kind_of_aggs"

 rapids-logger "Run narwhals tests for cuDF"
 PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 python -m pytest \
@@ -63,9 +69,11 @@ PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 python -m pytest \
 # columns aren't object anymore. The test expects object, causing a mismatch.
 # test_nan: Narwhals expects this test to fail, but as of polars 1.30 we raise a RuntimeError,
 # not polars ComputeError. So the test is looking for the wrong error and fails.
+# test_floordiv_int_by_zero: This bug is fixed as of 25.08; Narwhals should remove the xfail
 TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF_POLARS=" \
 test_dtypes or \
-test_nan \
+test_nan or \
+test_floordiv_int_by_zero \
 "

 rapids-logger "Run narwhals tests for cuDF Polars"
@@ -92,12 +100,23 @@ rapids-logger "Run narwhals tests for cuDF Pandas"
 # test_maybe_convert_dtypes_pandas: https://github.com/rapidsai/cudf/issues/14149
 # test_log_dtype_pandas: cudf is promoting the type to float64
 # test_len_over_2369: It fails during fallback. The error is 'DataFrame' object has no attribute 'to_frame'
+# test_all_ignore_nulls, test_allh_kleene, and test_anyh_kleene: https://github.com/rapidsai/cudf/issues/19417
+# test_offset_by_date_pandas: https://github.com/rapidsai/cudf/issues/19418
+# test_select_boolean_cols and test_select_boolean_cols_multi_group_by: https://github.com/rapidsai/cudf/issues/19421
+# test_to_datetime_pd_preserves_pyarrow_backend_dtype: https://github.com/rapidsai/cudf/issues/19422
 TESTS_THAT_NEED_CUDF_FIX=" \
 test_is_finite_expr or \
 test_is_finite_series or \
 test_maybe_convert_dtypes_pandas or \
 test_log_dtype_pandas or \
-test_len_over_2369 \
+test_len_over_2369 or \
+test_all_ignore_nulls or \
+test_allh_kleene or \
+test_anyh_kleene or \
+test_offset_by_date_pandas or \
+test_select_boolean_cols or \
+test_select_boolean_cols_multi_group_by or \
+test_to_datetime_pd_preserves_pyarrow_backend_dtype \
 "

 # test_array_dunder_with_copy: https://github.com/rapidsai/cudf/issues/18248#issuecomment-2719234741
diff --git a/dependencies.yaml b/dependencies.yaml
index 7e53ff8a959..f214bde574e 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -1120,4 +1120,4 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - narwhals==1.41
+          - narwhals==1.47
diff --git a/python/cudf/cudf/testing/narwhals_test_plugin.py b/python/cudf/cudf/testing/narwhals_test_plugin.py
index 41f38999f6e..4355397dd78 100644
--- a/python/cudf/cudf/testing/narwhals_test_plugin.py
+++ b/python/cudf/cudf/testing/narwhals_test_plugin.py
@@ -13,6 +13,8 @@
 EXPECTED_FAILURES: Mapping[str, str] = {
     "tests/frame/select_test.py::test_select_duplicates[cudf]": "cuDF doesn't support having multiple columns with same names",
     "tests/expr_and_series/lit_test.py::test_date_lit[cudf]": "cuDF does not support pa.date32()",
+    "tests/frame/group_by_test.py::test_group_by_no_preserve_dtype[cudf-time]": "multiple dtype in the same column",
+    "tests/frame/group_by_test.py::test_group_by_no_preserve_dtype[cudf-bytes]": "cuDF doesn't support arrow TIME32",
 }

From 8bbed010a038dcce8d41fdb0c30eb3c33016d152 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Wed, 6 Aug 2025 12:32:50 -0500
Subject: [PATCH 070/366] Compatibility with rapidsmpf 25.10.0 (#19591)

https://github.com/rapidsai/rapidsmpf/pull/404 contained changes to the
insert / extract interface that we use in cudf-polars. This updates our
usage to be compatible with the new interface, which requires providing
a `BufferResource` (obtained from the worker context) rather than a
`MemoryResource`.
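
For reference, the new insert-side calling convention looks roughly like the
sketch below, which mirrors the diff in this patch. `df` (a cudf-polars
`DataFrame`), `shuffler`, and the `columns_to_hash`/`num_partitions` values
are assumed or illustrative, not part of the change itself:

```
# Sketch: the BufferResource now comes from the worker context (context.br)
# rather than from rmm.mr.get_current_device_resource().
from rapidsmpf.integrations.cudf.partition import partition_and_pack
from rapidsmpf.integrations.dask.core import get_worker_context
from rmm.pylibrmm.stream import DEFAULT_STREAM

context = get_worker_context()
packed_inputs = partition_and_pack(
    df.table,              # pylibcudf table backing the cudf-polars DataFrame
    columns_to_hash=(0,),  # illustrative
    num_partitions=4,      # illustrative
    br=context.br,         # BufferResource from the worker context
    stream=DEFAULT_STREAM,
)
shuffler.insert_chunks(packed_inputs)
```

The extract side follows the same pattern, threading `context.br` through
`unspill_partitions` and `unpack_and_concat` as shown in the diff.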

Authors:
  - Tom Augspurger (https://github.com/TomAugspurger)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/cudf/pull/19591
---
 .../cudf_polars/experimental/shuffle.py       | 33 ++++++++++++++++---
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/experimental/shuffle.py b/python/cudf_polars/cudf_polars/experimental/shuffle.py
index c135261ed32..b6ecfc35fa8 100644
--- a/python/cudf_polars/cudf_polars/experimental/shuffle.py
+++ b/python/cudf_polars/cudf_polars/experimental/shuffle.py
@@ -8,7 +8,6 @@
 from typing import TYPE_CHECKING, Any, TypedDict

 import pylibcudf as plc
-import rmm.mr
 from rmm.pylibrmm.stream import DEFAULT_STREAM

 from cudf_polars.containers import DataFrame
@@ -58,6 +57,14 @@ def insert_partition(
 ) -> None:
     """Add cudf-polars DataFrame chunks to an RMP shuffler."""
     from rapidsmpf.integrations.cudf.partition import partition_and_pack
+    from rapidsmpf.integrations.dask.core import get_worker_context
+
+    context = get_worker_context()
+
+    if context.br is None:  # pragma: no cover
+        raise ValueError(
+            "rapidsmpf insert_partition called on an uninitialized worker."
+        )

     on = options["on"]
     assert not other, f"Unexpected arguments: {other}"
@@ -66,8 +73,8 @@ def insert_partition(
         df.table,
         columns_to_hash=columns_to_hash,
         num_partitions=partition_count,
+        br=context.br,
         stream=DEFAULT_STREAM,
-        device_mr=rmm.mr.get_current_device_resource(),
     )
     shuffler.insert_chunks(packed_inputs)

@@ -79,16 +86,32 @@ def extract_partition(
     options: ShuffleOptions,
 ) -> DataFrame:
     """Extract a finished partition from the RMP shuffler."""
-    from rapidsmpf.integrations.cudf.partition import unpack_and_concat
+    from rapidsmpf.integrations.cudf.partition import (
+        unpack_and_concat,
+        unspill_partitions,
+    )
+    from rapidsmpf.integrations.dask.core import get_worker_context
+
+    context = get_worker_context()
+    if context.br is None:  # pragma: no cover
+        raise ValueError(
+            "rapidsmpf extract_partition called on an uninitialized worker."
+        )

     shuffler.wait_on(partition_id)
     column_names = options["column_names"]
     dtypes = options["dtypes"]
     return DataFrame.from_table(
         unpack_and_concat(
-            shuffler.extract(partition_id),
+            unspill_partitions(
+                shuffler.extract(partition_id),
+                br=context.br,
+                stream=DEFAULT_STREAM,
+                allow_overbooking=True,
+                statistics=context.statistics,
+            ),
+            br=context.br,
             stream=DEFAULT_STREAM,
-            device_mr=rmm.mr.get_current_device_resource(),
         ),
         column_names,
         dtypes,

From aaaa0aa053baf95c7cdce8e6c2190c28752d1186 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 6 Aug 2025 11:01:20 -0700
Subject: [PATCH 071/366] Move some test_multiindex.py tests to the new cudf classic test directory structure (#19496)

Towards https://github.com/rapidsai/cudf/issues/9999
Towards https://github.com/rapidsai/cudf/issues/15723

Other tests in this file will be moved once the `dataframe`/`series`
directory structures have been moved in
https://github.com/rapidsai/cudf/pull/19485 and
https://github.com/rapidsai/cudf/pull/19490

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19496
---
 .../tests/dataframe/methods/test_transpose.py |   16 +
 .../indexes/multiindex/methods/__init__.py    |    0
 .../multiindex/methods/test_argsort.py        |   53 +
 .../indexes/multiindex/methods/test_copy.py   |  128 ++
 .../multiindex/methods/test_droplevel.py      |  129 ++
 .../indexes/multiindex/methods/test_fillna.py |   54 +
 .../indexes/multiindex/methods/test_numpy.py  |   16 +
 .../multiindex/methods/test_nunique.py        |   18 +
 .../indexes/multiindex/methods/test_rename.py |   51 +
 .../multiindex/methods/test_set_methods.py    |  140 ++
 .../multiindex/methods/test_set_names.py      |  102 ++
 .../multiindex/methods/test_sort_values.py    |   52 +
 .../multiindex/methods/test_swaplevel.py      |   29 +
 .../indexes/multiindex/methods/test_take.py   |   41 +
 .../multiindex/methods/test_to_frame.py       |  103 ++
 .../multiindex/methods/test_to_pandas.py      |   41 +
 .../multiindex/methods/test_to_series.py      |   11 +
 .../indexes/multiindex/methods/test_unique.py |   16 +
 .../indexes/multiindex/test_attributes.py     |  214 +++
 .../tests/indexes/multiindex/test_binaryop.py |   14 +
 .../indexes/multiindex/test_constructing.py   |    1 -
 .../indexes/multiindex/test_constructors.py   |  183 +++
 .../tests/indexes/multiindex/test_getitem.py  |   13 +
 .../indexes/multiindex/test_properties.py     |    1 -
 .../indexes/multiindex/test_selecting.py      |    1 -
 python/cudf/cudf/tests/test_multiindex.py     | 1331 +----------------
 26 files changed, 1425 insertions(+), 1333 deletions(-)
 create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_transpose.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/__init__.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_argsort.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_droplevel.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_fillna.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_numpy.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_nunique.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_rename.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_set_methods.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_set_names.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_sort_values.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_swaplevel.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_take.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_to_frame.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_to_pandas.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_to_series.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_unique.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/test_attributes.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/test_binaryop.py
 delete mode 100644 python/cudf/cudf/tests/indexes/multiindex/test_constructing.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/test_constructors.py
 create mode 100644 python/cudf/cudf/tests/indexes/multiindex/test_getitem.py
 delete mode 100644 python/cudf/cudf/tests/indexes/multiindex/test_properties.py
 delete mode 100644 python/cudf/cudf/tests/indexes/multiindex/test_selecting.py

diff --git a/python/cudf/cudf/tests/dataframe/methods/test_transpose.py b/python/cudf/cudf/tests/dataframe/methods/test_transpose.py
new file mode 100644
index 00000000000..bb64e6d4896
--- /dev/null
+++ b/python/cudf/cudf/tests/dataframe/methods/test_transpose.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+
+import pandas as pd
+
+import cudf
+from cudf.testing import assert_eq
+
+
+def test_multiindex_transpose():
+    pdf = pd.DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6]},
+        index=pd.MultiIndex.from_tuples([(1, 2), (3, 4), (5, 6)]),
+    )
+    gdf = cudf.from_pandas(pdf)
+    assert_eq(pdf.transpose(), gdf.transpose())
diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/__init__.py b/python/cudf/cudf/tests/indexes/multiindex/methods/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_argsort.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_argsort.py
new file mode 100644
index 00000000000..675c7cb69e9
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_argsort.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "pdi", + [ + pd.MultiIndex( + levels=[[1, 3.0, 4, 5], [1, 2.3, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + pd.MultiIndex( + levels=[[1, 3, 4, -10], [1, 11, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + pd.MultiIndex( + levels=[["a", "b", "c", "100"], ["1", "100", "5"]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + pytest.param( + pd.MultiIndex( + levels=[[None, "b", "c", "a"], ["1", None, "5"]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + marks=[ + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/35584" + ) + ], + ), + ], +) +@pytest.mark.parametrize("ascending", [True, False]) +def test_multiindex_argsort(pdi, ascending): + gdi = cudf.from_pandas(pdi) + + if not ascending: + expected = pdi.argsort()[::-1] + else: + expected = pdi.argsort() + + actual = gdi.argsort(ascending=ascending) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py new file mode 100644 index 00000000000..1bae3b8292c --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import operator +from functools import reduce + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_multiindex_copy_sem(): + gmi = pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019]], + ) + pmi = cudf.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019]], + ) + names = ["x", "y"] + gmi_copy = gmi.copy(names=names) + pmi_copy = pmi.copy(names=names) + assert_eq(gmi_copy, pmi_copy) + + +@pytest.mark.parametrize( + "data", + [ + { + "Date": [ + "2020-08-27", + "2020-08-28", + "2020-08-31", + "2020-08-27", + "2020-08-28", + "2020-08-31", + "2020-08-27", + "2020-08-28", + "2020-08-31", + ], + "Close": [ + 3400.00, + 3401.80, + 3450.96, + 226.58, + 228.91, + 225.53, + 505.13, + 525.91, + 534.98, + ], + "Symbol": [ + "AMZN", + "AMZN", + "AMZN", + "MSFT", + "MSFT", + "MSFT", + "NVDA", + "NVDA", + "NVDA", + ], + }, + pd.MultiIndex( + levels=[[1001, 1002], [2001, 2002]], + codes=[[1, 1, 0, 0], [0, 1, 0, 1]], + names=["col1", "col2"], + ), + ], +) +@pytest.mark.parametrize("copy_on_write", [True, False]) +@pytest.mark.parametrize("deep", [True, False]) +def test_multiindex_copy_deep(data, copy_on_write, deep): + """Test memory identity for deep copy + Case1: Constructed from GroupBy, StringColumns + Case2: Constructed from MultiIndex, NumericColumns + """ + with cudf.option_context("copy_on_write", copy_on_write): + if isinstance(data, dict): + gdf = cudf.DataFrame(data) + mi1 = gdf.groupby(["Date", "Symbol"]).mean().index + mi2 = mi1.copy(deep=deep) + + lchildren = [col.children for col in mi1._columns] + rchildren = [col.children for col in mi2._columns] + + # Flatten + lchildren = reduce(operator.add, lchildren) + rchildren = reduce(operator.add, rchildren) + + lptrs = [ + child.base_data.get_ptr(mode="read") for child in lchildren + ] + rptrs = [ + child.base_data.get_ptr(mode="read") for child in rchildren + ] + + assert all((x == y) for x, y in zip(lptrs, rptrs)) + + elif isinstance(data, pd.MultiIndex): + data = cudf.MultiIndex.from_pandas(data) + same_ref = (not deep) or ( + cudf.get_option("copy_on_write") and not deep 
+ ) + mi1 = data + mi2 = mi1.copy(deep=deep) + + # Assert ._levels identity + lptrs = [ + lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels + ] + rptrs = [ + lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels + ] + + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) + + # Assert ._codes identity + lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] + rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] + + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) + + # Assert ._data identity + lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] + rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] + + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_droplevel.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_droplevel.py new file mode 100644 index 00000000000..37048f0633e --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_droplevel.py @@ -0,0 +1,129 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import itertools + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "level", + [ + [], + "alpha", + "location", + "weather", + 0, + 1, + [0, 1], + -1, + [-1, -2], + [-1, "weather"], + ], +) +def test_multiindex_droplevel_simple(level): + pdfIndex = pd.MultiIndex( + [ + ["a", "b", "c"], + ["house", "store", "forest"], + ["clouds", "clear", "storm"], + ["fire", "smoke", "clear"], + [ + np.datetime64("2001-01-01", "ns"), + np.datetime64("2002-01-01", "ns"), + np.datetime64("2003-01-01", "ns"), + ], + ], + [ + [0, 0, 0, 0, 1, 1, 2], + [1, 1, 1, 1, 0, 0, 2], + [0, 0, 2, 2, 2, 0, 1], + [0, 0, 0, 1, 2, 0, 1], + [1, 0, 1, 2, 0, 0, 1], + ], + ) + pdfIndex.names = ["alpha", "location", "weather", "sign", "timestamp"] + gdfIndex = cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex.droplevel(level), gdfIndex.droplevel(level)) + + +@pytest.mark.parametrize( + "level", + itertools.chain( + *( + itertools.combinations( + ("alpha", "location", "weather", "sign", "timestamp"), r + ) + for r in range(5) + ) + ), +) +def test_multiindex_droplevel_name(level): + pdfIndex = pd.MultiIndex( + [ + ["a", "b", "c"], + ["house", "store", "forest"], + ["clouds", "clear", "storm"], + ["fire", "smoke", "clear"], + [ + np.datetime64("2001-01-01", "ns"), + np.datetime64("2002-01-01", "ns"), + np.datetime64("2003-01-01", "ns"), + ], + ], + [ + [0, 0, 0, 0, 1, 1, 2], + [1, 1, 1, 1, 0, 0, 2], + [0, 0, 2, 2, 2, 0, 1], + [0, 0, 0, 1, 2, 0, 1], + [1, 0, 1, 2, 0, 0, 1], + ], + ) + pdfIndex.names = ["alpha", "location", "weather", "sign", "timestamp"] + level = list(level) + gdfIndex = cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex.droplevel(level), gdfIndex.droplevel(level)) + + +@pytest.mark.parametrize( + "level", + itertools.chain(*(itertools.combinations(range(5), r) for r in range(5))), +) +def test_multiindex_droplevel_index(level): + pdfIndex = pd.MultiIndex( + [ + ["a", "b", "c"], + ["house", "store", "forest"], + ["clouds", "clear", "storm"], + ["fire", "smoke", "clear"], + [ + np.datetime64("2001-01-01", "ns"), + np.datetime64("2002-01-01", "ns"), + np.datetime64("2003-01-01", "ns"), + ], + ], + [ + [0, 0, 0, 0, 1, 1, 2], + [1, 1, 1, 1, 0, 0, 2], + [0, 0, 2, 2, 2, 0, 1], + [0, 0, 0, 1, 2, 0, 1], + [1, 0, 1, 2, 0, 0, 1], + ], + ) + pdfIndex.names = ["alpha", "location", "weather", "sign", "timestamp"] + level = list(level) + gdfIndex 
= cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex.droplevel(level), gdfIndex.droplevel(level)) + + +def test_multiindex_droplevel_single_level_none_names(): + data = [(1, 2), (3, 4)] + pidx = pd.MultiIndex.from_tuples(data, names=[None, None]) + gidx = cudf.MultiIndex.from_tuples(data, names=[None, None]) + result = gidx.droplevel(0) + expected = pidx.droplevel(0) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_fillna.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_fillna.py new file mode 100644 index 00000000000..c3d23e68008 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_fillna.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "gdi, fill_value, expected", + [ + ( + lambda: cudf.MultiIndex( + levels=[[1, 3, 4, None], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + 5, + lambda: cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + ), + ( + lambda: cudf.MultiIndex( + levels=[[1, 3, 4, None], [1, None, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + 100, + lambda: cudf.MultiIndex( + levels=[[1, 3, 4, 100], [1, 100, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + ), + ( + lambda: cudf.MultiIndex( + levels=[["a", "b", "c", None], ["1", None, "5"]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + "100", + lambda: cudf.MultiIndex( + levels=[["a", "b", "c", "100"], ["1", "100", "5"]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + ), + ], +) +def test_multiindex_fillna(gdi, fill_value, expected): + assert_eq(expected(), gdi().fillna(fill_value)) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_numpy.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_numpy.py new file mode 100644 index 00000000000..e3b19b8e7d8 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_numpy.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf +from cudf.testing import assert_eq + + +def test_multiindex_to_numpy(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + pmidx = midx.to_pandas() + + assert_eq(midx.to_numpy(), pmidx.to_numpy()) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_nunique.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_nunique.py new file mode 100644 index 00000000000..266e799267f --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_nunique.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import pandas as pd +import pytest + +import cudf + + +@pytest.mark.parametrize("array", [[1, 2], [1, None], [None, None]]) +@pytest.mark.parametrize("dropna", [True, False]) +def test_nunique(array, dropna): + arrays = [array, [3, 4]] + gidx = cudf.MultiIndex.from_arrays(arrays) + pidx = pd.MultiIndex.from_arrays(arrays) + result = gidx.nunique(dropna=dropna) + expected = pidx.nunique(dropna=dropna) + assert result == expected diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_rename.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_rename.py new file mode 100644 index 00000000000..c0e4f9252f0 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_rename.py @@ -0,0 +1,51 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize("name", [None, "old name"]) +@pytest.mark.parametrize( + "names", + [ + [None, None], + ["a", None], + ["new name", "another name"], + [1, None], + [2, 3], + [42, "name"], + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_multiindex_rename(name, names, inplace): + pi = pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019]], names=[name, None] + ) + gi = cudf.from_pandas(pi) + + expected = pi.rename(names=names, inplace=inplace) + actual = gi.rename(names=names, inplace=inplace) + + if inplace: + expected, actual = pi, gi + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "names", ["plain string", 123, ["str"], ["l1", "l2", "l3"]] +) +def test_multiindex_rename_error(names): + pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]) + gi = cudf.from_pandas(pi) + + assert_exceptions_equal( + lfunc=pi.rename, + rfunc=gi.rename, + lfunc_args_and_kwargs=([], {"names": names}), + rfunc_args_and_kwargs=([], {"names": names}), + ) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_set_methods.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_set_methods.py new file mode 100644 index 00000000000..51731e425b4 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_set_methods.py @@ -0,0 +1,140 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +def test_multiindex_union_error(): + midx = cudf.MultiIndex.from_tuples([(10, 12), (8, 9), (3, 4)]) + pidx = midx.to_pandas() + + assert_exceptions_equal( + midx.union, + pidx.union, + lfunc_args_and_kwargs=(["a"],), + rfunc_args_and_kwargs=(["b"],), + ) + + +@pytest.mark.parametrize( + "idx1, idx2", + [ + ( + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[1, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], + names=["a", "b"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], + names=["x", "y"], + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + ), + ), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_intersection_mulitIndex(idx1, idx2, sort): + expected = idx1.intersection(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) + idx2 = cudf.from_pandas(idx2) + + actual = idx1.intersection(idx2, sort=sort) + assert_eq(expected, actual, exact=False) + + +def test_difference(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + midx2 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3, 3], [0, 2, 1, 1, 0, 2]], + names=["x", "y"], + ) + + expected = midx2.to_pandas().difference(midx.to_pandas()) + actual = midx2.difference(midx) + assert isinstance(actual, cudf.MultiIndex) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx1, idx2", + [ + ( + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], + names=["a", "b"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], + names=["x", "y"], + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + [(2, 6, 12)], + ), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_union_mulitIndex(idx1, idx2, sort): + expected = idx1.union(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.MultiIndex) else idx1 + idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.MultiIndex) else idx2 + + actual = idx1.union(idx2, sort=sort) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_set_names.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_set_names.py new file mode 100644 index 00000000000..3b99adc346a --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_set_names.py @@ -0,0 +1,102 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize( + "names", [[None, None], ["a", None], ["new name", "another name"]] +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_multiindex_set_names(names, inplace): + pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]) + gi = cudf.from_pandas(pi) + + expected = pi.set_names(names=names, inplace=inplace) + actual = gi.set_names(names=names, inplace=inplace) + + if inplace: + expected, actual = pi, gi + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("idx_names", [[None, None, None], [1, 0, 2]]) +@pytest.mark.parametrize( + "level, names", + [ + (0, "abc"), + (1, "xyz"), + ([2, 1], ["a", "b"]), + ([0, 1], ["aa", "bb"]), + (None, ["a", "b", "c"]), + (None, ["a", None, "c"]), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_multiindex_set_names_default_and_int_names( + idx_names, level, names, inplace +): + pi = pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], names=idx_names + ) + gi = cudf.from_pandas(pi) + + expected = pi.set_names(names=names, level=level, inplace=inplace) + actual = gi.set_names(names=names, level=level, inplace=inplace) + + if inplace: + expected, actual = pi, gi + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "level, names", + [ + ([None], "abc"), + (["three", "one"], ["a", "b"]), + (["three", 1], ["a", "b"]), + ([0, "three", 1], ["a", "b", "z"]), + (["one", 1, "three"], ["a", "b", "z"]), + (["one", None, "three"], ["a", "b", "z"]), + ([2, 1], ["a", "b"]), + (1, "xyz"), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_multiindex_set_names_string_names(level, names, inplace): + pi = pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], + names=["one", None, "three"], + ) + gi = cudf.from_pandas(pi) + + expected = pi.set_names(names=names, level=level, inplace=inplace) + actual = gi.set_names(names=names, level=level, inplace=inplace) + + if inplace: + expected, actual = pi, gi + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "level, names", [(1, ["a"]), (None, "a"), ([1, 2], ["a"]), (None, ["a"])] +) +def test_multiindex_set_names_error(level, names): + pi = pd.MultiIndex.from_product( + [["python", "cobra"], [2018, 2019], ["aab", "bcd"]] + ) + gi = cudf.from_pandas(pi) + + assert_exceptions_equal( + lfunc=pi.set_names, + rfunc=gi.set_names, + lfunc_args_and_kwargs=([], {"names": names, "level": level}), + rfunc_args_and_kwargs=([], {"names": names, "level": level}), + ) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_sort_values.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_sort_values.py new file mode 100644 index 00000000000..32e19c04a2a --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_sort_values.py @@ -0,0 +1,52 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("return_indexer", [True, False]) +@pytest.mark.parametrize( + "pmidx", + [ + pd.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ), + pd.MultiIndex.from_product( + [["bar", "baz", "foo", "qux"], ["one", "two"]], + names=["first", "second"], + ), + pd.MultiIndex( + levels=[[], [], []], + codes=[[], [], []], + names=["one", "two", "three"], + ), + pd.MultiIndex.from_tuples([(1, 2), (3, 4)]), + ], +) +def test_multiindex_sort_values(pmidx, ascending, return_indexer): + pmidx = pmidx + midx = cudf.from_pandas(pmidx) + + expected = pmidx.sort_values( + ascending=ascending, return_indexer=return_indexer + ) + actual = midx.sort_values( + ascending=ascending, return_indexer=return_indexer + ) + + if return_indexer: + expected_indexer = expected[1] + actual_indexer = actual[1] + + assert_eq(expected_indexer, actual_indexer) + + expected = expected[0] + actual = actual[0] + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_swaplevel.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_swaplevel.py new file mode 100644 index 00000000000..4ef52a24628 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_swaplevel.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf +from cudf.testing import assert_eq + + +def test_multiindex_swaplevel(): + midx = cudf.MultiIndex( + levels=[ + ["lama", "cow", "falcon"], + ["speed", "weight", "length"], + ["first", "second"], + ], + codes=[ + [0, 0, 0, 1, 1, 1, 2, 2, 2], + [0, 1, 2, 0, 1, 2, 0, 1, 2], + [0, 0, 0, 0, 0, 0, 1, 1, 1], + ], + names=["Col1", "Col2", "Col3"], + ) + pd_midx = midx.to_pandas() + + assert_eq(pd_midx.swaplevel(-1, -2), midx.swaplevel(-1, -2)) + assert_eq(pd_midx.swaplevel(2, 1), midx.swaplevel(2, 1)) + assert_eq(midx.swaplevel(2, 1), midx.swaplevel(1, 2)) + assert_eq(pd_midx.swaplevel(0, 2), midx.swaplevel(0, 2)) + assert_eq(pd_midx.swaplevel(2, 0), midx.swaplevel(2, 0)) + assert_eq(midx.swaplevel(1, 1), midx.swaplevel(1, 1)) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_take.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_take.py new file mode 100644 index 00000000000..e8e06d84b70 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_take.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import numpy as np +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_multiindex_take(): + pdfIndex = pd.MultiIndex( + [ + ["a", "b", "c"], + ["house", "store", "forest"], + ["clouds", "clear", "storm"], + ["fire", "smoke", "clear"], + [ + np.datetime64("2001-01-01", "ns"), + np.datetime64("2002-01-01", "ns"), + np.datetime64("2003-01-01", "ns"), + ], + ], + [ + [0, 0, 0, 0, 1, 1, 2], + [1, 1, 1, 1, 0, 0, 2], + [0, 0, 2, 2, 2, 0, 1], + [0, 0, 0, 1, 2, 0, 1], + [1, 0, 1, 2, 0, 0, 1], + ], + ) + pdfIndex.names = ["alpha", "location", "weather", "sign", "timestamp"] + gdfIndex = cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex.take([0]), gdfIndex.take([0])) + assert_eq(pdfIndex.take(np.array([0])), gdfIndex.take(np.array([0]))) + assert_eq(pdfIndex.take(pd.Series([0])), gdfIndex.take(cudf.Series([0]))) + assert_eq(pdfIndex.take([0, 1]), gdfIndex.take([0, 1])) + assert_eq(pdfIndex.take(np.array([0, 1])), gdfIndex.take(np.array([0, 1]))) + assert_eq( + pdfIndex.take(pd.Series([0, 1])), gdfIndex.take(cudf.Series([0, 1])) + ) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_frame.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_frame.py new file mode 100644 index 00000000000..716962bb242 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_frame.py @@ -0,0 +1,103 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import pandas as pd +import pytest + +import cudf +from cudf.api.extensions import no_default +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize("codes", [[0, 1, 2], [-1, 0, 1]]) +def test_multiindex_to_frame(codes): + pdfIndex = pd.MultiIndex( + [ + ["a", "b", "c"], + ], + [ + codes, + ], + ) + gdfIndex = cudf.from_pandas(pdfIndex) + assert_eq(pdfIndex.to_frame(), gdfIndex.to_frame()) + + +@pytest.mark.parametrize( + "pidx", + [ + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "a", "a"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + ), + ], +) +@pytest.mark.parametrize( + "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]] +) +@pytest.mark.parametrize("allow_duplicates", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +def test_multiindex_to_frame_allow_duplicates( + pidx, name, allow_duplicates, index +): + gidx = cudf.from_pandas(pidx) + + if name is None or ( + ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + ) + and not allow_duplicates + and name is no_default + ): + assert_exceptions_equal( + pidx.to_frame, + gidx.to_frame, + lfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + rfunc_args_and_kwargs=( + [], + { + "index": index, + "name": name, + "allow_duplicates": allow_duplicates, + }, + ), + ) + else: + if ( + len(pidx.names) != len(set(pidx.names)) + and not all(x is None for x in pidx.names) + and not isinstance(name, list) + ) or (isinstance(name, list) and len(name) != len(set(name))): + # cudf doesn't have the ability to construct dataframes + # with duplicate column names + with pytest.raises(ValueError): + gidx.to_frame( + index=index, + name=name, + allow_duplicates=allow_duplicates, + ) + else: + expected = pidx.to_frame( + index=index, name=name, 
allow_duplicates=allow_duplicates + ) + actual = gidx.to_frame( + index=index, name=name, allow_duplicates=allow_duplicates + ) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_pandas.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_pandas.py new file mode 100644 index 00000000000..0c91c8e46ba --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_pandas.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import datetime + +import pandas as pd +import pyarrow as pa +import pytest + +import cudf + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + pd.Interval(1, 2), + ], +) +def test_index_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = [scalar, None] + midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) + with pytest.raises(ValueError): + midx.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [1, 1.0, "a", datetime.datetime(2020, 1, 1), datetime.timedelta(1)], +) +def test_index_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) + result = midx.to_pandas(arrow_type=True) + expected = pd.MultiIndex( + levels=[pd.arrays.ArrowExtensionArray(pa_array)], codes=[[0]] + ) + pd.testing.assert_index_equal(result, expected) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_series.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_series.py new file mode 100644 index 00000000000..c017d73ef83 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_series.py @@ -0,0 +1,11 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf + + +def test_multiindex_to_series_error(): + midx = cudf.MultiIndex.from_tuples([("a", "b")]) + with pytest.raises(NotImplementedError): + midx.to_series() diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_unique.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_unique.py new file mode 100644 index 00000000000..184d175c3b4 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_unique.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_unique_level(): + pd_mi = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 3, 2]]) + cudf_mi = cudf.MultiIndex.from_pandas(pd_mi) + + result = pd_mi.unique(level=1) + expected = cudf_mi.unique(level=1) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py b/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py new file mode 100644 index 00000000000..d26e7216a73 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py @@ -0,0 +1,214 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import re + +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.fixture( + params=[ + "from_product", + "from_tuples", + "from_arrays", + "init", + ] +) +def midx(request): + if request.param == "from_product": + return cudf.MultiIndex.from_product([[0, 1], [1, 0]]) + elif request.param == "from_tuples": + return cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)]) + elif request.param == "from_arrays": + return cudf.MultiIndex.from_arrays([[0, 0, 1, 1], [1, 0, 1, 0]]) + elif request.param == "init": + return cudf.MultiIndex( + levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]] + ) + else: + raise NotImplementedError(f"{request.param} not implemented") + + +def test_multindex_constructor_levels_always_indexes(midx): + assert_eq(midx.levels[0], cudf.Index([0, 1])) + assert_eq(midx.levels[1], cudf.Index([0, 1])) + + +def test_bool_raises(): + assert_exceptions_equal( + lfunc=bool, + rfunc=bool, + lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], + rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], + ) + + +def test_multi_index_contains_hashable(): + gidx = cudf.MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])) + pidx = gidx.to_pandas() + + assert_exceptions_equal( + lambda: [] in gidx, + lambda: [] in pidx, + lfunc_args_and_kwargs=((),), + rfunc_args_and_kwargs=((),), + ) + + +def test_multiindex_codes(): + midx = cudf.MultiIndex.from_tuples( + [("a", "b"), ("a", "c"), ("b", "c")], names=["A", "Z"] + ) + + for p_array, g_array in zip(midx.to_pandas().codes, midx.codes): + assert_eq(p_array, g_array) + + +def test_multiindex_values_pandas_compatible(): + midx = cudf.MultiIndex.from_tuples([(10, 12), (8, 9), (3, 4)]) + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + midx.values + + +@pytest.mark.parametrize("bad", ["foo", ["foo"]]) +def test_multiindex_set_names_validation(bad): + mi = cudf.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)]) + with pytest.raises(ValueError): + mi.names = bad + + +def test_multiindex_levels(): + gidx = cudf.MultiIndex.from_product( + [range(3), ["one", "two"]], names=["first", "second"] + ) + pidx = gidx.to_pandas() + + assert_eq(gidx.levels[0], pidx.levels[0]) + assert_eq(gidx.levels[1], pidx.levels[1]) + + +@pytest.mark.parametrize( + "pidx", + [ + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1.0, 2, 3, 4], [5, 6, 7.8, 10], [11, 12, 12, 13]], + ), + ], +) +@pytest.mark.parametrize( + "func", + [ + "is_numeric", + "is_boolean", + "is_integer", + "is_floating", + "is_object", + "is_categorical", + "is_interval", + ], +) +def test_multiindex_type_methods(pidx, func): + gidx = cudf.from_pandas(pidx) + + with pytest.warns(FutureWarning): + expected = getattr(pidx, func)() + + with pytest.warns(FutureWarning): + actual = getattr(gidx, func)() + + if func == "is_object": + assert_eq(False, actual) + else: + assert_eq(expected, actual) + + +def test_multiindex_iter_error(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + + with pytest.raises( + TypeError, + match=re.escape( + f"{midx.__class__.__name__} object is not iterable. 
" + f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " + f"if you wish to iterate over the values." + ), + ): + iter(midx) + + +def test_multiindex_values(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + + result = midx.values + + assert isinstance(result, cp.ndarray) + np.testing.assert_array_equal( + result.get(), np.array([[1, 1], [1, 5], [3, 2], [4, 2], [5, 1]]) + ) + + +def test_multiindex_values_host(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + pmidx = midx.to_pandas() + + assert_eq(midx.values_host, pmidx.values) + + +@pytest.mark.parametrize( + "pdi", + [ + pd.MultiIndex( + levels=[[], [], []], + codes=[[], [], []], + names=["one", "two", "three"], + ), + pd.MultiIndex.from_tuples([(1, 2), (3, 4)]), + ], +) +def test_multiindex_empty(pdi): + gdi = cudf.from_pandas(pdi) + + assert_eq(pdi.empty, gdi.empty) + + +@pytest.mark.parametrize( + "pdi", + [ + pd.MultiIndex( + levels=[[], [], []], + codes=[[], [], []], + names=["one", "two", "three"], + ), + pd.MultiIndex.from_tuples([(1, 2), (3, 4)]), + ], +) +def test_multiindex_size(pdi): + gdi = cudf.from_pandas(pdi) + + assert_eq(pdi.size, gdi.size) diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_binaryop.py b/python/cudf/cudf/tests/indexes/multiindex/test_binaryop.py new file mode 100644 index 00000000000..973e0feff71 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/test_binaryop.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np + +import cudf +from cudf.testing import assert_eq + + +def test_multiindex_eq_other_multiindex(): + idx = cudf.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)]) + result = idx == idx + expected = np.array([True, True]) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_constructing.py b/python/cudf/cudf/tests/indexes/multiindex/test_constructing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/multiindex/test_constructing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py b/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py new file mode 100644 index 00000000000..3db599f49e5 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py @@ -0,0 +1,183 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +def test_multiindex_levels_codes_validation(): + levels = [["a", "b"], ["c", "d"]] + + # Codes not a sequence of sequences + assert_exceptions_equal( + lfunc=pd.MultiIndex, + rfunc=cudf.MultiIndex, + lfunc_args_and_kwargs=([levels, [0, 1]],), + rfunc_args_and_kwargs=([levels, [0, 1]],), + ) + + # Codes don't match levels + assert_exceptions_equal( + lfunc=pd.MultiIndex, + rfunc=cudf.MultiIndex, + lfunc_args_and_kwargs=([levels, [[0], [1], [1]]],), + rfunc_args_and_kwargs=([levels, [[0], [1], [1]]],), + ) + + # Largest code greater than number of levels + assert_exceptions_equal( + lfunc=pd.MultiIndex, + rfunc=cudf.MultiIndex, + lfunc_args_and_kwargs=([levels, [[0, 1], [0, 2]]],), + rfunc_args_and_kwargs=([levels, [[0, 1], [0, 2]]],), + ) + + # Unequal code lengths + assert_exceptions_equal( + lfunc=pd.MultiIndex, + rfunc=cudf.MultiIndex, + lfunc_args_and_kwargs=([levels, [[0, 1], [0]]],), + rfunc_args_and_kwargs=([levels, [[0, 1], [0]]],), + ) + # Didn't pass levels and codes + assert_exceptions_equal(lfunc=pd.MultiIndex, rfunc=cudf.MultiIndex) + + # Didn't pass non zero levels and codes + assert_exceptions_equal( + lfunc=pd.MultiIndex, + rfunc=cudf.MultiIndex, + lfunc_args_and_kwargs=([[], []],), + rfunc_args_and_kwargs=([[], []],), + ) + + +def test_multiindex_construction(): + levels = [["a", "b"], ["c", "d"]] + codes = [[0, 1], [1, 0]] + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels, codes) + assert_eq(pmi, mi) + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels=levels, codes=codes) + assert_eq(pmi, mi) + + +def test_multiindex_types(): + codes = [[0, 1], [1, 0]] + levels = [[0, 1], [2, 3]] + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels, codes) + assert_eq(pmi, mi) + levels = [[1.2, 2.1], [1.3, 3.1]] + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels, codes) + assert_eq(pmi, mi) + levels = [["a", "b"], ["c", "d"]] + pmi = pd.MultiIndex(levels, codes) + mi = cudf.MultiIndex(levels, codes) + assert_eq(pmi, mi) + + +def test_multiindex_from_tuples(): + arrays = [["a", "a", "b", "b"], ["house", "store", "house", "store"]] + tuples = list(zip(*arrays)) + pmi = pd.MultiIndex.from_tuples(tuples) + gmi = cudf.MultiIndex.from_tuples(tuples) + assert_eq(pmi, gmi) + + +def test_multiindex_from_dataframe(): + pdf = pd.DataFrame( + [["a", "house"], ["a", "store"], ["b", "house"], ["b", "store"]] + ) + gdf = cudf.from_pandas(pdf) + pmi = pd.MultiIndex.from_frame(pdf, names=["alpha", "location"]) + gmi = cudf.MultiIndex.from_frame(gdf, names=["alpha", "location"]) + assert_eq(pmi, gmi) + + +@pytest.mark.parametrize( + "arrays", + [ + [["a", "a", "b", "b"], ["house", "store", "house", "store"]], + [["a", "n", "n"] * 10, ["house", "store", "house", "store"]], + [ + ["a", "n", "n"], + ["house", "store", "house", "store", "store"] * 10, + ], + [ + ["a", "a", "n"] * 50, + ["house", "store", "house", "store", "store"] * 10, + ], + ], +) +def test_multiindex_from_product(arrays): + pmi = pd.MultiIndex.from_product(arrays, names=["alpha", "location"]) + gmi = cudf.MultiIndex.from_product(arrays, names=["alpha", "location"]) + assert_eq(pmi, gmi) + + +@pytest.mark.parametrize( + "array", + [ + list, + tuple, + np.array, + cp.array, + pd.Index, + cudf.Index, + pd.Series, + cudf.Series, + ], +) +def test_multiindex_from_arrays(array): + pd_data = [[0, 0, 
1, 1], [1, 0, 1, 0]] + cudf_data = [array(lst) for lst in pd_data] + result = pd.MultiIndex.from_arrays(pd_data) + expected = cudf.MultiIndex.from_arrays(cudf_data) + assert_eq(result, expected) + + +@pytest.mark.parametrize("arg", ["foo", ["foo"]]) +def test_multiindex_from_arrays_wrong_arg(arg): + with pytest.raises(TypeError): + cudf.MultiIndex.from_arrays(arg) + + +@pytest.mark.parametrize( + "idx", [pd.Index, pd.CategoricalIndex, pd.DatetimeIndex, pd.TimedeltaIndex] +) +def test_from_arrays_infer_names(idx): + arrays = [idx([1], name="foo"), idx([2], name="bar")] + expected = pd.MultiIndex.from_arrays(arrays) + result = cudf.MultiIndex.from_arrays(arrays) + assert_eq(result, expected) + + +def test_multiindex_dtype_error(): + midx = cudf.MultiIndex.from_tuples([(10, 12), (8, 9), (3, 4)]) + with pytest.raises(TypeError): + cudf.Index(midx, dtype="int64") + with pytest.raises(TypeError): + cudf.Index(midx.to_pandas(), dtype="int64") + + +def test_multiindex_duplicate_names(): + gi = cudf.MultiIndex( + levels=[["a", "b"], ["b", "a"]], + codes=[[0, 0], [0, 1]], + names=["a", "a"], + ) + pi = pd.MultiIndex( + levels=[["a", "b"], ["b", "a"]], + codes=[[0, 0], [0, 1]], + names=["a", "a"], + ) + + assert_eq(gi, pi) diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_getitem.py b/python/cudf/cudf/tests/indexes/multiindex/test_getitem.py new file mode 100644 index 00000000000..f04dcfe9cec --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/test_getitem.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_multiindex_empty_slice_pandas_compatibility(): + expected = pd.MultiIndex.from_tuples([("a", "b")])[:0] + with cudf.option_context("mode.pandas_compatible", True): + actual = cudf.from_pandas(expected) + assert_eq(expected, actual, exact=False) diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_properties.py b/python/cudf/cudf/tests/indexes/multiindex/test_properties.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/multiindex/test_properties.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_selecting.py b/python/cudf/cudf/tests/indexes/multiindex/test_selecting.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/multiindex/test_selecting.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index d274166d0d2..c8c4923a4e3 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1,26 +1,17 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. 
-""" -Test related to MultiIndex -""" - -import datetime import itertools import operator import pickle -import re from contextlib import contextmanager -from functools import reduce from io import BytesIO import cupy as cp import numpy as np import pandas as pd -import pyarrow as pa import pytest import cudf -from cudf.api.extensions import no_default from cudf.core.column import as_column from cudf.testing import assert_eq, assert_neq from cudf.testing._utils import assert_exceptions_equal, expect_warning_if @@ -36,79 +27,6 @@ def expect_pandas_performance_warning(idx): yield -def test_multiindex_levels_codes_validation(): - levels = [["a", "b"], ["c", "d"]] - - # Codes not a sequence of sequences - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([levels, [0, 1]],), - rfunc_args_and_kwargs=([levels, [0, 1]],), - ) - - # Codes don't match levels - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([levels, [[0], [1], [1]]],), - rfunc_args_and_kwargs=([levels, [[0], [1], [1]]],), - ) - - # Largest code greater than number of levels - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([levels, [[0, 1], [0, 2]]],), - rfunc_args_and_kwargs=([levels, [[0, 1], [0, 2]]],), - ) - - # Unequal code lengths - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([levels, [[0, 1], [0]]],), - rfunc_args_and_kwargs=([levels, [[0, 1], [0]]],), - ) - # Didn't pass levels and codes - assert_exceptions_equal(lfunc=pd.MultiIndex, rfunc=cudf.MultiIndex) - - # Didn't pass non zero levels and codes - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([[], []],), - rfunc_args_and_kwargs=([[], []],), - ) - - -def test_multiindex_construction(): - levels = [["a", "b"], ["c", "d"]] - codes = [[0, 1], [1, 0]] - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels, codes) - assert_eq(pmi, mi) - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels=levels, codes=codes) - assert_eq(pmi, mi) - - -def test_multiindex_types(): - codes = [[0, 1], [1, 0]] - levels = [[0, 1], [2, 3]] - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels, codes) - assert_eq(pmi, mi) - levels = [[1.2, 2.1], [1.3, 3.1]] - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels, codes) - assert_eq(pmi, mi) - levels = [["a", "b"], ["c", "d"]] - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels, codes) - assert_eq(pmi, mi) - - def test_multiindex_df_assignment(): pdf = pd.DataFrame({"x": [1, 2, 3]}) gdf = cudf.from_pandas(pdf) @@ -129,30 +47,6 @@ def test_multiindex_series_assignment(): assert_eq(ps, gs) -def test_multiindex_swaplevel(): - midx = cudf.MultiIndex( - levels=[ - ["lama", "cow", "falcon"], - ["speed", "weight", "length"], - ["first", "second"], - ], - codes=[ - [0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2], - [0, 0, 0, 0, 0, 0, 1, 1, 1], - ], - names=["Col1", "Col2", "Col3"], - ) - pd_midx = midx.to_pandas() - - assert_eq(pd_midx.swaplevel(-1, -2), midx.swaplevel(-1, -2)) - assert_eq(pd_midx.swaplevel(2, 1), midx.swaplevel(2, 1)) - assert_eq(midx.swaplevel(2, 1), midx.swaplevel(1, 2)) - assert_eq(pd_midx.swaplevel(0, 2), midx.swaplevel(0, 2)) - assert_eq(pd_midx.swaplevel(2, 0), midx.swaplevel(2, 0)) - assert_eq(midx.swaplevel(1, 1), midx.swaplevel(1, 1)) - - def test_string_index(): rng = np.random.default_rng(seed=0) pdf = 
pd.DataFrame(rng.random(size=(5, 5))) @@ -256,13 +150,6 @@ def test_from_pandas(pdf, pdfIndex): assert_eq(pdf, gdf) -def test_multiindex_transpose(pdf, pdfIndex): - pdf = pdf.copy(deep=False) - pdf.index = pdfIndex - gdf = cudf.from_pandas(pdf) - assert_eq(pdf.transpose(), gdf.transpose()) - - def test_from_pandas_series(): pdf = pd.DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} @@ -285,26 +172,6 @@ def test_series_multiindex(pdfIndex): assert_eq(ps, gs) -def test_multiindex_take(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf = pdf.copy(deep=False) - gdf = gdf.copy(deep=False) - pdf.index = pdfIndex - gdf.index = gdfIndex - assert_eq(pdf.index.take([0]), gdf.index.take([0])) - assert_eq(pdf.index.take(np.array([0])), gdf.index.take(np.array([0]))) - from cudf import Series - - assert_eq(pdf.index.take(pd.Series([0])), gdf.index.take(Series([0]))) - assert_eq(pdf.index.take([0, 1]), gdf.index.take([0, 1])) - assert_eq( - pdf.index.take(np.array([0, 1])), gdf.index.take(np.array([0, 1])) - ) - assert_eq( - pdf.index.take(pd.Series([0, 1])), gdf.index.take(Series([0, 1])) - ) - - def test_multiindex_getitem(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) pdf = pdf.copy(deep=False) @@ -495,45 +362,6 @@ def test_multiindex_columns(pdf, pdfIndex, query): assert_eq(expected, got) -def test_multiindex_from_tuples(): - arrays = [["a", "a", "b", "b"], ["house", "store", "house", "store"]] - tuples = list(zip(*arrays)) - pmi = pd.MultiIndex.from_tuples(tuples) - gmi = cudf.MultiIndex.from_tuples(tuples) - assert_eq(pmi, gmi) - - -def test_multiindex_from_dataframe(): - pdf = pd.DataFrame( - [["a", "house"], ["a", "store"], ["b", "house"], ["b", "store"]] - ) - gdf = cudf.from_pandas(pdf) - pmi = pd.MultiIndex.from_frame(pdf, names=["alpha", "location"]) - gmi = cudf.MultiIndex.from_frame(gdf, names=["alpha", "location"]) - assert_eq(pmi, gmi) - - -@pytest.mark.parametrize( - "arrays", - [ - [["a", "a", "b", "b"], ["house", "store", "house", "store"]], - [["a", "n", "n"] * 1000, ["house", "store", "house", "store"]], - [ - ["a", "n", "n"], - ["house", "store", "house", "store", "store"] * 1000, - ], - [ - ["a", "a", "n"] * 50, - ["house", "store", "house", "store", "store"] * 100, - ], - ], -) -def test_multiindex_from_product(arrays): - pmi = pd.MultiIndex.from_product(arrays, names=["alpha", "location"]) - gmi = cudf.MultiIndex.from_product(arrays, names=["alpha", "location"]) - assert_eq(pmi, gmi) - - def test_multiindex_index_and_columns(): rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( @@ -702,173 +530,6 @@ def test_multiindex_equals(): assert_eq(mi1.equals(mi2), False) -def test_multiindex_copy_sem(): - """Test semantic equality for MultiIndex.copy""" - names = ["X", "Y"] - data = { - "Date": [ - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - ], - "Close": [ - 3400.00, - 3401.80, - 3450.96, - 226.58, - 228.91, - 225.53, - 505.13, - 525.91, - 534.98, - ], - "Symbol": [ - "AMZN", - "AMZN", - "AMZN", - "MSFT", - "MSFT", - "MSFT", - "NVDA", - "NVDA", - "NVDA", - ], - } - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - gdf = gdf.groupby(["Date", "Symbol"], sort=True).mean() - pdf = pdf.groupby(["Date", "Symbol"], sort=True).mean() - - gmi = gdf.index - gmi_copy = gmi.copy(names=names) - - pmi = pdf.index - pmi_copy = pmi.copy(names=names) - - for glv, plv in zip(gmi_copy.levels, pmi_copy.levels): - assert all(glv.values_host == plv.values) - 
for gval, pval in zip(gmi.codes, pmi.codes): - assert_eq(gval, pval) - assert_eq(gmi_copy.names, pmi_copy.names) - - # Test same behavior when used on DataFrame - gdf.index = gmi_copy - pdf.index = pmi_copy - assert repr(gdf) == repr(pdf) - - -@pytest.mark.parametrize( - "data", - [ - { - "Date": [ - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - ], - "Close": [ - 3400.00, - 3401.80, - 3450.96, - 226.58, - 228.91, - 225.53, - 505.13, - 525.91, - 534.98, - ], - "Symbol": [ - "AMZN", - "AMZN", - "AMZN", - "MSFT", - "MSFT", - "MSFT", - "NVDA", - "NVDA", - "NVDA", - ], - }, - pd.MultiIndex( - levels=[[1001, 1002], [2001, 2002]], - codes=[[1, 1, 0, 0], [0, 1, 0, 1]], - names=["col1", "col2"], - ), - ], -) -@pytest.mark.parametrize("copy_on_write", [True, False]) -@pytest.mark.parametrize("deep", [True, False]) -def test_multiindex_copy_deep(data, copy_on_write, deep): - """Test memory identity for deep copy - Case1: Constructed from GroupBy, StringColumns - Case2: Constructed from MultiIndex, NumericColumns - """ - with cudf.option_context("copy_on_write", copy_on_write): - if isinstance(data, dict): - gdf = cudf.DataFrame(data) - mi1 = gdf.groupby(["Date", "Symbol"]).mean().index - mi2 = mi1.copy(deep=deep) - - lchildren = [col.children for col in mi1._columns] - rchildren = [col.children for col in mi2._columns] - - # Flatten - lchildren = reduce(operator.add, lchildren) - rchildren = reduce(operator.add, rchildren) - - lptrs = [ - child.base_data.get_ptr(mode="read") for child in lchildren - ] - rptrs = [ - child.base_data.get_ptr(mode="read") for child in rchildren - ] - - assert all((x == y) for x, y in zip(lptrs, rptrs)) - - elif isinstance(data, pd.MultiIndex): - data = cudf.MultiIndex.from_pandas(data) - same_ref = (not deep) or ( - cudf.get_option("copy_on_write") and not deep - ) - mi1 = data - mi2 = mi1.copy(deep=deep) - - # Assert ._levels identity - lptrs = [ - lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels - ] - rptrs = [ - lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels - ] - - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - - # Assert ._codes identity - lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] - rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] - - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - - # Assert ._data identity - lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] - rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] - - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - - @pytest.mark.parametrize( "iloc_rows", [ @@ -988,27 +649,6 @@ def test_multicolumn_item(): assert_eq(gdgT[(0, 0)], pdgT[(0, 0)]) -def test_multiindex_to_frame(pdfIndex, pdfIndexNulls): - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex.to_frame(), gdfIndex.to_frame()) - - gdfIndex = cudf.from_pandas(pdfIndexNulls) - assert_eq( - pdfIndexNulls.to_frame().fillna("nan"), - gdfIndex.to_frame().fillna("nan"), - ) - - -def test_multiindex_groupby_to_frame(): - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} - ) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"], sort=True).count() - pdg = pdf.groupby(["x", "y"], sort=True).count() - assert_eq(pdg.index.to_frame(), gdg.index.to_frame()) - - def test_multiindex_reset_index(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) pdf = pdf.copy(deep=False) @@ 
-1140,499 +780,6 @@ def test_multicolumn_set_item(pdf, pdfIndex): assert_eq(pdf, gdf) -def test_multiindex_iter_error(): - midx = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - - with pytest.raises( - TypeError, - match=re.escape( - f"{midx.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." - ), - ): - iter(midx) - - -def test_multiindex_values(): - midx = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - - result = midx.values - - assert isinstance(result, cp.ndarray) - np.testing.assert_array_equal( - result.get(), np.array([[1, 1], [1, 5], [3, 2], [4, 2], [5, 1]]) - ) - - -def test_multiindex_values_host(): - midx = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - pmidx = midx.to_pandas() - - assert_eq(midx.values_host, pmidx.values) - - -def test_multiindex_to_numpy(): - midx = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - pmidx = midx.to_pandas() - - assert_eq(midx.to_numpy(), pmidx.to_numpy()) - - -@pytest.mark.parametrize( - "gdi, fill_value, expected", - [ - ( - lambda: cudf.MultiIndex( - levels=[[1, 3, 4, None], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - 5, - lambda: cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - ), - ( - lambda: cudf.MultiIndex( - levels=[[1, 3, 4, None], [1, None, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - 100, - lambda: cudf.MultiIndex( - levels=[[1, 3, 4, 100], [1, 100, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - ), - ( - lambda: cudf.MultiIndex( - levels=[["a", "b", "c", None], ["1", None, "5"]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - "100", - lambda: cudf.MultiIndex( - levels=[["a", "b", "c", "100"], ["1", "100", "5"]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - ), - ], -) -def test_multiindex_fillna(gdi, fill_value, expected): - assert_eq(expected(), gdi().fillna(fill_value)) - - -@pytest.mark.parametrize( - "pdi", - [ - pd.MultiIndex( - levels=[[], [], []], - codes=[[], [], []], - names=["one", "two", "three"], - ), - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ] - ) - ) - ), - ], -) -def test_multiindex_empty(pdi): - gdi = cudf.from_pandas(pdi) - - assert_eq(pdi.empty, gdi.empty) - - -@pytest.mark.parametrize( - "pdi", - [ - pd.MultiIndex( - levels=[[], [], []], - codes=[[], [], []], - names=["one", "two", "three"], - ), - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ] - ) - ) - ), - ], -) -def test_multiindex_size(pdi): - gdi = cudf.from_pandas(pdi) - - assert_eq(pdi.size, gdi.size) - - -@pytest.mark.parametrize( - "level", - [ - [], - "alpha", - "location", - "weather", - 0, - 1, - [0, 1], - -1, - [-1, -2], - [-1, "weather"], - ], -) -def test_multiindex_droplevel_simple(pdfIndex, level): 
- gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex.droplevel(level), gdfIndex.droplevel(level)) - - -@pytest.mark.parametrize( - "level", - itertools.chain( - *( - itertools.combinations( - ("alpha", "location", "weather", "sign", "timestamp"), r - ) - for r in range(5) - ) - ), -) -def test_multiindex_droplevel_name(pdfIndex, level): - level = list(level) - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex.droplevel(level), gdfIndex.droplevel(level)) - - -@pytest.mark.parametrize( - "level", - itertools.chain(*(itertools.combinations(range(5), r) for r in range(5))), -) -def test_multiindex_droplevel_index(pdfIndex, level): - level = list(level) - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex.droplevel(level), gdfIndex.droplevel(level)) - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("return_indexer", [True, False]) -@pytest.mark.parametrize( - "pmidx", - [ - pd.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - pd.MultiIndex.from_product( - [["bar", "baz", "foo", "qux"], ["one", "two"]], - names=["first", "second"], - ), - pd.MultiIndex( - levels=[[], [], []], - codes=[[], [], []], - names=["one", "two", "three"], - ), - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ] - ) - ) - ), - ], -) -def test_multiindex_sort_values(pmidx, ascending, return_indexer): - pmidx = pmidx - midx = cudf.from_pandas(pmidx) - - expected = pmidx.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - actual = midx.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - - if return_indexer: - expected_indexer = expected[1] - actual_indexer = actual[1] - - assert_eq(expected_indexer, actual_indexer) - - expected = expected[0] - actual = actual[0] - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "pdi", - [ - pd.MultiIndex( - levels=[[1, 3.0, 4, 5], [1, 2.3, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - pd.MultiIndex( - levels=[[1, 3, 4, -10], [1, 11, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - pd.MultiIndex( - levels=[["a", "b", "c", "100"], ["1", "100", "5"]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - pytest.param( - pd.MultiIndex( - levels=[[None, "b", "c", "a"], ["1", None, "5"]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - marks=[ - pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35584" - ) - ], - ), - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -def test_multiindex_argsort(pdi, ascending): - gdi = cudf.from_pandas(pdi) - - if not ascending: - expected = pdi.argsort()[::-1] - else: - expected = pdi.argsort() - - actual = gdi.argsort(ascending=ascending) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "names", [[None, None], ["a", None], ["new name", "another name"]] -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_set_names(names, inplace): - pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]) - gi = cudf.from_pandas(pi) - - expected = pi.set_names(names=names, inplace=inplace) - actual = gi.set_names(names=names, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("idx_names", 
[[None, None, None], [1, 0, 2]]) -@pytest.mark.parametrize( - "level, names", - [ - (0, "abc"), - (1, "xyz"), - ([2, 1], ["a", "b"]), - ([0, 1], ["aa", "bb"]), - (None, ["a", "b", "c"]), - (None, ["a", None, "c"]), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_set_names_default_and_int_names( - idx_names, level, names, inplace -): - pi = pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], names=idx_names - ) - gi = cudf.from_pandas(pi) - - expected = pi.set_names(names=names, level=level, inplace=inplace) - actual = gi.set_names(names=names, level=level, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "level, names", - [ - ([None], "abc"), - (["three", "one"], ["a", "b"]), - (["three", 1], ["a", "b"]), - ([0, "three", 1], ["a", "b", "z"]), - (["one", 1, "three"], ["a", "b", "z"]), - (["one", None, "three"], ["a", "b", "z"]), - ([2, 1], ["a", "b"]), - (1, "xyz"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_set_names_string_names(level, names, inplace): - pi = pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], - names=["one", None, "three"], - ) - gi = cudf.from_pandas(pi) - - expected = pi.set_names(names=names, level=level, inplace=inplace) - actual = gi.set_names(names=names, level=level, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "level, names", [(1, ["a"]), (None, "a"), ([1, 2], ["a"]), (None, ["a"])] -) -def test_multiindex_set_names_error(level, names): - pi = pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]] - ) - gi = cudf.from_pandas(pi) - - assert_exceptions_equal( - lfunc=pi.set_names, - rfunc=gi.set_names, - lfunc_args_and_kwargs=([], {"names": names, "level": level}), - rfunc_args_and_kwargs=([], {"names": names, "level": level}), - ) - - -@pytest.mark.parametrize("name", [None, "old name"]) -@pytest.mark.parametrize( - "names", - [ - [None, None], - ["a", None], - ["new name", "another name"], - [1, None], - [2, 3], - [42, "name"], - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_rename(name, names, inplace): - pi = pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019]], names=[name, None] - ) - gi = cudf.from_pandas(pi) - - expected = pi.rename(names=names, inplace=inplace) - actual = gi.rename(names=names, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "names", ["plain string", 123, ["str"], ["l1", "l2", "l3"]] -) -def test_multiindex_rename_error(names): - pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]) - gi = cudf.from_pandas(pi) - - assert_exceptions_equal( - lfunc=pi.rename, - rfunc=gi.rename, - lfunc_args_and_kwargs=([], {"names": names}), - rfunc_args_and_kwargs=([], {"names": names}), - ) - - @pytest.mark.parametrize( "key", [0, 1, [], [0, 1], slice(None), slice(0, 0), slice(0, 1), slice(0, 2)], @@ -1646,141 +793,6 @@ def test_multiindex_indexing(key): assert_eq(gi[key], pi[key], exact=False) -def test_multiindex_duplicate_names(): - gi = cudf.MultiIndex( - levels=[["a", "b"], ["b", "a"]], - codes=[[0, 0], [0, 1]], - names=["a", "a"], - ) - pi = pd.MultiIndex( - levels=[["a", "b"], ["b", "a"]], - codes=[[0, 0], [0, 1]], - names=["a", "a"], - ) - - assert_eq(gi, pi) - - -def test_difference(): - midx = 
cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - midx2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3, 3], [0, 2, 1, 1, 0, 2]], - names=["x", "y"], - ) - - expected = midx2.to_pandas().difference(midx.to_pandas()) - actual = midx2.difference(midx) - assert isinstance(actual, cudf.MultiIndex) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx1, idx2", - [ - ( - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], - names=["a", "b"], - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], - names=["x", "y"], - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - [(2, 6, 12)], - ), - ], -) -@pytest.mark.parametrize("sort", [None, False]) -def test_union_mulitIndex(idx1, idx2, sort): - expected = idx1.union(idx2, sort=sort) - - idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.MultiIndex) else idx1 - idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.MultiIndex) else idx2 - - actual = idx1.union(idx2, sort=sort) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx1, idx2", - [ - ( - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ), - pd.MultiIndex.from_arrays( - [[1, 3, 2, 2], ["Red", "Green", "Red", "Green"]] - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], - names=["a", "b"], - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], - names=["x", "y"], - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - ), - ), - ], -) -@pytest.mark.parametrize("sort", [None, False]) -def test_intersection_mulitIndex(idx1, idx2, sort): - expected = idx1.intersection(idx2, sort=sort) - - idx1 = cudf.from_pandas(idx1) - idx2 = cudf.from_pandas(idx2) - - actual = idx1.intersection(idx2, sort=sort) - assert_eq(expected, actual, exact=False) - - @pytest.mark.parametrize( "names", [ @@ -1810,48 +822,6 @@ def test_pickle_roundtrip_multiindex(names): assert_eq(expected_df, actual_df) -@pytest.mark.parametrize( - "pidx", - [ - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[1.0, 2, 3, 4], [5, 6, 7.8, 10], [11, 12, 12, 13]], - ), - ], -) -@pytest.mark.parametrize( - "func", - [ - "is_numeric", - "is_boolean", - "is_integer", - "is_floating", - "is_object", - "is_categorical", - "is_interval", - ], -) -def test_multiindex_type_methods(pidx, func): - gidx = cudf.from_pandas(pidx) - - with pytest.warns(FutureWarning): - expected = getattr(pidx, func)() - - 
with pytest.warns(FutureWarning): - actual = getattr(gidx, func)() - - if func == "is_object": - assert_eq(False, actual) - else: - assert_eq(expected, actual) - - def test_multiindex_index_single_row(): arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] tuples = list(zip(*arrays)) @@ -1864,23 +834,6 @@ def test_multiindex_index_single_row(): assert_eq(pdf.loc[("b", 3)], gdf.loc[("b", 3)]) -def test_multiindex_levels(): - gidx = cudf.MultiIndex.from_product( - [range(3), ["one", "two"]], names=["first", "second"] - ) - pidx = gidx.to_pandas() - - assert_eq(gidx.levels[0], pidx.levels[0]) - assert_eq(gidx.levels[1], pidx.levels[1]) - - -def test_multiindex_empty_slice_pandas_compatibility(): - expected = pd.MultiIndex.from_tuples([("a", "b")])[:0] - with cudf.option_context("mode.pandas_compatible", True): - actual = cudf.from_pandas(expected) - assert_eq(expected, actual, exact=False) - - @pytest.mark.parametrize( "levels", itertools.chain.from_iterable( @@ -1904,134 +857,6 @@ def test_multiindex_sort_index_partial(levels): assert_eq(expect, got) -def test_multiindex_to_series_error(): - midx = cudf.MultiIndex.from_tuples([("a", "b")]) - with pytest.raises(NotImplementedError): - midx.to_series() - - -@pytest.mark.parametrize( - "pidx", - [ - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "a", "a"], - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - ), - ], -) -@pytest.mark.parametrize( - "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]] -) -@pytest.mark.parametrize("allow_duplicates", [True, False]) -@pytest.mark.parametrize("index", [True, False]) -def test_multiindex_to_frame_allow_duplicates( - pidx, name, allow_duplicates, index -): - gidx = cudf.from_pandas(pidx) - - if name is None or ( - ( - len(pidx.names) != len(set(pidx.names)) - and not all(x is None for x in pidx.names) - ) - and not allow_duplicates - and name is no_default - ): - assert_exceptions_equal( - pidx.to_frame, - gidx.to_frame, - lfunc_args_and_kwargs=( - [], - { - "index": index, - "name": name, - "allow_duplicates": allow_duplicates, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "index": index, - "name": name, - "allow_duplicates": allow_duplicates, - }, - ), - ) - else: - if ( - len(pidx.names) != len(set(pidx.names)) - and not all(x is None for x in pidx.names) - and not isinstance(name, list) - ) or (isinstance(name, list) and len(name) != len(set(name))): - # cudf doesn't have the ability to construct dataframes - # with duplicate column names - with pytest.raises(ValueError): - gidx.to_frame( - index=index, - name=name, - allow_duplicates=allow_duplicates, - ) - else: - expected = pidx.to_frame( - index=index, name=name, allow_duplicates=allow_duplicates - ) - actual = gidx.to_frame( - index=index, name=name, allow_duplicates=allow_duplicates - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("bad", ["foo", ["foo"]]) -def test_multiindex_set_names_validation(bad): - mi = cudf.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)]) - with pytest.raises(ValueError): - mi.names = bad - - -def test_multiindex_values_pandas_compatible(): - midx = cudf.MultiIndex.from_tuples([(10, 12), (8, 9), (3, 4)]) - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - midx.values - - -def test_multiindex_dtype_error(): - midx = 
cudf.MultiIndex.from_tuples([(10, 12), (8, 9), (3, 4)]) - with pytest.raises(TypeError): - cudf.Index(midx, dtype="int64") - with pytest.raises(TypeError): - cudf.Index(midx.to_pandas(), dtype="int64") - - -def test_multiindex_codes(): - midx = cudf.MultiIndex.from_tuples( - [("a", "b"), ("a", "c"), ("b", "c")], names=["A", "Z"] - ) - - for p_array, g_array in zip(midx.to_pandas().codes, midx.codes): - assert_eq(p_array, g_array) - - -def test_multiindex_union_error(): - midx = cudf.MultiIndex.from_tuples([(10, 12), (8, 9), (3, 4)]) - pidx = midx.to_pandas() - - assert_exceptions_equal( - midx.union, - pidx.union, - lfunc_args_and_kwargs=(["a"],), - rfunc_args_and_kwargs=(["b"],), - ) - - @pytest.mark.parametrize("idx_get", [(0, 0), (0, 1), (1, 0), (1, 1)]) @pytest.mark.parametrize("cols_get", [0, 1, [0, 1], [1, 0], [1], [0]]) def test_multiindex_loc_scalar(idx_get, cols_get): @@ -2043,157 +868,3 @@ def test_multiindex_loc_scalar(idx_get, cols_get): expected = pdf.loc[idx_get, cols_get] assert_eq(actual, expected) - - -def test_multiindex_eq_other_multiindex(): - idx = cudf.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)]) - result = idx == idx - expected = np.array([True, True]) - assert_eq(result, expected) - - -@pytest.fixture( - params=[ - "from_product", - "from_tuples", - "from_arrays", - "init", - ] -) -def midx(request): - if request.param == "from_product": - return cudf.MultiIndex.from_product([[0, 1], [1, 0]]) - elif request.param == "from_tuples": - return cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)]) - elif request.param == "from_arrays": - return cudf.MultiIndex.from_arrays([[0, 0, 1, 1], [1, 0, 1, 0]]) - elif request.param == "init": - return cudf.MultiIndex( - levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]] - ) - else: - raise NotImplementedError(f"{request.param} not implemented") - - -def test_multindex_constructor_levels_always_indexes(midx): - assert_eq(midx.levels[0], cudf.Index([0, 1])) - assert_eq(midx.levels[1], cudf.Index([0, 1])) - - -@pytest.mark.parametrize( - "array", - [ - list, - tuple, - np.array, - cp.array, - pd.Index, - cudf.Index, - pd.Series, - cudf.Series, - ], -) -def test_multiindex_from_arrays(array): - pd_data = [[0, 0, 1, 1], [1, 0, 1, 0]] - cudf_data = [array(lst) for lst in pd_data] - result = pd.MultiIndex.from_arrays(pd_data) - expected = cudf.MultiIndex.from_arrays(cudf_data) - assert_eq(result, expected) - - -@pytest.mark.parametrize("arg", ["foo", ["foo"]]) -def test_multiindex_from_arrays_wrong_arg(arg): - with pytest.raises(TypeError): - cudf.MultiIndex.from_arrays(arg) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - pd.Interval(1, 2), - ], -) -def test_index_to_pandas_arrow_type_nullable_raises(scalar): - pa_array = [scalar, None] - midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) - with pytest.raises(ValueError): - midx.to_pandas(nullable=True, arrow_type=True) - - -@pytest.mark.parametrize( - "scalar", - [1, 1.0, "a", datetime.datetime(2020, 1, 1), datetime.timedelta(1)], -) -def test_index_to_pandas_arrow_type(scalar): - pa_array = pa.array([scalar, None]) - midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) - result = midx.to_pandas(arrow_type=True) - expected = pd.MultiIndex( - levels=[pd.arrays.ArrowExtensionArray(pa_array)], codes=[[0]] - ) - pd.testing.assert_index_equal(result, expected) - - -def test_multi_index_contains_hashable(): - gidx = cudf.MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])) 
- pidx = gidx.to_pandas() - - assert_exceptions_equal( - lambda: [] in gidx, - lambda: [] in pidx, - lfunc_args_and_kwargs=((),), - rfunc_args_and_kwargs=((),), - ) - - -@pytest.mark.parametrize("array", [[1, 2], [1, None], [None, None]]) -@pytest.mark.parametrize("dropna", [True, False]) -def test_nunique(array, dropna): - arrays = [array, [3, 4]] - gidx = cudf.MultiIndex.from_arrays(arrays) - pidx = pd.MultiIndex.from_arrays(arrays) - result = gidx.nunique(dropna=dropna) - expected = pidx.nunique(dropna=dropna) - assert result == expected - - -def test_bool_raises(): - assert_exceptions_equal( - lfunc=bool, - rfunc=bool, - lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], - rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], - ) - - -def test_unique_level(): - pd_mi = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 3, 2]]) - cudf_mi = cudf.MultiIndex.from_pandas(pd_mi) - - result = pd_mi.unique(level=1) - expected = cudf_mi.unique(level=1) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "idx", [pd.Index, pd.CategoricalIndex, pd.DatetimeIndex, pd.TimedeltaIndex] -) -def test_from_arrays_infer_names(idx): - arrays = [idx([1], name="foo"), idx([2], name="bar")] - expected = pd.MultiIndex.from_arrays(arrays) - result = cudf.MultiIndex.from_arrays(arrays) - assert_eq(result, expected) - - -def test_multiindex_droplevel_single_level_none_names(): - data = [(1, 2), (3, 4)] - pidx = pd.MultiIndex.from_tuples(data, names=[None, None]) - gidx = cudf.MultiIndex.from_tuples(data, names=[None, None]) - result = gidx.droplevel(0) - expected = pidx.droplevel(0) - assert_eq(result, expected) From 1fab75fe7c3a75a06d30973c5862c10e1f5dc652 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Aug 2025 12:59:37 -0700 Subject: [PATCH 072/366] Use more pytest fixtures and clean data files cuDF classic tests subdirectories (#19474) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Remove & move data files where appropriate * Eliminate/reduce parameterizations of input size Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19474 --- .pre-commit-config.yaml | 2 +- python/cudf/cudf/tests/data/ipums.pkl | Bin 99199 -> 0 bytes .../cudf/cudf/tests/data/{ => text}/vocab.txt | 0 .../groupby/test_ordering_pandas_compat.py | 14 +++++--------- .../cudf/tests/input_output/test_parquet.py | 4 ++-- .../cudf/cudf/tests/series/test_datetimelike.py | 9 ++++----- .../cudf/tests/text/test_subword_tokenizer.py | 5 ++--- 7 files changed, 14 insertions(+), 20 deletions(-) delete mode 100644 python/cudf/cudf/tests/data/ipums.pkl rename python/cudf/cudf/tests/data/{ => text}/vocab.txt (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1483a268ba3..9850bce2c95 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -63,7 +63,7 @@ repos: ^cpp/src/io/parquet/ipc/Schema_generated.h| ^cpp/src/io/parquet/ipc/Message_generated.h| ^cpp/include/cudf_test/cxxopts.hpp| - ^python/cudf/cudf/tests/data/vocab.txt| + ^python/cudf/cudf/tests/data/text/vocab.txt| ^python/cudf/cudf/tests/text/test_text_methods.py ) - repo: local diff --git a/python/cudf/cudf/tests/data/ipums.pkl b/python/cudf/cudf/tests/data/ipums.pkl deleted file mode 100644 index 5c8e896487d7c2c116612f49ca44c3711ba6617d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 99199 
za;e}~1TPc3T<{9PuL@o%c$MJQg4YOMEBH0R>jbYCyg~3r!J7oXE_k!xErQ<=yjAcv z!EXxQE_jFFw*H|rK;F|fBE2XJbai;_#6!(z&o`RDFrwC3J+)Hq8!F>ex6`UrxpJ1KfbisPT2Ej(b8G=Nu2ObO-%dj$^>JXG*7!NUa?2rd&`E_jUK zO2Ll_t`b}=I3&16@Z*9H3O*$Gu;8PDj|u)j@NvN>1b-;_q~MPPe=PVD!Ji5~CHS=9 zGlD-8{JG$>g3k&5LhzS@zY_el;BN$fEBHIX7X)7vd`0ljg0Bj`CioY@zY6|M@b7~E z5d1H}*9HG6_=ez{g8vfyAHlZ-^?|IA2XlO8o^q@af0Iow-DS?a4W&B z1-B90R&YDP4+!okxRc-n!4C?4Nbtjg9}(PHa2LUeg1ZV%65LI2cfmab_Y|BgI7M)( z;9i1z3+^Mhui!Mn{RHa-rwi5#HV8Hf&Jdg_*d#bhaJJw_1zQCZf^C9bf+@j_;9SA1 zU`{YE*e5to@DRbn1P>QHLU4iLk%Ik#3k4So4hSw0JW6o6;0nQG1Xl`vOmMZ}kl-4@ zj|(0vxK{8u!Q%xt+^%-e?P|Do;hLq(hL)~aT^O!fy=>{Cfx>x(&T*aNga3>7nt?T| ovCj=xE*Ls)Wnp;S;{KujC9C?E4?te9d|;@b3cMk&Svc+g0Fni84gdfE diff --git a/python/cudf/cudf/tests/data/vocab.txt b/python/cudf/cudf/tests/data/text/vocab.txt similarity index 100% rename from python/cudf/cudf/tests/data/vocab.txt rename to python/cudf/cudf/tests/data/text/vocab.txt diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py index a8c5ae3b6a3..6fb32ae36b9 100644 --- a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py +++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import numpy as np import pytest @@ -6,14 +6,10 @@ from cudf.testing import assert_eq -@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"]) -def with_nulls(request): - return request.param - - -@pytest.mark.parametrize("nrows", [30, 300, 300_000]) -@pytest.mark.parametrize("nkeys", [1, 2, 4]) -def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): +@pytest.mark.parametrize("with_nulls", [False, True]) +def test_groupby_maintain_order_random(with_nulls): + nrows = 20 + nkeys = 3 rng = np.random.default_rng(seed=0) key_names = [f"key{key}" for key in range(nkeys)] key_values = [rng.integers(100, size=nrows) for _ in key_names] diff --git a/python/cudf/cudf/tests/input_output/test_parquet.py b/python/cudf/cudf/tests/input_output/test_parquet.py index a377bcac285..13acf6825b4 100644 --- a/python/cudf/cudf/tests/input_output/test_parquet.py +++ b/python/cudf/cudf/tests/input_output/test_parquet.py @@ -11,7 +11,7 @@ from cudf.testing import assert_eq -def test_parquet_long_list(tmpdir): +def test_parquet_long_list(tmp_path): # This test generates int and string list columns, where each has a row that is very large. # When generated by the cudf writer these long rows are contained on a single page, # but when generated by pyarrow they span several pages. 
@@ -44,7 +44,7 @@ def test_parquet_long_list(tmpdir): ) # Write the table to a parquet file using pyarrow - file_name = tmpdir.join("long_row_list_test.pq") + file_name = tmp_path / "long_row_list_test.pq" # https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html pq.write_table( generated_table, diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 72662ff70f5..400777e46e1 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -25,9 +25,11 @@ def _get_all_zones(): return zones -# NOTE: ALL_TIME_ZONES is a very large list; we likely do NOT want to +# NOTE: _get_all_zones is a very large list; we likely do NOT want to # use it for more than a handful of tests -ALL_TIME_ZONES = _get_all_zones() +@pytest.fixture(params=_get_all_zones()) +def zone_name(request): + return request.param @pytest.fixture(params=["ns", "us", "ms", "s"]) @@ -42,7 +44,6 @@ def tz(request): return request.param -@pytest.mark.parametrize("zone_name", ALL_TIME_ZONES) def test_tz_localize(unit, zone_name): s = cudf.Series(date_range("2001-01-01", "2001-01-02", freq="1s")) s = s.astype(f"<M8[{unit}]")
From: Vyas Ramasubramani Date: Wed, 6 Aug 2025 11:24:21 -1000 Subject: [PATCH 073/366] Always use strict zipping (#19584) Closes https://github.com/rapidsai/cudf/issues/15835. Tightens up a few checks and fixes some tests in the process. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Gil Forsyth (https://github.com/gforsyth) - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/19584 --- docs/cudf/source/user_guide/10min.ipynb | 2 +- pyproject.toml | 9 +++ python/cudf/cudf/core/abc.py | 8 ++- python/cudf/cudf/core/accessors/struct.py | 2 +- python/cudf/cudf/core/column/column.py | 7 +- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/struct.py | 10 ++- python/cudf/cudf/core/column_accessor.py | 16 +++-- python/cudf/cudf/core/dataframe.py | 71 ++++++++++++------- python/cudf/cudf/core/dtypes.py | 2 +- python/cudf/cudf/core/frame.py | 35 ++++++--- python/cudf/cudf/core/groupby/groupby.py | 38 +++++++--- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 21 ++++-- python/cudf/cudf/core/join/join.py | 14 ++-- python/cudf/cudf/core/multiindex.py | 36 +++++++--- python/cudf/cudf/core/reshape.py | 15 ++-- python/cudf/cudf/core/series.py | 2 + python/cudf/cudf/core/udf/groupby_typing.py | 7 +- python/cudf/cudf/io/json.py | 2 +- python/cudf/cudf/io/orc.py | 6 +- python/cudf/cudf/io/parquet.py | 23 ++++-- python/cudf/cudf/options.py | 2 +- .../cudf/pandas/scripts/conftest-patch.py | 1 - python/cudf/cudf/testing/_utils.py | 4 +- .../dataframe/methods/test_convert_dtypes.py | 4 +- .../groupby/test_ordering_pandas_compat.py | 4 +- .../indexes/multiindex/methods/test_copy.py | 14 ++-- .../indexes/multiindex/test_attributes.py | 8 ++- .../indexes/multiindex/test_constructors.py | 2 +- python/cudf/cudf/tests/test_array_ufunc.py | 10 +-- python/cudf/cudf/tests/test_categorical.py | 12 +--- .../cudf/cudf/tests/test_column_accessor.py | 6 +- python/cudf/cudf/tests/test_csv.py | 2 +- python/cudf/cudf/tests/test_cuda_apply.py | 20 +++--- python/cudf/cudf/tests/test_dataframe.py | 14 ++-- python/cudf/cudf/tests/test_duplicates.py | 4 +- python/cudf/cudf/tests/test_groupby.py | 9 +--
python/cudf/cudf/tests/test_index.py | 6 +- python/cudf/cudf/tests/test_join_order.py | 28 +++++--- python/cudf/cudf/tests/test_json.py | 9 ++- python/cudf/cudf/tests/test_list.py | 2 +- python/cudf/cudf/tests/test_monotonic.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 4 +- python/cudf/cudf/tests/test_onehot.py | 6 +- python/cudf/cudf/tests/test_parquet.py | 2 +- python/cudf/cudf/tests/test_spilling.py | 2 +- .../tests/testing/test_assert_frame_equal.py | 2 +- python/cudf/cudf/utils/hash_vocab_utils.py | 4 +- python/cudf/cudf/utils/ioutils.py | 13 ++-- .../cudf_pandas_tests/test_cudf_pandas.py | 8 +-- .../cudf/cudf_pandas_tests/test_profiler.py | 2 +- .../tests/test_catboost.py | 2 +- .../tests/test_cuml.py | 2 +- .../tests/test_matplotlib.py | 2 +- .../tests/test_seaborn.py | 2 +- .../tests/test_xgboost.py | 4 +- python/cudf/pyproject.toml | 4 +- python/dask_cudf/dask_cudf/backends.py | 1 + python/dask_cudf/dask_cudf/io/orc.py | 4 +- .../dask_cudf/tests/test_accessor.py | 10 +-- python/dask_cudf/dask_cudf/tests/test_core.py | 2 +- python/pylibcudf/pylibcudf/io/types.pyx | 7 +- python/pylibcudf/tests/common/utils.py | 14 ++-- python/pylibcudf/tests/conftest.py | 5 +- python/pylibcudf/tests/io/test_avro.py | 3 +- python/pylibcudf/tests/test_binaryops.py | 2 +- python/pylibcudf/tests/test_copying.py | 34 ++++++--- python/pylibcudf/tests/test_lists.py | 4 +- .../tests/test_nvtext_generate_ngrams.py | 2 +- python/pylibcudf/tests/test_nvtext_jaccard.py | 2 +- python/pylibcudf/tests/test_nvtext_minhash.py | 10 ++- python/pylibcudf/tests/test_quantiles.py | 2 +- python/pylibcudf/tests/test_reshape.py | 2 +- python/pylibcudf/tests/test_string_find.py | 6 +- python/pylibcudf/tests/test_string_replace.py | 2 +- .../pylibcudf/tests/test_string_replace_re.py | 2 +- python/pylibcudf/tests/test_string_slice.py | 1 + 78 files changed, 423 insertions(+), 244 deletions(-) diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index f9762a5ff0f..87782cd7fb5 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -1982,7 +1982,7 @@ ], "source": [ "arrays = [[\"a\", \"a\", \"b\", \"b\"], [1, 2, 3, 4]]\n", - "tuples = list(zip(*arrays))\n", + "tuples = list(zip(*arrays, strict=True))\n", "idx = cudf.MultiIndex.from_tuples(tuples)\n", "idx" ] diff --git a/pyproject.toml b/pyproject.toml index 59246db6fb0..291fd063aeb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,8 @@ select = [ "W", # isort "I", + # zip-without-explicit-strict + "B905", # no-blank-line-before-function "D201", # one-blank-line-after-class @@ -134,6 +136,10 @@ fixable = ["ALL"] exclude = [ "cpp/scripts/gdb-pretty-printers.py", ] +extend-unsafe-fixes = [ + # zip-without-explicit-strict's autofix sets strict to False, but we want to enforce True + "B905", +] [tool.ruff.lint.flake8-tidy-imports.banned-api] "numpy.can_cast".msg = "Use find_common_dtype from cudf.utils.dtypes instead" @@ -144,6 +150,9 @@ exclude = [ [tool.ruff.lint.per-file-ignores] # We use "== None" to demonstrate null handling in this notebook "docs/cudf/source/user_guide/missing-data.ipynb" = ["E711"] +# We demonstrate UDFs where numba doesn't support `zip(..., strict=True)` (or +# any keywords) and we don't want to litter demo notebooks with noqas +"docs/cudf/source/user_guide/guide-to-udfs.ipynb" = ["B905"] # Lots of pytest implicitly injected attributes in conftest-patch.py "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"] 
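
As context for the B905 rule enabled in the pyproject.toml hunk above, here is a minimal sketch of the behavior this patch standardizes on (illustrative values, not taken from the patch): plain zip silently truncates to the shortest input, while strict=True raises.

    # zip silently drops the unmatched trailing element:
    pairs = list(zip([1, 2, 3], ["a", "b"]))  # [(1, 'a'), (2, 'b')]
    # strict=True turns the length mismatch into an error:
    try:
        list(zip([1, 2, 3], ["a", "b"], strict=True))
    except ValueError as exc:
        print(exc)  # zip() argument 2 is shorter than argument 1

This also explains why the config marks the B905 autofix as unsafe: ruff's autofix inserts strict=False, which preserves the old truncating behavior rather than the checked behavior the patch wants to enforce.
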
"python/cudf/cudf/pandas/scripts/*" = ["D"] diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index c8ea03b04fe..2a7250709a4 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. """Common abstract base classes for cudf.""" import numpy @@ -157,7 +157,7 @@ def host_serialize(self): header["writeable"] = len(frames) * (None,) frames = [ f.memoryview() if c else memoryview(f) - for c, f in zip(header["is-cuda"], frames) + for c, f in zip(header["is-cuda"], frames, strict=True) ] return header, frames @@ -183,7 +183,9 @@ def host_deserialize(cls, header, frames): """ frames = [ cudf.core.buffer.as_buffer(f) if c else f - for c, f in zip(header["is-cuda"], map(memoryview, frames)) + for c, f in zip( + header["is-cuda"], map(memoryview, frames), strict=True + ) ] obj = cls.device_deserialize(header, frames) return obj diff --git a/python/cudf/cudf/core/accessors/struct.py b/python/cudf/cudf/core/accessors/struct.py index aad74a4ffe4..3f81ca3ca2b 100644 --- a/python/cudf/cudf/core/accessors/struct.py +++ b/python/cudf/cudf/core/accessors/struct.py @@ -109,7 +109,7 @@ def explode(self) -> DataFrame: data = { name: col.copy(deep=True) for name, col in zip( - self._column.dtype.fields, self._column.children + self._column.dtype.fields, self._column.children, strict=True ) } rangeindex = len(data) == 0 diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f3d8a7798be..a2c21071bbe 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -459,7 +459,7 @@ def children(self) -> tuple[ColumnBase, ...]: ) self._children = tuple( # type: ignore[assignment] child._with_type_metadata(dtype) - for child, dtype in zip(children, dtypes) + for child, dtype in zip(children, dtypes, strict=True) ) return self._children # type: ignore[return-value] @@ -2062,7 +2062,8 @@ def serialize(self) -> tuple[dict, list]: frames.extend(mask_frames) if self.children: child_headers, child_frames = zip( - *(c.device_serialize() for c in self.children) + *(c.device_serialize() for c in self.children), + strict=True, ) header["subheaders"] = list(child_headers) frames.extend(chain(*child_frames)) @@ -3386,7 +3387,7 @@ def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: header_columns: list[tuple[dict, list]] = [ c.device_serialize() for c in columns ] - headers, column_frames = zip(*header_columns) + headers, column_frames = zip(*header_columns, strict=True) for f in column_frames: frames.extend(f) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 3fcebe118d5..08a0c20506e 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -440,7 +440,7 @@ def isocalendar(self) -> dict[str, ColumnBase]: return { field: self.strftime(format=directive).astype(np.dtype(np.uint32)) for field, directive in zip( - ["year", "week", "day"], ["%G", "%V", "%u"] + ["year", "week", "day"], ["%G", "%V", "%u"], strict=True ) } diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 79fd5e51f4f..d0a9f0389f6 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -111,7 +111,10 @@ def to_arrow(self) -> pa.Array: else self.dtype ) pa_type = pa.struct( - {field: child.type for field, child in zip(dtype.fields, 
children)} + { + field: child.type + for field, child in zip(dtype.fields, children, strict=True) + } ) if self.mask is not None: @@ -190,7 +193,10 @@ def _rename_fields(self, names) -> Self: but with the field names equal to `names`. """ dtype = StructDtype( - {name: col.dtype for name, col in zip(names, self.children)} + { + name: col.dtype + for name, col in zip(names, self.children, strict=True) + } ) return StructColumn( # type: ignore[return-value] data=None, diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 08c501d4747..c776dbc6b20 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -253,7 +253,9 @@ def _grouped_data(self) -> MutableMapping: return the underlying mapping as a nested mapping. """ if self.multiindex: - return _NestedGetItemDict.from_zip(zip(self.names, self.columns)) + return _NestedGetItemDict.from_zip( + zip(self.names, self.columns, strict=True) + ) else: return self._data @@ -383,7 +385,7 @@ def insert(self, name: Hashable, value: ColumnBase, loc: int = -1) -> None: else: new_keys = self.names[:loc] + (name,) + self.names[loc:] new_values = self.columns[:loc] + (value,) + self.columns[loc:] - self._data = dict(zip(new_keys, new_values)) + self._data = dict(zip(new_keys, new_values, strict=True)) self._clear_cache(old_ncols, old_ncols + 1) # The type(name) may no longer match the prior label_dtype @@ -458,7 +460,9 @@ def get_labels_by_index( "Cannot use Series object for mask iloc indexing" ) # TODO: Doesn't handle on-device columns - return tuple(n for n, keep in zip(self.names, index) if keep) + return tuple( + n for n, keep in zip(self.names, index, strict=True) if keep + ) else: if len(set(index)) != len(index): # type: ignore[arg-type] raise NotImplementedError( @@ -569,7 +573,9 @@ def _select_by_label_list_like(self, key: tuple) -> Self: ) data = dict( item - for item, keep in zip(self._grouped_data.items(), key) + for item, keep in zip( + self._grouped_data.items(), key, strict=True + ) if keep ) else: @@ -743,7 +749,7 @@ def rename_column(x): if not all(isinstance(label, old_type) for label in new_col_names): label_dtype = None - data = dict(zip(new_col_names, self.values())) + data = dict(zip(new_col_names, self.values(), strict=True)) return type(self)( data=data, level_names=self.level_names, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fd709d4616f..ff0fe0e0564 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -55,6 +55,7 @@ from cudf.core.column import ( CategoricalColumn, ColumnBase, + StringColumn, StructColumn, as_column, column_empty, @@ -145,12 +146,17 @@ def _recursively_update_struct_names( ) -> ColumnBase: """Update a Column with struct names from pylibcudf.io.TableWithMetadata.child_names""" if col.children: - children = list(col.children) - for i, (child, names) in enumerate( - zip(children, child_names.values()) - ): - children[i] = _recursively_update_struct_names(child, names) - col.set_base_children(tuple(children)) + if not child_names: + assert isinstance(col, StringColumn), ( + "Only string columns can have unnamed children" + ) + else: + children = list(col.children) + for i, (child, names) in enumerate( + zip(children, child_names.values(), strict=True) + ): + children[i] = _recursively_update_struct_names(child, names) + col.set_base_children(tuple(children)) if isinstance(col.dtype, StructDtype): col = col._rename_fields(child_names.keys()) # 
type: ignore[attr-defined] @@ -717,7 +723,9 @@ def _array_to_column_accessor( return ColumnAccessor( { column_label: as_column(data[:, i], nan_as_null=nan_as_null) - for column_label, i in zip(columns_labels, range(data.shape[1])) + for column_label, i in zip( + columns_labels, range(data.shape[1]), strict=True + ) }, verify=False, rangeindex=isinstance(columns_labels, pd.RangeIndex), @@ -764,7 +772,7 @@ def _mapping_to_column_accessor( ) data = data.copy() for key, aligned_series in zip( - values_as_series.keys(), aligned_input_series + values_as_series.keys(), aligned_input_series, strict=True ): if index is not None: aligned_series = aligned_series.reindex(index=index) @@ -1902,6 +1910,7 @@ def _concat( zip( indices[:first_data_column_position], cols[:first_data_column_position], + strict=True, ) ) ) @@ -1911,6 +1920,7 @@ def _concat( zip( indices[first_data_column_position:], cols[first_data_column_position:], + strict=True, ) ), index=table_index, @@ -1984,11 +1994,14 @@ def _concat( # Reassign the categories for any categorical index cols if not isinstance(out.index, cudf.RangeIndex): - _reassign_categories( - categories, - out.index._data, - indices[:first_data_column_position], - ) + # If the index column was constructed and not generated via concatenation, + # then reassigning categories is neither needed nor a valid operation. + if first_data_column_position > 0: + _reassign_categories( + categories, + out.index._data, + indices[:first_data_column_position], + ) if not isinstance(out.index, MultiIndex) and isinstance( out.index.dtype, CategoricalDtype ): @@ -2225,7 +2238,7 @@ def _fill_same_ca_attributes( "whose columns & index are same respectively, " "please reindex." ) - rhs = dict(zip(other_pd_index, other.values_host)) + rhs = dict(zip(other_pd_index, other.values_host, strict=True)) # For keys in right but not left, perform binops between NaN (not # NULL!) and the right value (result is NaN). 
left_default = as_column(np.nan, length=len(self)) @@ -2853,7 +2866,7 @@ def columns(self, columns): ) self._data = ColumnAccessor( - data=dict(zip(pd_columns, self._columns)), + data=dict(zip(pd_columns, self._columns, strict=True)), multiindex=multiindex, level_names=level_names, label_dtype=label_dtype, @@ -2875,7 +2888,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: f"got {len(self)} elements" ) self._data = ColumnAccessor( - data=dict(zip(other.names, self._columns)), + data=dict(zip(other.names, self._columns, strict=True)), multiindex=other.multiindex, rangeindex=other.rangeindex, level_names=other.level_names, @@ -3284,7 +3297,7 @@ def where( out = [] for (name, col), other_col in zip( - self._column_labels_and_values, other_cols + self._column_labels_and_values, other_cols, strict=True ): if cond_col := cond._data.get(name): out.append(col.where(cond_col, other_col, inplace)) @@ -5895,7 +5908,7 @@ def to_arrow(self, preserve_index: bool | None = None) -> pa.Table: ) data = data.copy(deep=False) for gen_name, col_name in zip( - index_descr, index._column_names + index_descr, index._column_names, strict=True ): data._insert( data.shape[1], @@ -7632,6 +7645,7 @@ def unnamed_group_generator(): else [ stacked[i] for i in unnamed_level_values.argsort() ], + strict=True, ) ), isinstance(unnamed_level_values, pd.MultiIndex), @@ -8272,7 +8286,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): exprs.append(e.strip()) ret = self if inplace else self.copy(deep=False) - for name, expr in zip(targets, exprs): + for name, expr in zip(targets, exprs, strict=True): ret._data[name] = self._compute_column(expr) if not inplace: return ret @@ -8450,10 +8464,11 @@ def from_pylibcudf( ColumnBase.from_pylibcudf(plc_col, data_ptr_exposed=True) for plc_col in plc_columns ) + # We only have child names if the source is a pylibcudf.io.TableWithMetadata. if child_names is not None: cudf_cols = ( - _recursively_update_struct_names(col, child_names) - for col, child_names in zip( + _recursively_update_struct_names(col, cn) + for col, cn in zip( cudf_cols, child_names.values(), strict=True ) ) @@ -8792,7 +8807,7 @@ def _setitem_with_dataframe( ): replace_df = replace_df.reindex(input_df.index) - for col_1, col_2 in zip(input_cols, replace_df._column_names): + for col_1, col_2 in zip(input_cols, replace_df._column_names, strict=True): if col_1 in input_df._column_names: if mask is not None: input_df._data[col_1][mask] = as_column(replace_df[col_2]) @@ -8848,11 +8863,11 @@ def _index_from_listlike_of_series( # Create a dictionary of the common, non-null columns def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): # A mapping of {idx: np.dtype} - dtypes = dict() + dtypes = {} # A mapping of {idx: [...columns]}, where `[...columns]` # is a list of columns with at least one valid value for each # column name across all input frames - non_null_columns = dict() + non_null_columns = {} for idx in col_idxs: for cols in list_of_columns: # Skip columns not in this frame @@ -8877,8 +8892,10 @@ def _find_common_dtypes_and_categories( ) -> dict[Any, ColumnBase]: # A mapping of {idx: categories}, where `categories` is a # column of all the unique categorical values from each - # categorical column across all input frames - categories = dict() + # categorical column across all input frames. This function + # also modifies the input dtypes dictionary in place to capture + # the common dtype across columns being concatenated. 
+ categories = {} for idx, cols in non_null_columns.items(): # default to the first non-null dtype dtypes[idx] = cols[0].dtype @@ -8927,7 +8944,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): def _reassign_categories(categories, cols, col_idxs): - for name, idx in zip(cols, col_idxs): + for name, idx in zip(cols, col_idxs, strict=True): if idx in categories: codes = as_unsigned_codes(len(categories[idx]), cols[name]) cols[name] = CategoricalColumn( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 60cc419e3b6..4b67544ce5a 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -718,7 +718,7 @@ def _recursively_replace_fields(self, result: dict) -> dict: """ new_result = {} for (new_field, field_dtype), result_value in zip( - self.fields.items(), result.values() + self.fields.items(), result.values(), strict=True ): if isinstance(field_dtype, StructDtype) and isinstance( result_value, dict diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c458dec5a67..0c627f3ea84 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -93,7 +93,7 @@ def _columns(self) -> tuple[ColumnBase, ...]: def _column_labels_and_values( self, ) -> Iterable[tuple[Hashable, ColumnBase]]: - return zip(self._column_names, self._columns) + return zip(self._column_names, self._columns, strict=True) @property def _dtypes(self) -> Generator[tuple[Hashable, Dtype], None, None]: @@ -129,7 +129,8 @@ def serialize(self): if isinstance(cname, np.generic) else (cname, "") for cname in self._column_names - ] + ], + strict=True, ) if self._column_names else ((), ()) @@ -174,11 +175,13 @@ def deserialize(cls, header, frames): column_names = [ getattr(np, cntype)(cname) if cntype != "" else cname for cname, cntype in zip( - header["column_names"], header["column_names_numpy_type"] + header["column_names"], + header["column_names_numpy_type"], + strict=True, ) ] col_accessor = ColumnAccessor( - data=dict(zip(column_names, columns)), **kwargs + data=dict(zip(column_names, columns, strict=True)), **kwargs ) return cls._from_data(col_accessor) @@ -219,7 +222,7 @@ def _from_columns_like_self( """ if column_names is None: column_names = self._column_names - data = dict(zip(column_names, columns)) + data = dict(zip(column_names, columns, strict=True)) frame = self.__class__._from_data(data) return frame._copy_type_metadata(self) @@ -1077,7 +1080,9 @@ def from_arrow(cls, data: pa.Table) -> Self: } for name, plc_codes in zip( - dict_indices_table.column_names, plc_indices.columns() + dict_indices_table.column_names, + plc_indices.columns(), + strict=True, ): codes = ColumnBase.from_pylibcudf(plc_codes) categories = cudf_dictionaries_columns[name] @@ -1097,7 +1102,9 @@ def from_arrow(cls, data: pa.Table) -> Self: cudf_non_category_frame = { name: ColumnBase.from_pylibcudf(plc_col) for name, plc_col in zip( - data.column_names, plc.Table.from_arrow(data).columns() + data.column_names, + plc.Table.from_arrow(data).columns(), + strict=True, ) } @@ -1199,7 +1206,9 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: See `ColumnBase._with_type_metadata` for more information. 
""" for (name, self_col), (_, other_col) in zip( - self._column_labels_and_values, other._column_labels_and_values + self._column_labels_and_values, + other._column_labels_and_values, + strict=True, ): self._data.set_by_label( name, @@ -1471,19 +1480,23 @@ def searchsorted( # https://github.com/pandas-dev/pandas/issues/54668 common_dtype_list = [ find_common_type([col.dtype, val.dtype]) - for col, val in zip(self._columns, values) + for col, val in zip(self._columns, values, strict=True) ] sources = [ col if is_dtype_equal(col.dtype, common_dtype) else col.astype(common_dtype) - for col, common_dtype in zip(self._columns, common_dtype_list) + for col, common_dtype in zip( + self._columns, common_dtype_list, strict=True + ) ] values = [ val if is_dtype_equal(val.dtype, common_dtype) else val.astype(common_dtype) - for val, common_dtype in zip(values, common_dtype_list) + for val, common_dtype in zip( + values, common_dtype_list, strict=True + ) ] outcol = ColumnBase.from_pylibcudf( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 36809a35066..db79e26782f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -574,7 +574,11 @@ def groups(self): ) return dict( - zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1])) + zip( + group_names.to_pandas(), + grouped_index._split(offsets[1:-1]), + strict=True, + ) ) @cached_property @@ -608,7 +612,11 @@ def indices(self) -> dict[ScalarLike, cupy.ndarray]: else: index = Index._from_column(group_keys[0]) return dict( - zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1])) + zip( + index.to_pandas(), + cp.split(indices.values, offsets[1:-1]), + strict=True, + ) ) @_performance_tracking @@ -807,7 +815,9 @@ def _aggregate( requests = [] result_columns: list[list[ColumnBase]] = [] - for i, (col, aggs) in enumerate(zip(values, aggregations)): + for i, (col, aggs) in enumerate( + zip(values, aggregations, strict=True) + ): valid_aggregations = get_valid_aggregation(col.dtype) included_aggregations_i = [] col_aggregations = [] @@ -845,7 +855,7 @@ def _aggregate( else self._groupby.plc_groupby.aggregate(requests) ) - for i, result, val in zip(column_included, results, values): + for i, result in zip(column_included, results, strict=True): result_columns[i] = [ ColumnBase.from_pylibcudf(col) for col in result.columns() ] @@ -866,7 +876,7 @@ def _shift( pa_scalar_to_plc_scalar( pa.scalar(val, type=cudf_dtype_to_pa_type(col.dtype)) ) - for val, col in zip(fill_values, values) + for val, col in zip(fill_values, values, strict=True) ], ) return (ColumnBase.from_pylibcudf(col) for col in shifts.columns()) @@ -1012,8 +1022,9 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): included_aggregations, result_columns, orig_dtypes, + strict=True, ): - for agg_tuple, col in zip(aggs, cols): + for agg_tuple, col in zip(aggs, cols, strict=True): agg, agg_kind = agg_tuple agg_name = agg.__name__ if callable(agg) else agg if multilevel: @@ -1078,11 +1089,11 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): right_cols = result_index._columns join_keys = [ _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(left_cols, right_cols) + for lcol, rcol in zip(left_cols, right_cols, strict=True) ] # TODO: In future, see if we can centralize # logic else where that has similar patterns. 
- join_keys = map(list, zip(*join_keys)) + join_keys = map(list, zip(*join_keys, strict=True)) # By construction, left and right keys are related by # a permutation, so we can use an inner join. with acquire_spill_lock(): @@ -1563,7 +1574,9 @@ def sample( # Empirically shuffling with cupy is faster at this scale rs = cp.random.get_random_state() rs.seed(seed=random_state) - for off, size in zip(group_offsets, size_per_group): + for off, size in zip( + group_offsets[:-1], size_per_group, strict=True + ): rs.shuffle(indices[off : off + size]) else: keys = cp.random.default_rng(seed=random_state).random( @@ -1707,7 +1720,8 @@ def _raise_invalid_type(x): if isinstance(x, tuple) else _raise_invalid_type(x) for x in kwargs.values() - ) + ), + strict=True, ) else: raise TypeError("Must provide at least one aggregation function.") @@ -2570,7 +2584,7 @@ def interleave_columns(source_columns): res = DataFrame._from_data( { x: interleave_columns([gb_cov_corr._data[y] for y in ys]) - for ys, x in zip(cols_split, column_names) + for ys, x in zip(cols_split, column_names, strict=True) } ) @@ -2737,6 +2751,7 @@ def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: zip( values._column_names, self._replace_nulls(values._columns, method), + strict=True, ) ) ) @@ -2906,6 +2921,7 @@ def shift( zip( values._column_names, self._shift(values._columns, periods, fill_value), + strict=True, ) ) ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8d46e435604..cc795fad2b1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2645,7 +2645,7 @@ def _data(self) -> ColumnAccessor: def _column_labels_and_values( self, ) -> Iterable[tuple[Hashable, ColumnBase]]: - return zip(self._column_names, self._columns) + return zip(self._column_names, self._columns, strict=True) @_performance_tracking def _as_int_index(self) -> Index: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index d5200ed8b14..f7ef0f0db12 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -372,7 +372,7 @@ def _from_columns_like_self( else: index.name = index_names[0] - data = dict(zip(column_names, data_columns)) + data = dict(zip(column_names, data_columns, strict=True)) frame = type(self)._from_data(data, index) return frame._copy_type_metadata(self) @@ -976,7 +976,7 @@ def clip(self, lower=None, upper=None, axis=1, inplace=False): data = ( col.clip(low, high) - for col, low, high in zip(self._columns, lower, upper) + for col, low, high in zip(self._columns, lower, upper, strict=True) ) output = self._from_data_like_self( self._data._from_columns_like_self(data) @@ -2697,7 +2697,7 @@ def sort_index( ) else: ca = ColumnAccessor( - dict(zip(labels, result_columns)), + dict(zip(labels, result_columns, strict=True)), rangeindex=self._data.rangeindex, multiindex=self._data.multiindex, level_names=self._data.level_names, @@ -3743,6 +3743,7 @@ def _reindex( for left_dtype, right_dtype in zip( (dtype for _, dtype in df.index._dtypes), (dtype for _, dtype in index._dtypes), + strict=True, ) ) @@ -5320,12 +5321,18 @@ def _explode(self, explode_column: Any, ignore_index: bool): if i == column_index else new_column._with_type_metadata(old_column.dtype) for i, (new_column, old_column) in enumerate( - zip(exploded, itertools.chain(idx_cols, self._columns)) + zip( + exploded, + itertools.chain(idx_cols, self._columns), + strict=True, + ) ) ] data = type(self._data)( - dict(zip(self._column_names, 
exploded[len(idx_cols) :])), + dict( + zip(self._column_names, exploded[len(idx_cols) :], strict=True) + ), multiindex=self._data.multiindex, level_names=self._data.level_names, rangeindex=self._data.rangeindex, @@ -6411,7 +6418,9 @@ def rank( if dropped_cols: result = type(source)._from_data( ColumnAccessor( - dict(zip(source._column_names, result_columns)), + dict( + zip(source._column_names, result_columns, strict=True) + ), multiindex=source._data.multiindex, level_names=source._data.level_names, label_dtype=source._data.label_dtype, diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 43fbe73170e..43c73171c0c 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -233,7 +233,9 @@ def __init__( if on else { lkey.name - for lkey, rkey in zip(self._left_keys, self._right_keys) + for lkey, rkey in zip( + self._left_keys, self._right_keys, strict=True + ) if lkey.name == rkey.name and not ( isinstance(lkey, _IndexIndexer) @@ -276,7 +278,7 @@ def _gather_maps(self, left_cols, right_cols): as_column(range(n), dtype=SIZE_TYPE_DTYPE).take( map_, nullify=null, check_bounds=False ) - for map_, n, null in zip(maps, lengths, nullify) + for map_, n, null in zip(maps, lengths, nullify, strict=True) ] if self.how == "right": # If how is right, right map is primary sort key. @@ -296,7 +298,9 @@ def perform_merge(self) -> DataFrame: left_join_cols = [] right_join_cols = [] - for left_key, right_key in zip(self._left_keys, self._right_keys): + for left_key, right_key in zip( + self._left_keys, self._right_keys, strict=True + ): lcol = left_key.get(self.lhs) rcol = right_key.get(self.rhs) lcol_casted, rcol_casted = _match_join_keys(lcol, rcol, self.how) @@ -405,7 +409,9 @@ def _merge_results(self, left_result: DataFrame, right_result: DataFrame): # combined by filling nulls in the left key column with corresponding # values from the right key column: if self.how == "outer": - for lkey, rkey in zip(self._left_keys, self._right_keys): + for lkey, rkey in zip( + self._left_keys, self._right_keys, strict=True + ): if lkey.name == rkey.name: # fill nulls in lhs from values in the rhs lkey.set( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f3bdfce4321..e811ecea30c 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -200,7 +200,9 @@ def __init__( new_codes.append(new_code) source_data: dict[Hashable, column.ColumnBase] = {} - for i, (code, level) in enumerate(zip(new_codes, new_levels)): + for i, (code, level) in enumerate( + zip(new_codes, new_levels, strict=True) + ): if len(code): lo, hi = code.minmax() if lo < -1 or hi > len(level) - 1: @@ -263,7 +265,7 @@ def names(self, value): # definitely buggy, but we can't disallow non-unique # names either... 
self._data = type(self._data)( - dict(zip(value, self._columns)), + dict(zip(value, self._columns, strict=True)), level_names=self._data.level_names, verify=False, ) @@ -549,7 +551,7 @@ def __repr__(self) -> str: preprocess = self arrays = [] - for name, col in zip(self.names, preprocess._columns): + for name, col in zip(self.names, preprocess._columns, strict=True): try: pd_idx = col.to_pandas(nullable=True) except NotImplementedError: @@ -637,7 +639,7 @@ def levels(self) -> list[cudf.Index]: self._maybe_materialize_codes_and_levels() return [ idx.rename(name) # type: ignore[misc] - for idx, name in zip(self._levels, self.names) # type: ignore[arg-type] + for idx, name in zip(self._levels, self.names, strict=True) # type: ignore[arg-type] ] @property # type: ignore @@ -966,7 +968,7 @@ def __eq__(self, other): [ self_col.equals(other_col) for self_col, other_col in zip( - self._columns, other._columns + self._columns, other._columns, strict=True ) ] ) @@ -1116,7 +1118,13 @@ def to_frame( if len(column_names) != len(set(column_names)): raise ValueError("Duplicate column names are not allowed") ca = ColumnAccessor( - dict(zip(column_names, (col.copy() for col in self._columns))), + dict( + zip( + column_names, + (col.copy() for col in self._columns), + strict=True, + ) + ), verify=False, ) return cudf.DataFrame._from_data( @@ -1972,9 +1980,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): join_keys = [ _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(target._columns, self._columns) + for lcol, rcol in zip(target._columns, self._columns, strict=True) ] - join_keys = map(list, zip(*join_keys)) + join_keys = map(list, zip(*join_keys, strict=True)) with acquire_spill_lock(): plc_tables = [ plc.Table([col.to_pylibcudf(mode="read") for col in cols]) @@ -2081,7 +2089,9 @@ def _maybe_match_names(self, other) -> list[Hashable]: return [None] * self.nlevels return [ self_name if _is_same_name(self_name, other_name) else None - for self_name, other_name in zip(self.names, other.names) + for self_name, other_name in zip( + self.names, other.names, strict=True + ) ] @_performance_tracking @@ -2177,7 +2187,9 @@ def _split_columns_by_levels( lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } - for i, (name, col) in enumerate(zip(self.names, self._columns)): + for i, (name, col) in enumerate( + zip(self.names, self._columns, strict=True) + ): if in_levels and i in level_indices: name = f"level_{i}" if name is None else name yield name, col @@ -2218,7 +2230,9 @@ def _columns_for_reset_index( ) -> Generator[tuple[Any, column.ColumnBase], None, None]: """Return the columns and column names for .reset_index""" if levels is None: - for i, (col, name) in enumerate(zip(self._columns, self.names)): + for i, (col, name) in enumerate( + zip(self._columns, self.names, strict=True) + ): yield f"level_{i}" if name is None else name, col else: yield from self._split_columns_by_levels(levels, in_levels=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 02fac2c0abc..74337fd5284 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -463,7 +463,7 @@ def concat( "label types in cuDF at this time. You must convert " "the labels to the same type." 
) - for k, o in zip(keys_objs, objs): + for k, o in zip(keys_objs, objs, strict=True): for name, col in o._column_labels_and_values: # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it @@ -834,14 +834,14 @@ def get_dummies( elif isinstance(prefix, dict): prefix_map = prefix else: - prefix_map = dict(zip(columns, prefix)) + prefix_map = dict(zip(columns, prefix, strict=True)) if isinstance(prefix_sep, str): prefix_sep_map = {} elif isinstance(prefix_sep, dict): prefix_sep_map = prefix_sep else: - prefix_sep_map = dict(zip(columns, prefix_sep)) + prefix_sep_map = dict(zip(columns, prefix_sep, strict=True)) # If we have no columns to encode, we need to drop # fallback columns(if any) @@ -1030,6 +1030,7 @@ def as_tuple(x): target_col.split_by_offsets( list(range(nrows, new_size, nrows)) ), + strict=True, ) ) ) @@ -1335,7 +1336,9 @@ def _one_hot_encode_column( x if x is not None else "" for x in categories.to_arrow().to_pylist() ) - data = dict(zip(result_labels, column.one_hot_encode(categories))) + data = dict( + zip(result_labels, column.one_hot_encode(categories), strict=True) + ) if drop_first and len(data): data.pop(next(iter(data))) @@ -1463,8 +1466,8 @@ def crosstab( raise ValueError("colnames must be unique") data = { - **dict(zip(rownames, map(as_column, index))), - **dict(zip(colnames, map(as_column, columns))), + **dict(zip(rownames, map(as_column, index), strict=True)), + **dict(zip(colnames, map(as_column, columns), strict=True)), } df = cudf.DataFrame._from_data(data) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ff4fe8866bd..01cf0905723 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -99,6 +99,7 @@ def _describe_numeric(obj, percentiles): zip( _format_percentile_names(percentiles), obj.quantile(percentiles).to_numpy(na_value=np.nan).tolist(), + strict=True, ) ), "max": obj.max(), @@ -120,6 +121,7 @@ def _describe_timetype(obj, percentiles, typ): .astype(CUDF_STRING_DTYPE) .to_numpy(na_value=np.nan) .tolist(), + strict=True, ) ), "max": str(typ(obj.max())), diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index dffd7db2f71..eaa1d4c76b0 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
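
The get_dummies and crosstab hunks in reshape.py above all build dicts by zipping label lists against value lists; a short hedged sketch (hypothetical inputs, not cudf code) of what strict=True buys there:

    columns = ["a", "b", "c"]
    prefix = ["col_a", "col_b"]  # one prefix short of the columns
    # dict(zip(columns, prefix)) would quietly yield a two-entry map and the
    # mistake would only surface much later; strict=True fails at the source:
    try:
        prefix_map = dict(zip(columns, prefix, strict=True))
    except ValueError:
        print("prefix must supply one entry per encoded column")
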
from __future__ import annotations from typing import Any @@ -204,7 +204,8 @@ def generic(self, args, kws): if funcs := call_cuda_functions.get(self.key.__name__): for sig in funcs.keys(): if all( - arg.group_scalar_type == ty for arg, ty in zip(args, sig) + arg.group_scalar_type == ty + for arg, ty in zip(args, sig, strict=True) ): return nb_signature(sig[0], *args) raise UDFError(self.make_error_string(args)) @@ -242,7 +243,7 @@ def generic(self, args, kws): retty, selfty, *argtys = sig if self.this.group_scalar_type == selfty and all( arg.group_scalar_type == ty - for arg, ty in zip(args, argtys) + for arg, ty in zip(args, argtys, strict=True) ): return nb_signature(retty, *args, recvr=self.this) raise UDFError(self.make_error_string(args)) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 28a6bc53ba3..fc22b603dd3 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -313,7 +313,7 @@ def _dtype_to_names_list(col: ColumnBase) -> list[tuple[Hashable, Any]]: if isinstance(col.dtype, StructDtype): return [ (name, _dtype_to_names_list(child)) - for name, child in zip(col.dtype.fields, col.children) + for name, child in zip(col.dtype.fields, col.children, strict=True) ] elif isinstance(col.dtype, ListDtype): return [("", _dtype_to_names_list(child)) for child in col.children] diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 488e76304e1..cbab80a9ee7 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -75,7 +75,7 @@ def read_orc_statistics( file_statistics = { column_name: column_stats for column_name, column_stats in zip( - column_names, parsed_file_statistics + column_names, parsed_file_statistics, strict=True ) if columns is None or column_name in columns } @@ -86,7 +86,7 @@ def read_orc_statistics( stripe_statistics = { column_name: column_stats for column_name, column_stats in zip( - column_names, parsed_stripe_statistics + column_names, parsed_stripe_statistics, strict=True ) if columns is None or column_name in columns } @@ -688,7 +688,7 @@ def _set_col_children_metadata( ) -> None: if isinstance(col.dtype, StructDtype): for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) + zip(col.children, list(col.dtype.fields), strict=True) ): col_meta.child(i).set_name(name) _set_col_children_metadata( diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 5e14065c08f..7bcb80ffbdc 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -709,7 +709,9 @@ def _process_dataset( raise ValueError( "Cannot specify a row_group selection for a directory path." ) - row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)} + row_groups_map = { + path: rgs for path, rgs in zip(paths, row_groups, strict=True) + } # Apply filters and discover partition columns partition_keys = [] @@ -1091,7 +1093,9 @@ def _parquet_to_frame( # unique set of partition keys. 
Therefore, we start by # aggregating all paths with matching keys using a dict plan = {} - for i, (keys, path) in enumerate(zip(partition_keys, paths_or_buffers)): + for i, (keys, path) in enumerate( + zip(partition_keys, paths_or_buffers, strict=True) + ): rgs = row_groups[i] if row_groups else None tkeys = tuple(keys) if tkeys in plan: @@ -1237,7 +1241,9 @@ def _read_parquet( data = { name: ColumnBase.from_pylibcudf(col) - for name, col in zip(column_names, concatenated_columns) + for name, col in zip( + column_names, concatenated_columns, strict=True + ) } df = DataFrame._from_data(data) ioutils._add_df_col_struct_names(df, child_names) @@ -1515,7 +1521,7 @@ def _get_partitioned( subdir = fs.sep.join( [ _hive_dirname(name, val) - for name, val in zip(partition_cols, keys) + for name, val in zip(partition_cols, keys, strict=True) ] ) prefix = fs.sep.join([root_path, subdir]) @@ -1951,7 +1957,9 @@ def write_table(self, df): subdir = fs.sep.join( [ f"{name}={val}" - for name, val in zip(self.partition_cols, keys) + for name, val in zip( + self.partition_cols, keys, strict=True + ) ] ) prefix = fs.sep.join([self.path, subdir]) @@ -2029,6 +2037,7 @@ def write_table(self, df): paths, partition_info, metadata_file_paths, + strict=True, ): if path in self.path_cw_map: # path is a currently open file cw_idx = self.path_cw_map[path] @@ -2049,7 +2058,7 @@ def write_table(self, df): if new_cw_paths: # Create new cw for unhandled paths encountered in this write_table - new_paths, part_info, meta_paths = zip(*new_cw_paths) + new_paths, part_info, meta_paths = zip(*new_cw_paths, strict=True) self._chunked_writers.append( ( ParquetWriter(new_paths, **self.common_args), @@ -2167,7 +2176,7 @@ def _set_col_metadata( if isinstance(col.dtype, StructDtype): for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) + zip(col.children, list(col.dtype.fields), strict=True) ): col_meta.child(i).set_name(name) _set_col_metadata( diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index f3d66e4ba67..a728b8582eb 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -387,7 +387,7 @@ def __init__(self, *args) -> None: "[(pat, val), ...])." 
) - self.ops = tuple(zip(args[::2], args[1::2])) + self.ops = tuple(zip(args[::2], args[1::2], strict=True)) def __enter__(self) -> None: self.undo = tuple((pat, get_option(pat)) for pat, _ in self.ops) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 4dc5b3d42ab..c48e4058c02 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -5282,7 +5282,6 @@ def pytest_unconfigure(config): "tests/frame/methods/test_rename.py::TestRename::test_rename_axis_style_raises", "tests/frame/methods/test_rename.py::TestRename::test_rename_inplace", "tests/frame/methods/test_rename.py::TestRename::test_rename_mapper_and_positional_arguments_raises", - "tests/frame/methods/test_rename.py::TestRename::test_rename_multiindex", "tests/frame/methods/test_rename.py::TestRename::test_rename_no_mappings_raises", "tests/frame/methods/test_rename.py::TestRename::test_rename_nocopy", "tests/frame/methods/test_replace.py::TestDataFrameReplace::test_replace_NA_with_None", diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 41401ab4bde..4c662808b9c 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -295,7 +295,9 @@ def get_ptr(x) -> int: assert lhs.offset == rhs.offset assert lhs.size == rhs.size assert len(lhs.base_children) == len(rhs.base_children) - for lhs_child, rhs_child in zip(lhs.base_children, rhs.base_children): + for lhs_child, rhs_child in zip( + lhs.base_children, rhs.base_children, strict=True + ): assert_column_memory_eq(lhs_child, rhs_child) if isinstance(lhs, cudf.core.column.CategoricalColumn) and isinstance( rhs, cudf.core.column.CategoricalColumn diff --git a/python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py b/python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py index d1de7245634..ea007eafdf7 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
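
The options.py change above pairs a flat (pat1, val1, pat2, val2, ...) argument list into tuples. A small sketch of the slicing idiom, with illustrative option names that are assumptions rather than quotes from cudf:

    args = ("mode.pandas_compatible", True, "default_integer_bitwidth", 32)
    # Even-index slice gives patterns, odd-index slice gives values; for an odd
    # number of arguments the slices differ in length and strict zip raises,
    # backing up the explicit even-length check in the constructor:
    ops = tuple(zip(args[::2], args[1::2], strict=True))
    # (('mode.pandas_compatible', True), ('default_integer_bitwidth', 32))
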
import pandas as pd import pytest @@ -33,7 +33,7 @@ def test_convert_dtypes(): df = pd.DataFrame( { k: pd.Series(v, dtype=d) - for k, v, d in zip(data.keys(), data.values(), dtypes) + for k, v, d in zip(data.keys(), data.values(), dtypes, strict=True) } ) gdf = cudf.DataFrame.from_pandas(df) diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py index 6fb32ae36b9..64bba6f4404 100644 --- a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py +++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py @@ -14,7 +14,9 @@ def test_groupby_maintain_order_random(with_nulls): key_names = [f"key{key}" for key in range(nkeys)] key_values = [rng.integers(100, size=nrows) for _ in key_names] value = rng.integers(-100, 100, size=nrows) - df = cudf.DataFrame(dict(zip(key_names, key_values), value=value)) + df = cudf.DataFrame( + dict(zip(key_names, key_values, strict=True), value=value) + ) if with_nulls: for key in key_names: df.loc[df[key] == 1, key] = None diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py index 1bae3b8292c..2253ad085c5 100644 --- a/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_copy.py @@ -95,7 +95,7 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): child.base_data.get_ptr(mode="read") for child in rchildren ] - assert all((x == y) for x, y in zip(lptrs, rptrs)) + assert all((x == y) for x, y in zip(lptrs, rptrs, strict=True)) elif isinstance(data, pd.MultiIndex): data = cudf.MultiIndex.from_pandas(data) @@ -113,16 +113,22 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels ] - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) + assert all( + (x == y) == same_ref for x, y in zip(lptrs, rptrs, strict=True) + ) # Assert ._codes identity lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) + assert all( + (x == y) == same_ref for x, y in zip(lptrs, rptrs, strict=True) + ) # Assert ._data identity lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) + assert all( + (x == y) == same_ref for x, y in zip(lptrs, rptrs, strict=True) + ) diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py b/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py index d26e7216a73..54be9e670aa 100644 --- a/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py @@ -50,7 +50,9 @@ def test_bool_raises(): def test_multi_index_contains_hashable(): - gidx = cudf.MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])) + gidx = cudf.MultiIndex.from_tuples( + zip(["foo", "bar", "baz"], [1, 2, 3], strict=True) + ) pidx = gidx.to_pandas() assert_exceptions_equal( @@ -66,7 +68,9 @@ def test_multiindex_codes(): [("a", "b"), ("a", "c"), ("b", "c")], names=["A", "Z"] ) - for p_array, g_array in zip(midx.to_pandas().codes, midx.codes): + for p_array, g_array in zip( + midx.to_pandas().codes, midx.codes, strict=True + ): assert_eq(p_array, g_array) diff --git 
a/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py b/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py index 3db599f49e5..2afb5c4f179 100644 --- a/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py +++ b/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py @@ -86,7 +86,7 @@ def test_multiindex_types(): def test_multiindex_from_tuples(): arrays = [["a", "a", "b", "b"], ["house", "store", "house", "store"]] - tuples = list(zip(*arrays)) + tuples = list(zip(*arrays, strict=True)) pmi = pd.MultiIndex.from_tuples(tuples) gmi = cudf.MultiIndex.from_tuples(tuples) assert_eq(pmi, gmi) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index 72080a5be3f..abc3c105320 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -118,7 +118,7 @@ def test_ufunc_index(request, ufunc): expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) if ufunc.nout > 1: - for g, e in zip(got, expect): + for g, e in zip(got, expect, strict=True): assert_eq(g, e, check_exact=False) else: assert_eq(got, expect, check_exact=False) @@ -145,7 +145,7 @@ def test_binary_ufunc_index_array(ufunc, reflect): expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) if ufunc.nout > 1: - for g, e in zip(got, expect): + for g, e in zip(got, expect, strict=True): if reflect: assert (cp.asnumpy(g) == e).all() else: @@ -241,7 +241,7 @@ def test_ufunc_series(request, ufunc, has_nulls, indexed): expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) if ufunc.nout > 1: - for g, e in zip(got, expect): + for g, e in zip(got, expect, strict=True): if has_nulls: e[mask] = np.nan assert_eq(g, e, check_exact=False) @@ -336,7 +336,7 @@ def test_binary_ufunc_series_array( expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) if ufunc.nout > 1: - for g, e in zip(got, expect): + for g, e in zip(got, expect, strict=True): if has_nulls: e[mask] = np.nan if reflect: @@ -457,7 +457,7 @@ def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) if ufunc.nout > 1: - for g, e in zip(got, expect): + for g, e in zip(got, expect, strict=True): if has_nulls: e[mask] = np.nan assert_eq(g, e, check_exact=False) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 8d4f6091df0..75cd40aa436 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -37,6 +37,7 @@ def pd_str_cat(): def test_categorical_basic(): cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) cudf_cat = cudf.Index(cat) + assert_eq(cat.codes, cudf_cat.codes.to_numpy()) pdsr = pd.Series(cat, index=["p", "q", "r", "s", "t"]) sr = cudf.Series(cat, index=["p", "q", "r", "s", "t"]) @@ -50,16 +51,7 @@ def test_categorical_basic(): pdsr.cat.codes.values, sr.cat.codes.to_numpy() ) - string = str(sr) - expect_str = """ -p a -q a -r b -s c -t a -""" - assert all(x == y for x, y in zip(string.split(), expect_str.split())) - assert_eq(cat.codes, cudf_cat.codes.to_numpy()) + assert str(sr) == str(pdsr) def test_categorical_integer(): diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 27ec4fcd1f3..023dbbd8daf 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2025, NVIDIA CORPORATION. import pandas as pd @@ -34,7 +34,7 @@ def check_ca_equal(lhs, rhs): assert lhs.multiindex == rhs.multiindex assert lhs.rangeindex == rhs.rangeindex assert lhs.label_dtype == rhs.label_dtype - for l_key, r_key in zip(lhs, rhs): + for l_key, r_key in zip(lhs, rhs, strict=True): assert l_key == r_key assert_eq(lhs[l_key], rhs[r_key]) @@ -102,7 +102,7 @@ def test_iter(simple_data): yields column names. """ ca = ColumnAccessor(simple_data) - for expect_key, got_key in zip(simple_data, ca): + for expect_key, got_key in zip(simple_data, ca, strict=True): assert expect_key == got_key diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 9494d22a158..4f9ca1b4261 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -122,7 +122,7 @@ def gdf_np_dtypes(): np.uint32, np.uint64, ] - return dict(zip(gdf_dtypes, np_dtypes)) + return dict(zip(gdf_dtypes, np_dtypes, strict=True)) @pytest.fixture diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py index 80c794cb0f4..f7b0af9e51a 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/test_cuda_apply.py @@ -16,7 +16,8 @@ def _kernel_multiply(a, b, out): - for i, (x, y) in enumerate(zip(a, b)): + # numba doesn't support zip(..., strict=True), so we must tell ruff to ignore it. + for i, (x, y) in enumerate(zip(a, b)): # noqa: B905 out[i] = x * y @@ -71,7 +72,7 @@ def test_df_apply_rows(): nelem = 20 def kernel(in1, in2, in3, out1, out2, extra1, extra2): - for i, (x, y, z) in enumerate(zip(in1, in2, in3)): + for i, (x, y, z) in enumerate(zip(in1, in2, in3)): # noqa: B905 out1[i] = extra2 * x - extra1 * y out2[i] = y - extra1 * z @@ -106,7 +107,7 @@ def test_df_apply_chunks(chunksize): nelem = 20 def kernel(in1, in2, in3, out1, out2, extra1, extra2): - for i, (x, y, z) in enumerate(zip(in1, in2, in3)): + for i, (x, y, z) in enumerate(zip(in1, in2, in3)): # noqa: B905 out1[i] = extra2 * x - extra1 * y + z out2[i] = i @@ -140,7 +141,7 @@ def test_df_apply_custom_chunks(): nelem = 20 def kernel(in1, in2, in3, out1, out2, extra1, extra2): - for i, (x, y, z) in enumerate(zip(in1, in2, in3)): + for i, (x, y, z) in enumerate(zip(in1, in2, in3)): # noqa: B905 out1[i] = extra2 * x - extra1 * y + z out2[i] = i @@ -157,7 +158,10 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): expect_out1 = extra2 * in1 - extra1 * in2 + in3 expect_out2 = np.hstack( - [np.arange(e - s) for s, e in zip(chunks, chunks[1:] + [len(df)])] + [ + np.arange(e - s) + for s, e in zip(chunks, chunks[1:] + [len(df)], strict=True) + ] ) outdf = df.apply_chunks( @@ -203,7 +207,7 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): expect_out2 = np.hstack( [ tpb * np.arange(e - s) - for s, e in zip(chunks, chunks[1:] + [len(df)]) + for s, e in zip(chunks, chunks[1:] + [len(df)], strict=True) ] ) @@ -228,7 +232,7 @@ def test_df_apply_rows_incols_mapping(): nelem = 20 def kernel(x, y, z, out1, out2, extra1, extra2): - for i, (a, b, c) in enumerate(zip(x, y, z)): + for i, (a, b, c) in enumerate(zip(x, y, z)): # noqa: B905 out1[i] = extra2 * a - extra1 * b out2[i] = b - extra1 * c @@ -260,7 +264,7 @@ def test_df_apply_chunks_incols_mapping(chunksize): nelem = 20 def kernel(q, p, r, out1, out2, extra1, extra2): - for i, (a, b, c) in enumerate(zip(q, p, r)): + for i, (a, b, c) in enumerate(zip(q, p, r)): # noqa: B905 out1[i] = extra2 * a - extra1 * b + c out2[i] = i diff --git 
a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 328d6fbca7b..ccf4c7de0d8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -348,7 +348,7 @@ def test_axes(data): expected = psr.axes actual = csr.axes - for e, a in zip(expected, actual): + for e, a in zip(expected, actual, strict=True): assert_eq(e, a, exact=False) @@ -1677,7 +1677,7 @@ def test_dataframe_hash_partition_keep_index(keep_index): parts = gdf.partition_by_hash(["key"], nparts=2, keep_index=keep_index) - for exp, got in zip(expected, parts): + for exp, got in zip(expected, parts, strict=True): assert_eq(exp, got) @@ -5724,7 +5724,7 @@ def test_cov_nans(): ) def test_df_sr_binop(psr, colnames, op): data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]] - data = dict(zip(colnames, data)) + data = dict(zip(colnames, data, strict=True)) gsr = cudf.Series.from_pandas(psr).astype("float64") @@ -5774,7 +5774,7 @@ def test_df_sr_binop(psr, colnames, op): def test_df_sr_binop_col_order(op): colnames = [0, 1, 2] data = [[0, 2, 5], [3, None, 5], [6, 7, np.nan]] - data = dict(zip(colnames, data)) + data = dict(zip(colnames, data, strict=True)) gdf = cudf.DataFrame(data) pdf = pd.DataFrame.from_dict(data) @@ -9081,8 +9081,8 @@ def test_update_for_dataframes( ): errors = "ignore" join = "left" - left = dict(zip(left_keys, data_left)) - right = dict(zip(right_keys, data_right)) + left = dict(zip(left_keys, data_left, strict=True)) + right = dict(zip(right_keys, data_right, strict=True)) pdf = pd.DataFrame(left) gdf = cudf.DataFrame(left, nan_as_null=False) @@ -10926,7 +10926,7 @@ def test_dataframe_column_name(name): @pytest.mark.parametrize("names", [["abc", "def"], [1, 2], ["abc", 10]]) def test_dataframe_multiindex_column_names(names): arrays = [["A", "A", "B", "B"], ["one", "two", "one", "two"]] - tuples = list(zip(*arrays)) + tuples = list(zip(*arrays, strict=True)) index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) pdf = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=index) diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index c9967b83235..ce7e80e0dba 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -588,7 +588,9 @@ def test_drop_duplicates_multi_index(): ["one", "two", "one", "two", "one", "two", "one", "two"], ] - idx = pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["a", "b"]) + idx = pd.MultiIndex.from_tuples( + list(zip(*arrays, strict=True)), names=["a", "b"] + ) rng = np.random.default_rng(seed=0) pdf = pd.DataFrame(rng.integers(0, 2, (8, 4)), index=idx) gdf = cudf.DataFrame.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index f332fb37e56..552ac748e3e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -217,7 +217,7 @@ def test_groupby_as_index_multiindex(pdf, gdf, as_index): assert_eq(pdf, gdf) else: # column names don't match - check just the values - for gcol, pcol in zip(gdf, pdf): + for gcol, pcol in zip(gdf, pdf, strict=True): assert_array_equal(gdf[gcol].to_numpy(), pdf[pcol].values) @@ -1916,7 +1916,7 @@ def test_grouping(grouper): gdf = cudf.from_pandas(pdf) for pdf_group, gdf_group in zip( - pdf.groupby(grouper), gdf.groupby(grouper) + pdf.groupby(grouper), gdf.groupby(grouper), strict=True ): assert pdf_group[0] == gdf_group[0] assert_eq(pdf_group[1], gdf_group[1]) @@ -2749,7 
+2749,7 @@ def test_groupby_shift_row_mixed_fill(shift_perc, direction, fill_value): # Pandas does not support specifying different fill_value by column, so we # simulate it column by column expected = pdf.copy() - for col, single_fill in zip(pdf.iloc[:, 1:], fill_value): + for col, single_fill in zip(pdf.iloc[:, 1:], fill_value, strict=True): expected[col] = ( pdf[col] .groupby(pdf["0"]) @@ -3725,7 +3725,8 @@ def expected(self, df, n, take_head, preserve_order): sorted(values_to_sort.tolist(), key=keyfunc), key=keyfunc, ) - ) + ), + strict=True, ) return cudf.DataFrame( {"a": expect_a, "b": expect_b}, index=index diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 9ee55790b73..dbb193019b2 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -522,7 +522,8 @@ def test_empty_df_head_tail_index(n): "one", "two", ], - ] + ], + strict=True, ) ) ), @@ -550,7 +551,8 @@ def test_empty_df_head_tail_index(n): "one", "two", ], - ] + ], + strict=True, ) ) ) diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py index 9a95f0e01ab..60ec93f5040 100644 --- a/python/cudf/cudf/tests/test_join_order.py +++ b/python/cudf/cudf/tests/test_join_order.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. import itertools import operator @@ -64,7 +64,7 @@ def expect_inner(left, right, sort): keys = [] val_x = [] val_y = [] - for k, v in zip(left_key, left_val): + for k, v in zip(left_key, left_val, strict=True): if k not in right_have: continue for i in right_have[k]: @@ -76,7 +76,11 @@ def expect_inner(left, right, sort): # Python sort is stable, so this will preserve input order for # equal items. keys, val_x, val_y = zip( - *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + *sorted( + zip(keys, val_x, val_y, strict=True), + key=operator.itemgetter(0), + ), + strict=True, ) return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) @@ -92,7 +96,7 @@ def expect_left(left, right, sort): keys = [] val_x = [] val_y = [] - for k, v in zip(left_key, left_val): + for k, v in zip(left_key, left_val, strict=True): if k not in right_have: right_vals = [None] else: @@ -107,7 +111,11 @@ def expect_left(left, right, sort): # Python sort is stable, so this will preserve input order for # equal items. keys, val_x, val_y = zip( - *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + *sorted( + zip(keys, val_x, val_y, strict=True), + key=operator.itemgetter(0), + ), + strict=True, ) return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) @@ -122,7 +130,7 @@ def expect_outer(left, right, sort): keys = [] val_x = [] val_y = [] - for k, v in zip(left_key, left_val): + for k, v in zip(left_key, left_val, strict=True): if k not in right_have: right_vals = [None] else: @@ -132,7 +140,7 @@ def expect_outer(left, right, sort): val_x.append(v) val_y.append(rv) left_have = set(left_key) - for k, v in zip(right_key, right_val): + for k, v in zip(right_key, right_val, strict=True): if k not in left_have: keys.append(k) val_x.append(None) @@ -142,7 +150,11 @@ def expect_outer(left, right, sort): # equal items. 
# outer joins are always sorted, but we test both sort values keys, val_x, val_y = zip( - *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + *sorted( + zip(keys, val_x, val_y, strict=True), + key=operator.itemgetter(0), + ), + strict=True, ) return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 6fd2ea24df2..adcd3fff21d 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -354,11 +354,14 @@ def test_json_lines_basic(json_input, engine): can_warn = isinstance(json_input, str) and not json_input.endswith(".json") with expect_warning_if(can_warn): cu_df = cudf.read_json(json_input, engine=engine, lines=True) + # io types must seek to the beginning before you can read again + if hasattr(json_input, "seek"): + json_input.seek(0) with expect_warning_if(can_warn): pd_df = pd.read_json(json_input, lines=True) assert all(cu_df.dtypes == ["int64", "int64", "int64"]) - for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): + for cu_col, pd_col in zip(cu_df.columns, pd_df.columns, strict=True): assert str(cu_col) == str(pd_col) np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) @@ -392,7 +395,7 @@ def test_json_lines_multiple(tmpdir, json_input, engine): pd_df = pd.concat([pdf, pdf]) assert all(cu_df.dtypes == ["int64", "int64", "int64"]) - for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): + for cu_col, pd_col in zip(cu_df.columns, pd_df.columns, strict=True): assert str(cu_col) == str(pd_col) np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) @@ -430,7 +433,7 @@ def test_json_read_directory(tmpdir, json_input, engine): pd_df = pd.concat([pdf, pdf, pdf]) assert all(cu_df.dtypes == ["int64", "int64", "int64"]) - for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): + for cu_col, pd_col in zip(cu_df.columns, pd_df.columns, strict=True): assert str(cu_col) == str(pd_col) np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 9648673273e..cadd5c80a54 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -185,7 +185,7 @@ def test_take(data, idx): ps = pd.Series(data) gs = cudf.from_pandas(ps) - expected = pd.Series(zip(ps, idx)).map( + expected = pd.Series(zip(ps, idx, strict=True)).map( lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None ) got = gs.list.take(idx) diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index ae5f1e1c90c..842b40a6d37 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -211,7 +211,7 @@ def test_multiindex(): ], ) def test_multiindex_tuples(testarr): - tuples = list(zip(*testarr[0])) + tuples = list(zip(*testarr[0], strict=True)) index = MultiIndex.from_tuples(tuples, names=testarr[1]) index_pd = pd.MultiIndex.from_tuples(tuples, names=testarr[1]) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index c8c4923a4e3..b612e20a17f 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -577,7 +577,7 @@ def test_multiindex_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): def test_multiindex_iloc_scalar(): arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] - tuples = list(zip(*arrays)) + tuples = list(zip(*arrays, strict=True)) 
idx = cudf.MultiIndex.from_tuples(tuples) gdf = cudf.DataFrame( {"first": cp.random.rand(4), "second": cp.random.rand(4)} @@ -824,7 +824,7 @@ def test_pickle_roundtrip_multiindex(names): def test_multiindex_index_single_row(): arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] - tuples = list(zip(*arrays)) + tuples = list(zip(*arrays, strict=True)) idx = cudf.MultiIndex.from_tuples(tuples) gdf = cudf.DataFrame( {"first": cp.random.rand(4), "second": cp.random.rand(4)} diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index f15e48dfdc8..b85882a79f5 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -41,7 +41,11 @@ def test_get_dummies(data, index, dtype): def test_onehot_get_dummies_multicol(n_cols): n_categories = 5 data = dict( - zip(ascii_lowercase, (np.arange(n_categories) for _ in range(n_cols))) + zip( + ascii_lowercase[:n_cols], + (np.arange(n_categories) for _ in range(n_cols)), + strict=True, + ) ) gdf = cudf.DataFrame(data) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 94eb2c794a5..2ab48f7ecce 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -421,7 +421,7 @@ def num_row_groups(rows, group_size): assert num_columns == len(pdf.columns) assert num_rows == len(pdf.index) assert row_groups == num_row_groups(num_rows, row_group_size) - for a, b in zip(col_names, pdf.columns): + for a, b in zip(col_names, pdf.columns, strict=True): assert a == b diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 08226dd7f6d..4f5c94c30f0 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -791,5 +791,5 @@ def test_scatter_by_map(): with cudf.option_context("spill", True): df = cudf.DataFrame(data) result = df.scatter_by_map(data) - for i, res in zip(data, result): + for i, res in zip(data, result, strict=True): assert_eq(res, cudf.DataFrame([i], index=[i])) diff --git a/python/cudf/cudf/tests/testing/test_assert_frame_equal.py b/python/cudf/cudf/tests/testing/test_assert_frame_equal.py index 719dc0b1e60..a25aab458cb 100644 --- a/python/cudf/cudf/tests/testing/test_assert_frame_equal.py +++ b/python/cudf/cudf/tests/testing/test_assert_frame_equal.py @@ -41,7 +41,7 @@ def test_basic_assert_frame_equal( p_left["c"] = np.array(data, dtype="int64") p_right = pd.DataFrame(index=index) - for dtype, name in zip(rdtype, rname): + for dtype, name in zip(rdtype, rname, strict=True): p_right[name] = np.array(data, dtype=dtype) left = cudf.from_pandas(p_left) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index b6660b4b1cc..2e383d590d5 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -195,7 +195,9 @@ def _store_func( f.write(f"{num_outer_bins}\n") f.writelines( f"{coeff} {offset}\n" - for coeff, offset in zip(inner_table_coeffs, offsets_into_ht) + for coeff, offset in zip( + inner_table_coeffs, offsets_into_ht, strict=True + ) ) f.write(f"{len(hash_table)}\n") f.writelines(f"{kv}\n" for kv in hash_table) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 40d54157c45..47e5ecee570 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -2295,9 +2295,10 @@ def _get_remote_bytes_all( zip( *( (r, j, min(j + blocksize, s)) - for r, s in zip(remote_paths, sizes) 
+ for r, s in zip(remote_paths, sizes, strict=True) for j in range(0, s, blocksize) - ) + ), + strict=True, ), ) @@ -2306,7 +2307,9 @@ def _get_remote_bytes_all( # Construct local byte buffers # (Need to make sure path offsets are ordered correctly) - unique_count = dict(zip(*np.unique(paths, return_counts=True))) + unique_count = dict( + zip(*np.unique(paths, return_counts=True), strict=True) + ) offset = np.cumsum([0] + [unique_count[p] for p in remote_paths]) buffers = [ functools.reduce(operator.add, chunks[offset[i] : offset[i + 1]]) @@ -2340,7 +2343,7 @@ def _get_remote_bytes_parquet( ) buffers = [] - for size, path in zip(sizes, remote_paths): + for size, path in zip(sizes, remote_paths, strict=True): path_data = data[path] buf = np.empty(size, dtype="b") for range_offset in path_data.keys(): @@ -2393,7 +2396,7 @@ def _update_col_struct_field_names( if col.children: children = list(col.children) for i, (child, names) in enumerate( - zip(children, child_names.values()) + zip(children, child_names.values(), strict=True) ): children[i] = _update_col_struct_field_names(child, names) col.set_base_children(tuple(children)) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 67c219056ca..4db0129bbae 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -791,14 +791,14 @@ def test_chunked_json_reader(tmpdir, data): pd.read_json(file_path, lines=True, chunksize=1) as pd_reader, xpd.read_json(file_path, lines=True, chunksize=1) as xpd_reader, ): - for pd_chunk, xpd_chunk in zip(pd_reader, xpd_reader): + for pd_chunk, xpd_chunk in zip(pd_reader, xpd_reader, strict=True): tm.assert_equal(pd_chunk, xpd_chunk) with ( pd.read_json(StringIO(data), lines=True, chunksize=1) as pd_reader, xpd.read_json(StringIO(data), lines=True, chunksize=1) as xpd_reader, ): - for pd_chunk, xpd_chunk in zip(pd_reader, xpd_reader): + for pd_chunk, xpd_chunk in zip(pd_reader, xpd_reader, strict=True): tm.assert_equal(pd_chunk, xpd_chunk) @@ -818,14 +818,14 @@ def test_chunked_csv_reader(tmpdir, data): pd.read_csv(file_path, chunksize=1) as pd_reader, xpd.read_csv(file_path, chunksize=1) as xpd_reader, ): - for pd_chunk, xpd_chunk in zip(pd_reader, xpd_reader): + for pd_chunk, xpd_chunk in zip(pd_reader, xpd_reader, strict=True): tm.assert_equal(pd_chunk, xpd_chunk, check_index_type=False) with ( pd.read_json(StringIO(data), lines=True, chunksize=1) as pd_reader, xpd.read_json(StringIO(data), lines=True, chunksize=1) as xpd_reader, ): - for pd_chunk, xpd_chunk in zip(pd_reader, xpd_reader): + for pd_chunk, xpd_chunk in zip(pd_reader, xpd_reader, strict=True): tm.assert_equal(pd_chunk, xpd_chunk, check_index_type=False) diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index f4426e09ef8..13db59f04cb 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -65,7 +65,7 @@ def test_profiler(): "np.isclose", "pd.Timestamp", ] - for line_stats, call in zip(per_line_stats, calls): + for line_stats, call in zip(per_line_stats, calls, strict=True): # Check that the expected function calls were recorded. 
assert call in line_stats[1] # No CPU time diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py index c42ecf29fcc..ea210ecf3bb 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py @@ -11,7 +11,7 @@ def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0): if isinstance(expect, (tuple, list)): assert len(expect) == len(got) - for e, g in zip(expect, got): + for e, g in zip(expect, got, strict=True): assert_catboost_equal(e, g, rtol, atol) elif isinstance(expect, np.ndarray): np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py index 92a6513b7b1..5590db9dc1a 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py @@ -30,7 +30,7 @@ def assert_cuml_equal(expect, got): np.testing.assert_allclose(expect, got) elif isinstance(expect, tuple) and isinstance(got, tuple): assert len(expect) == len(got) - for e, g in zip(expect, got): + for e, g in zip(expect, got, strict=True): assert_cuml_equal(e, g) elif isinstance(expect, pd.DataFrame): assert pd.testing.assert_frame_equal(expect, got) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py index 6a33666790d..1a8ff355d4d 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py @@ -13,7 +13,7 @@ def assert_plots_equal(expect, got): if isinstance(expect, Axes) and isinstance(got, Axes): for expect_ch, got_ch in zip( - expect.get_children(), got.get_children() + expect.get_children(), got.get_children(), strict=True ): assert type(expect_ch) is type(got_ch) if isinstance(expect_ch, Line2D): diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py index 02b2b1b9997..4ae17782533 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py @@ -12,7 +12,7 @@ def assert_plots_equal(expect, got): if isinstance(expect, Axes) and isinstance(got, Axes): for expect_ch, got_ch in zip( - expect.get_children(), got.get_children() + expect.get_children(), got.get_children(), strict=True ): assert type(expect_ch) is type(got_ch) if isinstance(expect_ch, Line2D): diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py index ba98273404d..9f67c5cb307 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -17,7 +17,7 @@ def xgboost_assert_equal(expect, got, rtol: float = 1e-7, atol: float = 0.0): if isinstance(expect, (tuple, list)): assert len(expect) == len(got) - for e, g in zip(expect, got): + for e, g in zip(expect, got, strict=True): xgboost_assert_equal(e, g, rtol, atol) elif isinstance(expect, scipy.sparse.csr_matrix): np.testing.assert_allclose(expect.data, got.data, rtol=rtol, atol=atol) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index bd0bf9087a0..8e770112a67 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -106,7 +106,9 @@ filterwarnings = [ # PerformanceWarning from cupy warming up the JIT cache "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning", # Ignore numba PEP 456 warning specific to arm machines - "ignore:FNV hashing is not implemented in Numba.*:UserWarning" + "ignore:FNV hashing is not implemented in Numba.*:UserWarning", + # Allow running UDF tests on older architectures + "ignore:.*NVRTC log messages whilst compiling.*:UserWarning", ] markers = [ "spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`", diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 678384d78e6..097319b8b18 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -438,6 +438,7 @@ def group_split_cudf(df, c, k, ignore_index=False): map_size=k, keep_index=not ignore_index, ), + strict=True, ) ) diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index 5de28751912..6a6d54db615 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -101,7 +101,7 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): ) sources = [] - for path, n in zip(paths, nstripes_per_file): + for path, n in zip(paths, nstripes_per_file, strict=True): for stripe in ( range(n) if filters is None @@ -186,7 +186,7 @@ def to_orc( dwrite = delayed(write_orc_partition) parts = [ dwrite(d, path, fs, filename, compression=compression) - for d, filename in zip(df.to_delayed(), filenames) + for d, filename in zip(df.to_delayed(), filenames, strict=True) ] if compute: diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 13407f5d56f..d5bd0b5047b 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -131,15 +131,7 @@ def test_categorical_basic(data): pdsr.cat.codes.values, result.cat.codes.values_host ) - string = str(result) - expect_str = """ -0 a -1 a -2 b -3 c -4 a -""" - assert all(x == y for x, y in zip(string.split(), expect_str.split())) + assert str(result) == str(pdsr) with dask.config.set({"dataframe.convert-string": False}): df = DataFrame() df["a"] = ["xyz", "abc", "def"] * 10 diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 3d4c26d36b3..ec6f25747b4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -136,7 +136,7 @@ def _fragmented_gdf(df, nsplit): subdivsize = n // nsplit starts = [i * subdivsize for i in range(nsplit)] ends = starts[1:] + [None] - frags = [df[s:e] for s, e in zip(starts, ends)] + frags = [df[s:e] for s, e in zip(starts, ends, strict=True)] return frags diff --git 
a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index af57af6e694..844f21e56ba 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -376,14 +376,15 @@ cdef class TableWithMetadata: def child_names(self): """ Return a dictionary mapping the names of columns with children - to the names of their child columns + to the names of their child columns. Columns without children + get an empty dictionary. """ return TableWithMetadata._parse_col_names(self.metadata.schema_info) @staticmethod cdef dict _parse_col_names(vector[column_name_info] infos): - cdef dict child_names = dict() - cdef dict names = dict() + cdef dict child_names + cdef dict names = {} for col_info in infos: child_names = TableWithMetadata._parse_col_names(col_info.children) names[col_info.name.decode()] = child_names diff --git a/python/pylibcudf/tests/common/utils.py b/python/pylibcudf/tests/common/utils.py index a844b74eb26..1ec0afed6cb 100644 --- a/python/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/tests/common/utils.py @@ -163,7 +163,7 @@ def _is_supported_for_pc_is_nan(arr_type): return _is_supported_for_pc_is_nan(arr_type.value_type) return True - for lh_arr, rh_arr in zip(lhs, rhs): + for lh_arr, rh_arr in zip(lhs, rhs, strict=True): # pc.is_nan does not support nested list # with float (eg. list>) if not _is_supported_for_pc_is_nan(lh_arr.type): @@ -213,7 +213,9 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: """Verify that a pylibcudf table and PyArrow table are equal.""" assert plc_table.shape() == pa_table.shape - for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): + for plc_col, pa_col in zip( + plc_table.columns(), pa_table.columns, strict=True + ): assert_column_eq(pa_col, plc_col) @@ -235,7 +237,9 @@ def assert_table_and_meta_eq( if not check_types_if_empty and plc_table.num_rows() == 0: return - for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): + for plc_col, pa_col in zip( + plc_table.columns(), pa_table.columns, strict=True + ): assert_column_eq(pa_col, plc_col, check_field_nullability) # Check column name equality @@ -263,7 +267,9 @@ def nesting_level(typ) -> tuple[int, int]: list_, struct = nesting_level(typ.value_type) return list_ + 1, struct elif isinstance(typ, pa.StructType): - lists, structs = map(max, zip(*(nesting_level(t.type) for t in typ))) + lists, structs = map( + max, zip(*(nesting_level(t.type) for t in typ), strict=True) + ) return lists, structs + 1 else: return 0, 0 diff --git a/python/pylibcudf/tests/conftest.py b/python/pylibcudf/tests/conftest.py index 4840fd0a6ad..24f9d79aa7f 100644 --- a/python/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/tests/conftest.py @@ -119,7 +119,10 @@ def _generate_nested_data(typ): ) elif isinstance(typ, pa.ListType): pa_array = pa.array( - [list(row_vals) for row_vals in zip(rand_arrs[0])], + [ + list(row_vals) + for row_vals in zip(rand_arrs[0], strict=True) + ], type=typ, ) child_colnames.append(("", grandchild_colnames)) diff --git a/python/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/tests/io/test_avro.py index 58faa7d6ad7..13c2d37a8e9 100644 --- a/python/pylibcudf/tests/io/test_avro.py +++ b/python/pylibcudf/tests/io/test_avro.py @@ -89,7 +89,8 @@ def _make_avro_table(avro_dtypes, avro_dtype_data, nullable=False): ) records = [ - {"prop1": val1, "prop2": val2} for val1, val2 in zip(*avro_dtype_data) + {"prop1": val1, "prop2": val2} + for val1, val2 in zip(*avro_dtype_data, strict=True) ] 
buffer = io.BytesIO() diff --git a/python/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/tests/test_binaryops.py index dfe90a6723f..b25c5d22083 100644 --- a/python/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/tests/test_binaryops.py @@ -94,7 +94,7 @@ def inner(x, y): return None return func(x, y) - return pa.array([inner(x, y) for x, y in zip(x, y)]) + return pa.array([inner(x, y) for x, y in zip(x, y, strict=True)]) return wrapper diff --git a/python/pylibcudf/tests/test_copying.py b/python/pylibcudf/tests/test_copying.py index 29a5df4ebda..d636b10d536 100644 --- a/python/pylibcudf/tests/test_copying.py +++ b/python/pylibcudf/tests/test_copying.py @@ -216,7 +216,7 @@ def _pyarrow_boolean_mask_scatter_table(source, mask, target_table): return pa.table( [ _pyarrow_boolean_mask_scatter_column(r, mask, v) - for v, r in zip(target_table, source) + for v, r in zip(target_table, source, strict=True) ], [""] * target_table.num_columns, ) @@ -448,7 +448,9 @@ def test_empty_like_table(source_table): _, plc_source_table = source_table result = plc.copying.empty_like(plc_source_table) assert result.num_columns() == plc_source_table.num_columns() - for icol, rcol in zip(plc_source_table.columns(), result.columns()): + for icol, rcol in zip( + plc_source_table.columns(), result.columns(), strict=True + ): assert rcol.type() == icol.type() @@ -664,7 +666,7 @@ def test_slice_column(target_column): upper_bounds = bounds[1::2] lower_bounds = bounds[::2] result = plc.copying.slice(plc_target_column, bounds) - for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): + for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result, strict=True): assert_column_eq(pa_target_column[lb:ub], slice_) @@ -692,17 +694,23 @@ def test_slice_table(target_table): upper_bounds = bounds[1::2] lower_bounds = bounds[::2] result = plc.copying.slice(plc_target_table, bounds) - for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): + for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result, strict=True): assert_table_eq(pa_target_table[lb:ub], slice_) def test_split_column(target_column): upper_bounds = [1, 3, 5] - lower_bounds = [0] + upper_bounds[:-1] + lower_bounds = [0, *upper_bounds] + upper_bounds_extended = [*upper_bounds, None] # None means to the end pa_target_column, plc_target_column = target_column result = plc.copying.split(plc_target_column, upper_bounds) - for lb, ub, split in zip(lower_bounds, upper_bounds, result): - assert_column_eq(pa_target_column[lb:ub], split) + for lb, ub, split in zip( + lower_bounds, upper_bounds_extended, result, strict=True + ): + if ub is None: + assert_column_eq(pa_target_column[lb:], split) + else: + assert_column_eq(pa_target_column[lb:ub], split) def test_split_column_decreasing(target_column): @@ -721,10 +729,16 @@ def test_split_table(target_table): pa_target_table, plc_target_table = target_table upper_bounds = [1, 3, 5] - lower_bounds = [0] + upper_bounds[:-1] + lower_bounds = [0, *upper_bounds] + upper_bounds_extended = [*upper_bounds, None] # None means to the end result = plc.copying.split(plc_target_table, upper_bounds) - for lb, ub, split in zip(lower_bounds, upper_bounds, result): - assert_table_eq(pa_target_table[lb:ub], split) + for lb, ub, split in zip( + lower_bounds, upper_bounds_extended, result, strict=True + ): + if ub is None: + assert_table_eq(pa_target_table[lb:], split) + else: + assert_table_eq(pa_target_table[lb:ub], split) def test_copy_if_else_column_column(target_column, mask, source_scalar): diff --git 
a/python/pylibcudf/tests/test_lists.py b/python/pylibcudf/tests/test_lists.py index 1ceaa4bee80..718d053085b 100644 --- a/python/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/tests/test_lists.py @@ -52,7 +52,9 @@ def test_concatenate_rows(test_data): got = plc.lists.concatenate_rows(plc_tbl) - expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data[0])]) + expect = pa.array( + [pair[0] + pair[1] for pair in zip(*test_data[0], strict=True)] + ) assert_column_eq(expect, got) diff --git a/python/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/tests/test_nvtext_generate_ngrams.py index 30724e40698..ffa4b89423c 100644 --- a/python/pylibcudf/tests/test_nvtext_generate_ngrams.py +++ b/python/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -48,7 +48,7 @@ def test_hash_character_ngrams(input_col, ngram, seed): pa_result = plc.interop.to_arrow(result) assert all( len(got) == max(0, len(s.as_py()) - ngram + 1) - for got, s in zip(pa_result, input_col) + for got, s in zip(pa_result, input_col, strict=True) ) assert pa_result.type == pa.list_( pa.field("element", pa.uint32(), nullable=False) diff --git a/python/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/tests/test_nvtext_jaccard.py index 3f7b0682d31..98725f40c16 100644 --- a/python/pylibcudf/tests/test_nvtext_jaccard.py +++ b/python/pylibcudf/tests/test_nvtext_jaccard.py @@ -31,7 +31,7 @@ def jaccard_index(s1, s2, width): expect = pa.array( [ jaccard_index(s1.as_py(), s2.as_py(), width) - for s1, s2 in zip(input1, input2) + for s1, s2 in zip(input1, input2, strict=True) ], type=pa.float32(), ) diff --git a/python/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/tests/test_nvtext_minhash.py index 6315488a205..bc6b569dbc6 100644 --- a/python/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/tests/test_nvtext_minhash.py @@ -29,7 +29,10 @@ def test_minhash(minhash_input_data, width): width, ) pa_result = plc.interop.to_arrow(result) - assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert all( + len(got) == len(seeds) + for got, s in zip(pa_result, input_arr, strict=True) + ) assert pa_result.type == pa.list_( pa.field("element", seed_type, nullable=False) ) @@ -75,7 +78,10 @@ def test_minhash_ngrams(minhash_ngrams_input_data, ngrams): plc.Column.from_arrow(ab), ) pa_result = plc.interop.to_arrow(result) - assert all(len(got) == len(ab) for got, s in zip(pa_result, input_arr)) + assert all( + len(got) == len(ab) + for got, s in zip(pa_result, input_arr, strict=True) + ) assert pa_result.type == pa.list_( pa.field("element", seed_type, nullable=False) ) diff --git a/python/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/tests/test_quantiles.py index 68b7c4203dc..1c72637fd2d 100644 --- a/python/pylibcudf/tests/test_quantiles.py +++ b/python/pylibcudf/tests/test_quantiles.py @@ -131,7 +131,7 @@ def _pyarrow_quantiles( [ (name, order_mapper[order]) for name, order in zip( - pa_tbl_data.column_names, column_order + pa_tbl_data.column_names, column_order, strict=True ) ], null_placement="at_start" diff --git a/python/pylibcudf/tests/test_reshape.py b/python/pylibcudf/tests/test_reshape.py index 68f7307fda7..9ee7e389335 100644 --- a/python/pylibcudf/tests/test_reshape.py +++ b/python/pylibcudf/tests/test_reshape.py @@ -20,7 +20,7 @@ def test_interleave_columns(reshape_data): raw_data, reshape_plc_tbl = reshape_data got = plc.reshape.interleave_columns(reshape_plc_tbl) - interleaved_data = [pa.array(pair) for pair in zip(*raw_data)] + interleaved_data = 
[pa.array(pair) for pair in zip(*raw_data, strict=True)] expect = pa.concat_arrays(interleaved_data) diff --git a/python/pylibcudf/tests/test_string_find.py b/python/pylibcudf/tests/test_string_find.py index 0489385b33f..633e9f31512 100644 --- a/python/pylibcudf/tests/test_string_find.py +++ b/python/pylibcudf/tests/test_string_find.py @@ -142,8 +142,7 @@ def handle_none(st, target): [ handle_none(elem, target) for elem, target in zip( - pa_data_col.to_pylist(), - pa_target_col.to_pylist(), + pa_data_col.to_pylist(), pa_target_col.to_pylist(), strict=True ) ], type=pa.bool_(), @@ -159,8 +158,7 @@ def test_find_column(data_col, target_col): [ elem.find(target) if not (elem is None or target is None) else None for elem, target in zip( - pa_data_col.to_pylist(), - pa_target_col.to_pylist(), + pa_data_col.to_pylist(), pa_target_col.to_pylist(), strict=True ) ], type=pa.int32(), diff --git a/python/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/tests/test_string_replace.py index f75ce722f38..2fc0b528656 100644 --- a/python/pylibcudf/tests/test_string_replace.py +++ b/python/pylibcudf/tests/test_string_replace.py @@ -108,7 +108,7 @@ def test_replace_col(data_col, col_repl_target, col_repl): # for targets/repls, so let's implement our own in python def replace_list(elem, targets, repls): - for target, repl in zip(targets, repls): + for target, repl in zip(targets, repls, strict=True): res = elem.replace(target, repl) if res != elem: return res diff --git a/python/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/tests/test_string_replace_re.py index 6aa51d4324c..842aea69a8b 100644 --- a/python/pylibcudf/tests/test_string_replace_re.py +++ b/python/pylibcudf/tests/test_string_replace_re.py @@ -50,7 +50,7 @@ def test_replace_re_list_str_columns(flags): flags=flags, ) expect = arr - for pat, repl in zip(pats, repls): + for pat, repl in zip(pats, repls, strict=True): expect = pc.replace_substring_regex( expect, pat, diff --git a/python/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/tests/test_string_slice.py index 576c78e8c12..60b38513d2e 100644 --- a/python/pylibcudf/tests/test_string_slice.py +++ b/python/pylibcudf/tests/test_string_slice.py @@ -87,6 +87,7 @@ def slice_string(st, start, stop): pa_col.to_pylist(), pa_starts_col.to_pylist(), pa_stops_col.to_pylist(), + strict=True, ) ], type=pa.string(), From 8013a532d792337b983e9b891d206a9fb0288e9d Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 6 Aug 2025 16:38:12 -0500 Subject: [PATCH 074/366] Enable casting integer dtypes to `pl.Datetime` via `cudf-polars` (#19607) Part of https://github.com/rapidsai/cudf/issues/17060 This PR adds the ability to view an integral column as a datetime type in `cudf-polars`. 
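For illustration, a minimal sketch of what this enables (not part of the patch itself; the frame, column name, and values here are hypothetical, chosen to mirror the new test added below). Integers are interpreted as counts since the UNIX epoch in the target time unit, and `UInt64` stays unsupported as it is not overflow safe:

```python
import polars as pl

# Hypothetical input: a lazy frame with an Int32 column named "data".
q = pl.LazyFrame({"data": pl.Series([0, 1, 100], dtype=pl.Int32)}).select(
    pl.col("data").cast(pl.Datetime("ms"))
)
# Executed by cudf-polars on the GPU:
# 0 -> 1970-01-01 00:00:00, 1 -> epoch + 1 ms, 100 -> epoch + 100 ms
print(q.collect(engine="gpu"))
```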
Authors: - https://github.com/brandon-b-miller Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19607 --- .../cudf_polars/containers/column.py | 14 +++++++ .../cudf_polars/cudf_polars/utils/dtypes.py | 6 +++ .../tests/expressions/test_datetime_basic.py | 42 +++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 4d7a9992f97..31e4c66a02c 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -293,6 +293,20 @@ def astype(self, dtype: DataType) -> Column: or self.obj.type().id() == plc.TypeId.STRING ): return Column(self._handle_string_cast(plc_dtype), dtype=dtype) + elif plc.traits.is_integral_not_bool( + self.obj.type() + ) and plc.traits.is_timestamp(plc_dtype): + upcasted = plc.unary.cast(self.obj, plc.DataType(plc.TypeId.INT64)) + result = plc.column.Column( + plc_dtype, + upcasted.size(), + upcasted.data(), + upcasted.null_mask(), + upcasted.null_count(), + upcasted.offset(), + upcasted.children(), + ) + return Column(result, dtype=dtype).sorted_like(self) else: result = Column(plc.unary.cast(self.obj, plc_dtype), dtype=dtype) if is_order_preserving_cast(self.obj.type(), plc_dtype): diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index a38652f84d9..23a94cf2a67 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -63,6 +63,12 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: and not from_is_empty and is_numeric_not_bool(from_) ) + or ( + plc.traits.is_integral_not_bool(from_) + and from_.id() != plc.TypeId.UINT64 # not overflow safe + and not to_is_empty + and plc.traits.is_timestamp(to) + ) ) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 52071591948..5d98165a419 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -11,6 +11,7 @@ from cudf_polars.dsl.expr import TemporalFunction from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ -344,3 +345,44 @@ def test_datetime_cast_time_unit_duration(dtype, time_unit): q = df.select(pl.col("date").dt.cast_time_unit(time_unit).alias("time_unit_ms")) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "datetime_dtype", + [ + pl.Datetime("ms"), + pl.Datetime("us"), + pl.Datetime("ns"), + ], +) +@pytest.mark.parametrize( + "integer_dtype", + [ + pl.Int64(), + pl.UInt64(), + pl.Int32(), + pl.UInt32(), + pl.Int16(), + pl.UInt16(), + pl.Int8(), + pl.UInt8(), + ], +) +def test_datetime_from_integer(datetime_dtype, integer_dtype): + values = [ + 0, + 1, + 100, + pl.select(integer_dtype.max()).item(), + pl.select(integer_dtype.min()).item(), + ] + df = pl.LazyFrame({"data": pl.Series(values, dtype=integer_dtype)}) + q = df.select(pl.col("data").cast(datetime_dtype).alias("datetime_from_int")) + if integer_dtype == pl.UInt64(): + assert_collect_raises( + q, + cudf_except=pl.exceptions.ComputeError, + polars_except=pl.exceptions.InvalidOperationError, + ) + else: + assert_gpu_result_equal(q) From 2f2a0bd99004433ffb598e7ffc53d75e91bb5411 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Aug 2025 12:09:34 
-1000 Subject: [PATCH 075/366] Add streams to all modules with 4-5 functions (#19609) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19609 --- .../pylibcudf/pylibcudf/column_factories.pxd | 8 +- .../pylibcudf/pylibcudf/column_factories.pyi | 12 +-- .../pylibcudf/pylibcudf/column_factories.pyx | 54 +++++++++---- python/pylibcudf/pylibcudf/filling.pxd | 10 ++- python/pylibcudf/pylibcudf/filling.pyi | 24 ++++-- python/pylibcudf/pylibcudf/filling.pyx | 48 ++++++++--- .../libcudf/column/column_factories.pxd | 33 +++++--- .../pylibcudf/pylibcudf/libcudf/filling.pxd | 15 +++- .../pylibcudf/pylibcudf/libcudf/replace.pxd | 27 ++++--- .../pylibcudf/pylibcudf/libcudf/rolling.pxd | 11 ++- python/pylibcudf/pylibcudf/replace.pxd | 17 +++- python/pylibcudf/pylibcudf/replace.pyi | 10 ++- python/pylibcudf/pylibcudf/replace.pyx | 80 +++++++++++++++---- python/pylibcudf/pylibcudf/rolling.pxd | 13 +++ python/pylibcudf/pylibcudf/rolling.pyi | 5 ++ python/pylibcudf/pylibcudf/rolling.pyx | 26 ++++-- 16 files changed, 301 insertions(+), 92 deletions(-) diff --git a/python/pylibcudf/pylibcudf/column_factories.pxd b/python/pylibcudf/pylibcudf/column_factories.pxd index d556085ab64..00a576dea45 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/column_factories.pxd @@ -1,5 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.libcudf.types cimport mask_state +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .types cimport DataType, size_type, type_id @@ -22,28 +23,33 @@ cpdef Column make_numeric_column( DataType type_, size_type size, MaskArg mask, + Stream stream = *, ) cpdef Column make_fixed_point_column( DataType type_, size_type size, MaskArg mask, + Stream stream = *, ) cpdef Column make_timestamp_column( DataType type_, size_type size, MaskArg mask, + Stream stream = *, ) cpdef Column make_duration_column( DataType type_, size_type size, MaskArg mask, + Stream stream = *, ) cpdef Column make_fixed_width_column( DataType type_, size_type size, MaskArg mask, + Stream stream = *, ) diff --git a/python/pylibcudf/pylibcudf/column_factories.pyi b/python/pylibcudf/pylibcudf/column_factories.pyi index c87fe423acb..58556d580df 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyi +++ b/python/pylibcudf/pylibcudf/column_factories.pyi @@ -1,20 +1,22 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.types import DataType, MaskState, TypeId def make_empty_column(type_or_id: DataType | TypeId) -> Column: ... def make_numeric_column( - type_: DataType, size: int, mstate: MaskState + type_: DataType, size: int, mstate: MaskState, stream: Stream | None = None ) -> Column: ... def make_fixed_point_column( - type_: DataType, size: int, mstate: MaskState + type_: DataType, size: int, mstate: MaskState, stream: Stream | None = None ) -> Column: ... def make_timestamp_column( - type_: DataType, size: int, mstate: MaskState + type_: DataType, size: int, mstate: MaskState, stream: Stream | None = None ) -> Column: ... def make_duration_column( - type_: DataType, size: int, mstate: MaskState + type_: DataType, size: int, mstate: MaskState, stream: Stream | None = None ) -> Column: ... 
def make_fixed_width_column( - type_: DataType, size: int, mstate: MaskState + type_: DataType, size: int, mstate: MaskState, stream: Stream | None = None ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index c4969a7f502..cec01077425 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column @@ -11,10 +11,12 @@ from pylibcudf.libcudf.column.column_factories cimport ( make_timestamp_column as cpp_make_timestamp_column, ) from pylibcudf.libcudf.types cimport mask_state, size_type +from rmm.pylibrmm.stream cimport Stream from .types cimport DataType, type_id from .types import MaskState, TypeId +from .utils cimport _get_stream __all__ = [ @@ -69,7 +71,8 @@ cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): cpdef Column make_numeric_column( DataType type_, size_type size, - MaskArg mstate + MaskArg mstate, + Stream stream=None ): """Creates an empty numeric column. @@ -88,19 +91,23 @@ cpdef Column make_numeric_column( state = mstate else: raise TypeError("Invalid mask argument") + stream = _get_stream(stream) + with nogil: result = cpp_make_numeric_column( type_.c_obj, size, - state + state, + stream.view() ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef Column make_fixed_point_column( DataType type_, size_type size, - MaskArg mstate + MaskArg mstate, + Stream stream=None ): cdef unique_ptr[column] result @@ -115,20 +122,24 @@ cpdef Column make_fixed_point_column( state = mstate else: raise TypeError("Invalid mask argument") + stream = _get_stream(stream) + with nogil: result = cpp_make_fixed_point_column( type_.c_obj, size, - state + state, + stream.view() ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef Column make_timestamp_column( DataType type_, size_type size, - MaskArg mstate + MaskArg mstate, + Stream stream=None ): cdef unique_ptr[column] result @@ -143,20 +154,24 @@ cpdef Column make_timestamp_column( state = mstate else: raise TypeError("Invalid mask argument") + stream = _get_stream(stream) + with nogil: result = cpp_make_timestamp_column( type_.c_obj, size, - state + state, + stream.view() ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef Column make_duration_column( DataType type_, size_type size, - MaskArg mstate + MaskArg mstate, + Stream stream=None ): cdef unique_ptr[column] result @@ -171,20 +186,24 @@ cpdef Column make_duration_column( state = mstate else: raise TypeError("Invalid mask argument") + stream = _get_stream(stream) + with nogil: result = cpp_make_duration_column( type_.c_obj, size, - state + state, + stream.view() ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef Column make_fixed_width_column( DataType type_, size_type size, - MaskArg mstate + MaskArg mstate, + Stream stream=None ): cdef unique_ptr[column] result @@ -199,11 +218,14 @@ cpdef Column make_fixed_width_column( state = mstate else: raise TypeError("Invalid mask argument") + stream = _get_stream(stream) + with nogil: result = cpp_make_fixed_width_column( type_.c_obj, size, - state + state, + stream.view() ) - return 
Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) diff --git a/python/pylibcudf/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd index 56aef086e1b..4a55068bd43 100644 --- a/python/pylibcudf/pylibcudf/filling.pxd +++ b/python/pylibcudf/pylibcudf/filling.pxd @@ -1,5 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.libcudf.types cimport size_type +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .scalar cimport Scalar @@ -14,6 +15,7 @@ cpdef Column fill( size_type begin, size_type end, Scalar value, + Stream stream = *, ) cpdef void fill_in_place( @@ -21,21 +23,25 @@ cpdef void fill_in_place( size_type c_begin, size_type c_end, Scalar value, + Stream stream = *, ) cpdef Column sequence( size_type size, Scalar init, Scalar step, + Stream stream = *, ) cpdef Table repeat( Table input_table, - ColumnOrSize count + ColumnOrSize count, + Stream stream = *, ) cpdef Column calendrical_month_sequence( size_type n, Scalar init, size_type months, + Stream stream = *, ) diff --git a/python/pylibcudf/pylibcudf/filling.pyi b/python/pylibcudf/pylibcudf/filling.pyi index 0b5e29bdc32..ecbc914d58f 100644 --- a/python/pylibcudf/pylibcudf/filling.pyi +++ b/python/pylibcudf/pylibcudf/filling.pyi @@ -1,17 +1,31 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.table import Table def fill( - destination: Column, begin: int, end: int, value: Scalar + destination: Column, + begin: int, + end: int, + value: Scalar, + stream: Stream | None = None, ) -> Column: ... def fill_in_place( - destination: Column, begin: int, end: int, value: Scalar + destination: Column, + begin: int, + end: int, + value: Scalar, + stream: Stream | None = None, ) -> None: ... -def sequence(size: int, init: Scalar, step: Scalar) -> Column: ... -def repeat(input_table: Table, count: Column | int) -> Table: ... +def sequence( + size: int, init: Scalar, step: Scalar, stream: Stream | None = None +) -> Column: ... +def repeat( + input_table: Table, count: Column | int, stream: Stream | None = None +) -> Table: ... def calendrical_month_sequence( - n: int, init: Scalar, months: int + n: int, init: Scalar, months: int, stream: Stream | None = None ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index ea5b45ff7c2..f8d33adebef 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport unique_ptr @@ -13,10 +13,12 @@ from pylibcudf.libcudf.filling cimport ( ) from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .scalar cimport Scalar from .table cimport Table +from .utils cimport _get_stream __all__ = [ @@ -32,6 +34,7 @@ cpdef Column fill( size_type begin, size_type end, Scalar value, + Stream stream=None, ): """Fill destination column from begin to end with value. 
@@ -56,20 +59,25 @@ cpdef Column fill( """ cdef unique_ptr[column] result + + stream = _get_stream(stream) + with nogil: result = cpp_fill( destination.view(), begin, end, - dereference(( value).c_obj) + dereference(( value).c_obj), + stream.view() ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef void fill_in_place( Column destination, size_type begin, size_type end, Scalar value, + Stream stream=None, ): """Fill destination column in place from begin to end with value. @@ -92,15 +100,18 @@ cpdef void fill_in_place( None """ + stream = _get_stream(stream) + with nogil: cpp_fill_in_place( destination.mutable_view(), begin, end, - dereference(value.c_obj) + dereference(value.c_obj), + stream.view() ) -cpdef Column sequence(size_type size, Scalar init, Scalar step): +cpdef Column sequence(size_type size, Scalar init, Scalar step, Stream stream=None): """Create a sequence column of size ``size`` with initial value ``init`` and step ``step``. @@ -123,18 +134,23 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step): cdef unique_ptr[column] result cdef size_type c_size = size + + stream = _get_stream(stream) + with nogil: result = cpp_sequence( c_size, dereference(init.c_obj), dereference(step.c_obj), + stream.view() ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef Table repeat( Table input_table, - ColumnOrSize count + ColumnOrSize count, + Stream stream=None, ): """Repeat rows of a Table. @@ -160,25 +176,30 @@ cpdef Table repeat( cdef unique_ptr[table] result + stream = _get_stream(stream) + if ColumnOrSize is Column: with nogil: result = cpp_repeat( input_table.view(), - count.view() + count.view(), + stream.view() ) if ColumnOrSize is size_type: with nogil: result = cpp_repeat( input_table.view(), - count + count, + stream.view() ) - return Table.from_libcudf(move(result)) + return Table.from_libcudf(move(result), stream) cpdef Column calendrical_month_sequence( size_type n, Scalar init, size_type months, + Stream stream=None, ): """Fill destination column from begin to end with value. @@ -202,10 +223,13 @@ cpdef Column calendrical_month_sequence( cdef unique_ptr[column] c_result + stream = _get_stream(stream) + with nogil: c_result = cpp_calendrical_month_sequence( n, dereference(init.c_obj), - months + months, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd index 162822d2365..60b48a7bb6a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column @@ -12,65 +12,76 @@ from pylibcudf.libcudf.types cimport ( ) from rmm.librmm.device_buffer cimport device_buffer +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: cdef unique_ptr[column] make_numeric_column( data_type type, size_type size, - mask_state state + mask_state state, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] make_numeric_column( data_type type, size_type size, device_buffer mask, - size_type null_count + size_type null_count, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_point_column( data_type type, size_type size, - mask_state state) except +libcudf_exception_handler + mask_state state, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_point_column( data_type type, size_type size, device_buffer mask, - size_type null_count) except +libcudf_exception_handler + size_type null_count, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] make_timestamp_column( data_type type, size_type size, - mask_state state) except +libcudf_exception_handler + mask_state state, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] make_timestamp_column( data_type type, size_type size, device_buffer mask, - size_type null_count) except +libcudf_exception_handler + size_type null_count, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] make_duration_column( data_type type, size_type size, - mask_state state) except +libcudf_exception_handler + mask_state state, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] make_duration_column( data_type type, size_type size, device_buffer mask, - size_type null_count) except +libcudf_exception_handler + size_type null_count, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_width_column( data_type type, size_type size, - mask_state state) except +libcudf_exception_handler + mask_state state, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_width_column( data_type type, size_type size, device_buffer mask, - size_type null_count) except +libcudf_exception_handler + size_type null_count, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] make_column_from_scalar( const scalar& s, diff --git a/python/pylibcudf/pylibcudf/libcudf/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/filling.pxd index d9ae573d23b..ee8e844a2a4 100644 --- a/python/pylibcudf/pylibcudf/libcudf/filling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/filling.pxd @@ -11,6 +11,7 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: @@ -18,34 +19,40 @@ cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: const column_view & input, size_type begin, size_type end, - const scalar & value + const scalar & value, + cuda_stream_view stream ) except +libcudf_exception_handler cdef void fill_in_place( 
const mutable_column_view & destination, size_type begin, size_type end, - const scalar & value + const scalar & value, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] repeat( const table_view & input, const column_view & count, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] repeat( const table_view & input, - size_type count + size_type count, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] sequence( size_type size, const scalar & init, - const scalar & step + const scalar & step, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] calendrical_month_sequence( size_type n, const scalar& init, size_type months, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/replace.pxd index bef5a25367b..d95c3dc838f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/replace.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/replace.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -8,6 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport ( mutable_column_view, ) from pylibcudf.libcudf.scalar.scalar cimport scalar +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/replace.hpp" namespace "cudf" nogil: @@ -18,32 +19,40 @@ cdef extern from "cudf/replace.hpp" namespace "cudf" nogil: cdef unique_ptr[column] replace_nulls( column_view source_column, - column_view replacement_column) except +libcudf_exception_handler + column_view replacement_column, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] replace_nulls( column_view source_column, - scalar replacement) except +libcudf_exception_handler + scalar replacement, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] replace_nulls( column_view source_column, - replace_policy replace_policy) except +libcudf_exception_handler + replace_policy replace_policy, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] find_and_replace_all( column_view source_column, column_view values_to_replace, - column_view replacement_values) except +libcudf_exception_handler + column_view replacement_values, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] clamp( column_view source_column, scalar lo, scalar lo_replace, - scalar hi, scalar hi_replace) except +libcudf_exception_handler + scalar hi, scalar hi_replace, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] clamp( column_view source_column, - scalar lo, scalar hi) except +libcudf_exception_handler + scalar lo, scalar hi, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] normalize_nans_and_zeros( - column_view source_column) except +libcudf_exception_handler + column_view source_column, + cuda_stream_view stream) except +libcudf_exception_handler cdef void normalize_nans_and_zeros( - mutable_column_view source_column) except +libcudf_exception_handler + mutable_column_view source_column, + cuda_stream_view stream) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/rolling.pxd 
b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd index 75402051941..cc889990888 100644 --- a/python/pylibcudf/pylibcudf/libcudf/rolling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd @@ -11,6 +11,7 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type, null_order, order, size_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil: @@ -40,7 +41,8 @@ cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil: null_order null_order, range_window_type preceding, range_window_type following, - vector[rolling_request]& requests + vector[rolling_request]& requests, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] rolling_window( @@ -48,14 +50,16 @@ cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil: column_view preceding_window, column_view following_window, size_type min_periods, - rolling_aggregation& agg) except +libcudf_exception_handler + rolling_aggregation& agg, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] rolling_window( column_view source, size_type preceding_window, size_type following_window, size_type min_periods, - rolling_aggregation& agg) except +libcudf_exception_handler + rolling_aggregation& agg, + cuda_stream_view stream) except +libcudf_exception_handler cdef pair[unique_ptr[column], unique_ptr[column]] make_range_windows( const table_view& group_keys, @@ -64,6 +68,7 @@ cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil: null_order null_order, range_window_type preceding, range_window_type following, + cuda_stream_view stream ) except +libcudf_exception_handler bool is_valid_rolling_aggregation( diff --git a/python/pylibcudf/pylibcudf/replace.pxd b/python/pylibcudf/pylibcudf/replace.pxd index cb9fa8bf960..9b66f772570 100644 --- a/python/pylibcudf/pylibcudf/replace.pxd +++ b/python/pylibcudf/pylibcudf/replace.pxd @@ -1,7 +1,8 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from libcpp cimport bool from pylibcudf.libcudf.replace cimport replace_policy +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .scalar cimport Scalar @@ -16,12 +17,17 @@ ctypedef fused ReplacementType: object -cpdef Column replace_nulls(Column source_column, ReplacementType replacement) +cpdef Column replace_nulls( + Column source_column, + ReplacementType replacement, + Stream stream = *, +) cpdef Column find_and_replace_all( Column source_column, Column values_to_replace, Column replacement_values, + Stream stream = * ) cpdef Column clamp( @@ -30,6 +36,11 @@ cpdef Column clamp( Scalar hi, Scalar lo_replace=*, Scalar hi_replace=*, + Stream stream = * ) -cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=*) +cpdef Column normalize_nans_and_zeros( + Column source_column, + bool inplace=*, + Stream stream = *, +) diff --git a/python/pylibcudf/pylibcudf/replace.pyi b/python/pylibcudf/pylibcudf/replace.pyi index eed7a2a6c52..35ef60746a6 100644 --- a/python/pylibcudf/pylibcudf/replace.pyi +++ b/python/pylibcudf/pylibcudf/replace.pyi @@ -2,6 +2,8 @@ from enum import IntEnum +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.scalar import Scalar @@ -10,12 +12,15 @@ class ReplacePolicy(IntEnum): FOLLOWING = ... 
def replace_nulls( - source_column: Column, replacement: Column | Scalar | ReplacePolicy + source_column: Column, + replacement: Column | Scalar | ReplacePolicy, + stream: Stream | None = None, ) -> Column: ... def find_and_replace_all( source_column: Column, values_to_replace: Column, replacement_values: Column, + stream: Stream | None = None, ) -> Column: ... def clamp( source_column: Column, @@ -23,7 +28,8 @@ def clamp( hi: Scalar, lo_replace: Scalar | None = None, hi_replace: Scalar | None = None, + stream: Stream | None = None, ) -> Column: ... def normalize_nans_and_zeros( - source_column: Column, inplace: bool = False + source_column: Column, inplace: bool = False, stream: Stream | None = None ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index d84f814a5ee..e4089266c4f 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -8,12 +8,14 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf cimport replace as cpp_replace from pylibcudf.libcudf.column.column cimport column +from rmm.pylibrmm.stream cimport Stream from pylibcudf.libcudf.replace import \ replace_policy as ReplacePolicy # no-cython-lint from .column cimport Column from .scalar cimport Scalar +from .utils cimport _get_stream __all__ = [ "ReplacePolicy", @@ -24,7 +26,11 @@ __all__ = [ ] -cpdef Column replace_nulls(Column source_column, ReplacementType replacement): +cpdef Column replace_nulls( + Column source_column, + ReplacementType replacement, + Stream stream=None, +): """Replace nulls in source_column. The values used to replace nulls depends on the type of replacement: @@ -47,6 +53,8 @@ cpdef Column replace_nulls(Column source_column, ReplacementType replacement): If a Column, the values to use as replacements. If a Scalar, the value to use as a replacement. If a replace_policy, the policy to use to determine the replacement value. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -56,6 +64,9 @@ cpdef Column replace_nulls(Column source_column, ReplacementType replacement): """ cdef unique_ptr[column] c_result cdef replace_policy policy + + stream = _get_stream(stream) + # Due to https://github.com/cython/cython/issues/5984, if this function is # called as a Python function (i.e. 
without typed inputs, which is always # true in pure Python files), the type of `replacement` will be `object` @@ -64,8 +75,12 @@ cpdef Column replace_nulls(Column source_column, ReplacementType replacement): if isinstance(replacement, ReplacePolicy): policy = replacement with nogil: - c_result = cpp_replace.replace_nulls(source_column.view(), policy) - return Column.from_libcudf(move(c_result)) + c_result = cpp_replace.replace_nulls( + source_column.view(), + policy, + stream.view(), + ) + return Column.from_libcudf(move(c_result), stream) else: raise TypeError("replacement must be a Column, Scalar, or replace_policy") @@ -73,23 +88,31 @@ cpdef Column replace_nulls(Column source_column, ReplacementType replacement): if ReplacementType is Column: c_result = cpp_replace.replace_nulls( source_column.view(), - replacement.view() + replacement.view(), + stream.view() ) elif ReplacementType is Scalar: c_result = cpp_replace.replace_nulls( - source_column.view(), dereference(replacement.c_obj) + source_column.view(), + dereference(replacement.c_obj), + stream.view(), ) elif ReplacementType is replace_policy: - c_result = cpp_replace.replace_nulls(source_column.view(), replacement) + c_result = cpp_replace.replace_nulls( + source_column.view(), + replacement, + stream.view(), + ) else: assert False, "Internal error. Please contact pylibcudf developers" - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column find_and_replace_all( Column source_column, Column values_to_replace, Column replacement_values, + Stream stream=None ): """Replace all occurrences of values_to_replace with replacement_values. @@ -103,6 +126,8 @@ cpdef Column find_and_replace_all( The column containing values to replace. replacement_values : Column The column containing replacement values. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -111,13 +136,17 @@ cpdef Column find_and_replace_all( replaced by replacement_values. """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_replace.find_and_replace_all( source_column.view(), values_to_replace.view(), replacement_values.view(), + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column clamp( @@ -126,6 +155,7 @@ cpdef Column clamp( Scalar hi, Scalar lo_replace=None, Scalar hi_replace=None, + Stream stream=None ): """Clamp the values in source_column to the range [lo, hi]. @@ -145,6 +175,8 @@ cpdef Column clamp( hi_replace : Scalar, optional The value to use for elements that are greater than hi. If not specified, the value of hi is used. + stream : Stream | None + CUDA stream on which to perform the operation. 
Returns ------- @@ -155,25 +187,34 @@ cpdef Column clamp( raise ValueError("lo_replace and hi_replace must be specified together") cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: if lo_replace is None: c_result = cpp_replace.clamp( source_column.view(), dereference(lo.c_obj), dereference(hi.c_obj), + stream.view(), ) else: c_result = cpp_replace.clamp( source_column.view(), dereference(lo.c_obj), - dereference(hi.c_obj), dereference(lo_replace.c_obj), + dereference(hi.c_obj), dereference(hi_replace.c_obj), + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False): +cpdef Column normalize_nans_and_zeros( + Column source_column, + bool inplace=False, + Stream stream=None, +): """Normalize NaNs and zeros in source_column. For details, see :cpp:func:`normalize_nans_and_zeros`. @@ -185,6 +226,8 @@ cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False): inplace : bool, optional If True, normalize source_column in place. If False, return a new column with the normalized values. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -192,13 +235,22 @@ cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False): A copy of source_column with NaNs and zeros normalized. """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: if inplace: - cpp_replace.normalize_nans_and_zeros(source_column.mutable_view()) + cpp_replace.normalize_nans_and_zeros( + source_column.mutable_view(), + stream.view(), + ) else: - c_result = cpp_replace.normalize_nans_and_zeros(source_column.view()) + c_result = cpp_replace.normalize_nans_and_zeros( + source_column.view(), + stream.view(), + ) if not inplace: - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) ReplacePolicy.__str__ = ReplacePolicy.__repr__ diff --git a/python/pylibcudf/pylibcudf/rolling.pxd b/python/pylibcudf/pylibcudf/rolling.pxd index 305cdd97238..43242410bda 100644 --- a/python/pylibcudf/pylibcudf/rolling.pxd +++ b/python/pylibcudf/pylibcudf/rolling.pxd @@ -7,6 +7,7 @@ from pylibcudf.libcudf.rolling cimport ( bounded_closed, bounded_open, current_row, rolling_request, unbounded ) from pylibcudf.libcudf.types cimport null_order, order, size_type +from rmm.pylibrmm.stream cimport Stream from .aggregation cimport Aggregation from .column cimport Column @@ -60,6 +61,7 @@ cpdef Table grouped_range_rolling_window( PrecedingRangeWindowType preceding, FollowingRangeWindowType following, list requests, + Stream stream = *, ) cpdef Column rolling_window( @@ -68,6 +70,17 @@ cpdef Column rolling_window( WindowType following_window, size_type min_periods, Aggregation agg, + Stream stream = *, ) cpdef bool is_valid_rolling_aggregation(DataType source, Aggregation agg) + +cpdef tuple make_range_windows( + Table group_keys, + Column orderby, + order order, + null_order null_order, + PrecedingRangeWindowType preceding, + FollowingRangeWindowType following, + Stream stream = *, +) diff --git a/python/pylibcudf/pylibcudf/rolling.pyi b/python/pylibcudf/pylibcudf/rolling.pyi index 9431e40e156..334792f571a 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyi +++ b/python/pylibcudf/pylibcudf/rolling.pyi @@ -1,5 +1,7 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+from rmm.pylibrmm.stream import Stream + from pylibcudf.aggregation import Aggregation from pylibcudf.column import Column from pylibcudf.scalar import Scalar @@ -32,6 +34,7 @@ def grouped_range_rolling_window( preceding: RangeWindowType, following: RangeWindowType, requests: list[RollingRequest], + stream: Stream | None = None, ) -> Table: ... def rolling_window[WindowType: (Column, int)]( source: Column, @@ -39,6 +42,7 @@ def rolling_window[WindowType: (Column, int)]( following_window: WindowType, min_periods: int, agg: Aggregation, + stream: Stream | None = None, ) -> Column: ... def is_valid_rolling_aggregation( source: DataType, agg: Aggregation @@ -50,4 +54,5 @@ def make_range_windows( null_order: NullOrder, preceding: RangeWindowType, following: RangeWindowType, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index 9f4de0491d8..8b9c46994a5 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -11,11 +11,13 @@ from pylibcudf.libcudf.aggregation cimport rolling_aggregation from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type +from rmm.pylibrmm.stream cimport Stream from .aggregation cimport Aggregation from .column cimport Column from .scalar cimport Scalar from .types cimport DataType +from .utils cimport _get_stream __all__ = [ @@ -121,6 +123,7 @@ cpdef Table grouped_range_rolling_window( PrecedingRangeWindowType preceding, FollowingRangeWindowType following, list requests, + Stream stream=None, ): """ Perform grouping-aware range-based rolling window aggregations on some columns. @@ -154,6 +157,8 @@ cpdef Table grouped_range_rolling_window( for req in requests: crequests.push_back(move((<RollingRequest?>req).view())) + stream = _get_stream(stream) + with nogil: result = cpp_rolling.grouped_range_rolling_window( group_keys.view(), @@ -162,9 +167,10 @@ cpdef Table grouped_range_rolling_window( null_order, dereference(preceding.c_obj.get()), dereference(following.c_obj.get()), - crequests + crequests, + stream.view() ) - return Table.from_libcudf(move(result)) + return Table.from_libcudf(move(result), stream) cpdef Column rolling_window( @@ -173,6 +179,7 @@ cpdef Column rolling_window( WindowType following_window, size_type min_periods, Aggregation agg, + Stream stream=None, ): """Perform a rolling window operation on a column @@ -202,6 +209,9 @@ cpdef Column rolling_window( # TODO: Consider making all the conversion functions nogil functions that # reclaim the GIL internally for just the necessary scope like column.view() cdef const rolling_aggregation *c_agg = agg.view_underlying_as_rolling() + + stream = _get_stream(stream) + if WindowType is Column: with nogil: result = cpp_rolling.rolling_window( @@ -210,6 +220,7 @@ cpdef Column rolling_window( following_window.view(), min_periods, dereference(c_agg), + stream.view() ) else: with nogil: @@ -219,9 +230,10 @@ cpdef Column rolling_window( following_window, min_periods, dereference(c_agg), + stream.view() ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef bool is_valid_rolling_aggregation(DataType source, Aggregation agg): @@ -249,6 +261,7 @@ cpdef tuple make_range_windows( null_order null_order, PrecedingRangeWindowType preceding, FollowingRangeWindowType following, + Stream stream=None, ): """ Constructs preceding and following columns given window
range specifications. @@ -277,6 +290,8 @@ cpdef tuple make_range_windows( """ cdef pair[unique_ptr[column], unique_ptr[column]] result + stream = _get_stream(stream) + with nogil: result = cpp_rolling.make_range_windows( group_keys.view(), @@ -285,8 +300,9 @@ cpdef tuple make_range_windows( null_order, dereference(preceding.c_obj.get()), dereference(following.c_obj.get()), + stream.view() ) return ( - Column.from_libcudf(move(result.first)), - Column.from_libcudf(move(result.second)) + Column.from_libcudf(move(result.first), stream), + Column.from_libcudf(move(result.second), stream) ) From 01c16280281f922079d1113f504f50b82bc503e5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:47:12 -0700 Subject: [PATCH 076/366] Fix strftime with non-exact %a, %A, %b, %B (#19570) closes https://github.com/rapidsai/cudf/issues/19568 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19570 --- python/cudf/cudf/core/column/datetime.py | 108 +++++++++++++---------- python/cudf/cudf/tests/test_datetime.py | 5 ++ 2 files changed, 64 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 08a0c20506e..24a5968fa5a 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -5,6 +5,7 @@ import calendar import functools import locale +import re import warnings from locale import nl_langinfo from typing import TYPE_CHECKING, Literal @@ -23,10 +24,9 @@ get_tz_data, ) from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column.column import ColumnBase, as_column, column_empty +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.temporal_base import TemporalBaseColumn from cudf.utils.dtypes import ( - CUDF_STRING_DTYPE, _get_base_dtype, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, @@ -60,49 +60,6 @@ "%a", } -_DATETIME_NAMES = [ - nl_langinfo(locale.AM_STR), # type: ignore - nl_langinfo(locale.PM_STR), # type: ignore - nl_langinfo(locale.DAY_1), - nl_langinfo(locale.DAY_2), - nl_langinfo(locale.DAY_3), - nl_langinfo(locale.DAY_4), - nl_langinfo(locale.DAY_5), - nl_langinfo(locale.DAY_6), - nl_langinfo(locale.DAY_7), - nl_langinfo(locale.ABDAY_1), - nl_langinfo(locale.ABDAY_2), - nl_langinfo(locale.ABDAY_3), - nl_langinfo(locale.ABDAY_4), - nl_langinfo(locale.ABDAY_5), - nl_langinfo(locale.ABDAY_6), - nl_langinfo(locale.ABDAY_7), - nl_langinfo(locale.MON_1), - nl_langinfo(locale.MON_2), - nl_langinfo(locale.MON_3), - nl_langinfo(locale.MON_4), - nl_langinfo(locale.MON_5), - nl_langinfo(locale.MON_6), - nl_langinfo(locale.MON_7), - nl_langinfo(locale.MON_8), - nl_langinfo(locale.MON_9), - nl_langinfo(locale.MON_10), - nl_langinfo(locale.MON_11), - nl_langinfo(locale.MON_12), - nl_langinfo(locale.ABMON_1), - nl_langinfo(locale.ABMON_2), - nl_langinfo(locale.ABMON_3), - nl_langinfo(locale.ABMON_4), - nl_langinfo(locale.ABMON_5), - nl_langinfo(locale.ABMON_6), - nl_langinfo(locale.ABMON_7), - nl_langinfo(locale.ABMON_8), - nl_langinfo(locale.ABMON_9), - nl_langinfo(locale.ABMON_10), - nl_langinfo(locale.ABMON_11), - nl_langinfo(locale.ABMON_12), -] - def _resolve_binop_resolution( left_unit: Literal["s", "ms", "us", "ns"], @@ -459,19 +416,72 @@ def as_timedelta_column(self, dtype: np.dtype) -> None: # type: ignore[override f"cannot astype a datetimelike from {self.dtype} to 
{dtype}" ) + @functools.cached_property + def _strftime_names(self) -> plc.Column: + """Strftime names for %A, %a, %B, %b""" + return plc.Column.from_iterable_of_py( + [ + nl_langinfo(loc) + for loc in ( + locale.AM_STR, + locale.PM_STR, + locale.DAY_1, + locale.DAY_2, + locale.DAY_3, + locale.DAY_4, + locale.DAY_5, + locale.DAY_6, + locale.DAY_7, + locale.ABDAY_1, + locale.ABDAY_2, + locale.ABDAY_3, + locale.ABDAY_4, + locale.ABDAY_5, + locale.ABDAY_6, + locale.ABDAY_7, + locale.MON_1, + locale.MON_2, + locale.MON_3, + locale.MON_4, + locale.MON_5, + locale.MON_6, + locale.MON_7, + locale.MON_8, + locale.MON_9, + locale.MON_10, + locale.MON_11, + locale.MON_12, + locale.ABMON_1, + locale.ABMON_2, + locale.ABMON_3, + locale.ABMON_4, + locale.ABMON_5, + locale.ABMON_6, + locale.ABMON_7, + locale.ABMON_8, + locale.ABMON_9, + locale.ABMON_10, + locale.ABMON_11, + locale.ABMON_12, + ) + ] + ) + def strftime(self, format: str) -> StringColumn: if len(self) == 0: return super().strftime(format) - if format in _DATETIME_SPECIAL_FORMATS: - names = as_column(_DATETIME_NAMES) + if re.search("%[aAbB]", format): + names = self._strftime_names else: - names = column_empty(0, dtype=CUDF_STRING_DTYPE) + names = plc.Column.from_scalar( + plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING)), 0 + ) with acquire_spill_lock(): return type(self).from_pylibcudf( # type: ignore[return-value] plc.strings.convert.convert_datetime.from_timestamps( self.to_pylibcudf(mode="read"), format, - names.to_pylibcudf(mode="read"), + names, ) ) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index fcc03e43944..94fca7a2a6b 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1195,6 +1195,11 @@ def test_datetime_fillna(data, dtype, fill_value): "%B", "%a", "%A", + "%U_", + "_%b", + "%B*", + "%a ", + "%A1", ], ) def test_datetime_strftime(data, dtype, date_format): From 58fc6dccba1aed18ffa3bdcdca303827bb61831f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 6 Aug 2025 17:39:00 -0700 Subject: [PATCH 077/366] Fix uninitialized variable and misaligned write in parquet generic decoder (#19601) Contributes to #19469 This PR fixes two minor bugs when a page is skipped for decoding in the parquet generic decoder. The bugs were discovered while working on #19469. The first fix initializes s->nesting_info = nullptr to avoid invalid null back copy at return. 
The second fix writes list offsets only for depths `< max_depth` instead of `<= max_depth`, so the loop no longer indexes `nesting_info[max_depth]`. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - David Wendt (https://github.com/davidwendt) - Paul Mattione (https://github.com/pmattione-nvidia) URL: https://github.com/rapidsai/cudf/pull/19601 --- cpp/src/io/parquet/decode_fixed.cu | 2 ++ cpp/src/io/parquet/page_decode.cuh | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index c475a06785c..077f9f80715 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -1054,6 +1054,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) if (not page_mask[page_idx]) { pp->num_nulls = pp->num_rows; pp->num_valids = 0; + // Set s->nesting_info = nullptr to bypass `null_count_back_copier` at return + s->nesting_info = nullptr; return; } } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 0b896d7d548..6c987230709 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -727,7 +727,7 @@ inline __device__ void get_nesting_bounds(int& start_depth, * This function iterates through the nesting levels of a column and updates the offsets for a list * column. The offset for the current nesting level equals the length of the next nesting level * - * @tparam decode_block_size The size of the block used for decoding. + * @tparam block_size The size of the block used for decoding. * @param[in,out] state Pointer to page state containing column and nesting information. */ template <int block_size> @@ -738,7 +738,7 @@ static __device__ void update_list_offsets_for_pruned_pages(page_state_s* state) auto const tid = cg::this_thread_block().thread_rank(); // Iterate by depth and store offset(s) to the list location(s) - for (int depth = tid; depth <= max_depth; depth += block_size) { + for (int depth = tid; depth < max_depth; depth += block_size) { auto& nesting_info = state->nesting_info[depth]; // If we're -not- at a leaf column and we're within nesting/row bounds and we have a valid // data_out pointer, it implies this is a list column, so emit an offset for the current nesting From f6f41c42fe47d86fff03219f1b33aab5e9adbde9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 Aug 2025 18:15:26 -0700 Subject: [PATCH 078/366] Move test_joining to new cudf classic test directory structure (#19501) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19501 --- python/cudf/cudf/tests/reshape/test_join.py | 670 +++++++++ .../test_merge.py} | 1263 ++--------------- 2 files changed, 788 insertions(+), 1145 deletions(-) create mode 100644 python/cudf/cudf/tests/reshape/test_join.py rename python/cudf/cudf/tests/{test_joining.py => reshape/test_merge.py} (53%) diff --git a/python/cudf/cudf/tests/reshape/test_join.py b/python/cudf/cudf/tests/reshape/test_join.py new file mode 100644 index 00000000000..fa80cd299a4 --- /dev/null +++ b/python/cudf/cudf/tests/reshape/test_join.py @@ -0,0 +1,670 @@ +# Copyright (c) 2025, NVIDIA CORPORATION.
+ + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +@pytest.fixture( + params=( + "left", + "inner", + "outer", + "right", + "leftanti", + "leftsemi", + "cross", + ) +) +def how(request): + return request.param + + +def assert_join_results_equal(expect, got, how, **kwargs): + if how == "right": + got = got[expect.columns] + + if isinstance(expect, (pd.Series, cudf.Series)): + return assert_eq( + expect.sort_values().reset_index(drop=True), + got.sort_values().reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): + if not len( + expect.columns + ): # can't sort_values() on a df without columns + return assert_eq(expect, got, **kwargs) + + assert_eq( + expect.sort_values(expect.columns.to_list()).reset_index( + drop=True + ), + got.sort_values(got.columns.to_list()).reset_index(drop=True), + **kwargs, + ) + elif isinstance(expect, (pd.Index, cudf.Index)): + return assert_eq(expect.sort_values(), got.sort_values(), **kwargs) + else: + raise ValueError(f"Not a join result: {type(expect).__name__}") + + +@pytest.mark.parametrize( + "aa, bb", + [ + [[0, 0, 4, 5, 5], [0, 0, 2, 3, 5]], + [[0, 0, 1, 2, 3], [0, 1, 2, 2, 3]], + [range(5), range(5, 10)], + [[0.1, 0.2, 0.3, 0.4, 0.5], [0.6, 0.7, 0.8, 0.9, 1.0]], + ], +) +def test_dataframe_join_how(aa, bb, how): + df = cudf.DataFrame( + { + "a": aa, + "b": bb, + } + ) + + def work_pandas(df, how): + df1 = df.set_index("a") + df2 = df.set_index("b") + if how == "leftanti": + joined = df1[~df1.index.isin(df2.index)][df1.columns] + elif how == "leftsemi": + joined = df1[df1.index.isin(df2.index)][df1.columns] + else: + joined = df1.join(df2, how=how, sort=True) + return joined + + def work_gdf(df): + df1 = df.set_index("a") + df2 = df.set_index("b") + joined = df1.join(df2, how=how, sort=True) + return joined + + expect = work_pandas(df.to_pandas(), how) + got = work_gdf(df) + expecto = expect.copy() + goto = got.copy() + + expect = expect.astype(np.float64).fillna(np.nan)[expect.columns] + got = got.astype(np.float64).fillna(np.nan)[expect.columns] + + assert got.index.name is None + + assert list(expect.columns) == list(got.columns) + if how in {"left", "inner", "right", "leftanti", "leftsemi"}: + assert_eq(sorted(expect.index.values), sorted(got.index.values)) + if how != "outer": + # Newly introduced ambiguous ValueError thrown when + # an index and column have the same name. Rename the + # index so sorts work. + # TODO: What is the less hacky way? 
+ expect.index.name = "bob" + got.index.name = "mary" + assert_join_results_equal(expect, got, how=how) + # if(how=='right'): + # _sorted_check_series(expect['a'], expect['b'], + # got['a'], got['b']) + # else: + # _sorted_check_series(expect['b'], expect['a'], got['b'], + # got['a']) + else: + magic = 0xDEADBEAF + for c in expecto.columns: + expect = expecto[c].fillna(-1) + got = goto[c].fillna(-1) + + direct_equal = np.all(expect.values == got.to_numpy()) + nanfilled_equal = np.all( + expect.fillna(magic).values == got.fillna(magic).to_numpy() + ) + msg = "direct_equal={}, nanfilled_equal={}".format( + direct_equal, nanfilled_equal + ) + assert direct_equal or nanfilled_equal, msg + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="bug in older version of pandas", +) +def test_dataframe_join_suffix(): + rng = np.random.default_rng(seed=0) + + df = cudf.DataFrame(rng.integers(0, 5, (5, 3)), columns=list("abc")) + + left = df.set_index("a") + right = df.set_index("c") + msg = ( + "there are overlapping columns but lsuffix and rsuffix are not defined" + ) + with pytest.raises(ValueError, match=msg): + left.join(right) + + got = left.join(right, lsuffix="_left", rsuffix="_right", sort=True) + expect = left.to_pandas().join( + right.to_pandas(), + lsuffix="_left", + rsuffix="_right", + sort=True, + ) + # TODO: Retain result index name + expect.index.name = None + assert_join_results_equal(expect, got, how="inner") + + +def test_dataframe_join_cats(): + lhs = cudf.DataFrame() + lhs["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) + lhs["b"] = bb = np.arange(len(lhs)) + lhs = lhs.set_index("a") + + rhs = cudf.DataFrame() + rhs["a"] = pd.Categorical(list("abcac"), categories=list("abc")) + rhs["c"] = cc = np.arange(len(rhs)) + rhs = rhs.set_index("a") + + got = lhs.join(rhs) + expect = lhs.to_pandas().join(rhs.to_pandas()) + + # Note: pandas make an object Index after joining + assert_join_results_equal(expect, got, how="inner") + + # Just do some rough checking here. 
+ assert list(got.columns) == ["b", "c"] + assert len(got) > 0 + assert set(got.index.to_pandas()) & set("abc") + assert set(got["b"].to_numpy()) & set(bb) + assert set(got["c"].to_numpy()) & set(cc) + + +def test_dataframe_join_combine_cats(): + lhs = cudf.DataFrame({"join_index": ["a", "b", "c"], "data_x": [1, 2, 3]}) + rhs = cudf.DataFrame({"join_index": ["b", "c", "d"], "data_y": [2, 3, 4]}) + + lhs["join_index"] = lhs["join_index"].astype("category") + rhs["join_index"] = rhs["join_index"].astype("category") + + lhs = lhs.set_index("join_index") + rhs = rhs.set_index("join_index") + + lhs_pd = lhs.to_pandas() + rhs_pd = rhs.to_pandas() + + lhs_pd.index = lhs_pd.index.astype("object") + rhs_pd.index = rhs_pd.index.astype("object") + + expect = lhs_pd.join(rhs_pd, how="outer") + expect.index = expect.index.astype("category") + got = lhs.join(rhs, how="outer") + + assert_eq(expect.index.sort_values(), got.index.sort_values()) + + +def test_dataframe_join_mismatch_cats(how): + if how in {"leftanti", "leftsemi"}: + pytest.skip(f"{how} not implemented in pandas") + + pdf1 = pd.DataFrame( + { + "join_col": ["a", "b", "c", "d", "e"], + "data_col_left": [10, 20, 30, 40, 50], + } + ) + pdf2 = pd.DataFrame( + {"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]} + ) + + pdf1["join_col"] = pdf1["join_col"].astype("category") + pdf2["join_col"] = pdf2["join_col"].astype("category") + + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) + + gdf1 = gdf1.set_index("join_col") + gdf2 = gdf2.set_index("join_col") + + pdf1 = pdf1.set_index("join_col") + pdf2 = pdf2.set_index("join_col") + join_gdf = gdf1.join(gdf2, how=how, sort=True) + join_pdf = pdf1.join(pdf2, how=how) + + got = join_gdf.fillna(-1).to_pandas() + expect = join_pdf.fillna(-1) # note: cudf join doesn't mask NA + + # We yield a categorical here whereas pandas gives Object. 
+ expect.index = expect.index.astype("category") + # cudf creates the columns in different order than pandas for right join + if how == "right": + got = got[["data_col_left", "data_col_right"]] + + expect.data_col_right = expect.data_col_right.astype(np.int64) + expect.data_col_left = expect.data_col_left.astype(np.int64) + + assert_join_results_equal(expect, got, how=how, check_categorical=False) + + +def test_join_datetimes_index(datetime_types_as_str): + datetimes = pd.Series(pd.date_range("20010101", "20010102", freq="12h")) + pdf_lhs = pd.DataFrame(index=[1, 0, 1, 2, 0, 0, 1]) + pdf_rhs = pd.DataFrame({"d": datetimes}) + gdf_lhs = cudf.from_pandas(pdf_lhs) + gdf_rhs = cudf.from_pandas(pdf_rhs) + + gdf_rhs["d"] = gdf_rhs["d"].astype(datetime_types_as_str) + + pdf = pdf_lhs.join(pdf_rhs, sort=True) + gdf = gdf_lhs.join(gdf_rhs, sort=True) + + assert gdf["d"].dtype == cudf.dtype(datetime_types_as_str) + + assert_join_results_equal(pdf, gdf, how="inner", check_dtype=False) + + +@pytest.mark.parametrize( + "column_a", + [ + ( + pd.Series([None, 1, 2, 3, 4, 5, 6, 7], dtype=np.float64), + pd.Series([8, 9, 10, 11, 12, None, 14, 15], dtype=np.float64), + ) + ], +) +@pytest.mark.parametrize( + "column_b", + [ + ( + pd.Series([0, 1, 0, None, 1, 0, 0, 0], dtype=np.float64), + pd.Series([None, 1, 2, 1, 2, 2, 0, 0], dtype=np.float64), + ) + ], +) +@pytest.mark.parametrize( + "column_c", + [ + ( + pd.Series(["dog", "cat", "fish", "bug"] * 2), + pd.Series(["bird", "cat", "mouse", "snake"] * 2), + ), + ( + pd.Series(["dog", "cat", "fish", "bug"] * 2).astype("category"), + pd.Series(["bird", "cat", "mouse", "snake"] * 2).astype( + "category" + ), + ), + ], +) +def test_join_multi(how, column_a, column_b, column_c): + if how in {"leftanti", "leftsemi"}: + pytest.skip(f"{how} not implemented in pandas") + + index = ["b", "c"] + df1 = pd.DataFrame() + df1["a1"] = column_a[0] + df1["b"] = column_b[0] + df1["c"] = column_c[0] + df1 = df1.set_index(index) + gdf1 = cudf.from_pandas(df1) + + df2 = pd.DataFrame() + df2["a2"] = column_a[1] + df2["b"] = column_b[1] + df2["c"] = column_c[1] + df2 = df2.set_index(index) + gdf2 = cudf.from_pandas(df2) + + gdf_result = gdf1.join(gdf2, how=how, sort=True) + pdf_result = df1.join(df2, how=how, sort=True) + + # Make sure columns are in the same order + columns = pdf_result.columns.values + gdf_result = gdf_result[columns] + pdf_result = pdf_result[columns] + + assert_join_results_equal(pdf_result, gdf_result, how="inner") + + +@pytest.mark.parametrize( + ("lhs", "rhs"), + [ + (["a", "b"], ["a"]), + (["a"], ["a", "b"]), + (["a", "b"], ["b"]), + (["b"], ["a", "b"]), + (["a"], ["a"]), + ], +) +@pytest.mark.parametrize("level", ["a", "b", 0, 1]) +def test_index_join(lhs, rhs, how, level): + if how in {"leftanti", "leftsemi", "cross"}: + pytest.skip(f"{how} not implemented in pandas") + + l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) + r_pdf = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4]}) + l_df = cudf.from_pandas(l_pdf) + r_df = cudf.from_pandas(r_pdf) + p_lhs = l_pdf.set_index(lhs).index + p_rhs = r_pdf.set_index(rhs).index + g_lhs = l_df.set_index(lhs).index + g_rhs = r_df.set_index(rhs).index + + expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) + got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) + + assert_join_results_equal(expected, got, how=how) + + +def test_index_join_corner_cases(): + l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) + r_pdf = pd.DataFrame( + {"a": [1, 5, 4, 0], "b": [3, 9, 
8, 4], "c": [2, 3, 6, 0]} + ) + l_df = cudf.from_pandas(l_pdf) + r_df = cudf.from_pandas(r_pdf) + + # Join when column name doesn't match with level + lhs = ["a", "b"] + # level and rhs don't match + rhs = ["c"] + level = "b" + how = "outer" + p_lhs = l_pdf.set_index(lhs).index + p_rhs = r_pdf.set_index(rhs).index + g_lhs = l_df.set_index(lhs).index + g_rhs = r_df.set_index(rhs).index + expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) + got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) + + assert_join_results_equal(expected, got, how=how) + + # sort is supported only in case of two non-MultiIndex join + # Join when column name doesn't match with level + lhs = ["a"] + # level and rhs don't match + rhs = ["a"] + level = "b" + how = "left" + p_lhs = l_pdf.set_index(lhs).index + p_rhs = r_pdf.set_index(rhs).index + g_lhs = l_df.set_index(lhs).index + g_rhs = r_df.set_index(rhs).index + expected = p_lhs.join(p_rhs, how=how, sort=True) + got = g_lhs.join(g_rhs, how=how, sort=True) + + assert_join_results_equal(expected, got, how=how) + + # Pandas Index.join on categorical column returns generic column + # but cudf will be returning a categorical column itself. + lhs = ["a", "b"] + rhs = ["a"] + level = "a" + how = "inner" + l_df["a"] = l_df["a"].astype("category") + r_df["a"] = r_df["a"].astype("category") + p_lhs = l_pdf.set_index(lhs).index + p_rhs = r_pdf.set_index(rhs).index + g_lhs = l_df.set_index(lhs).index + g_rhs = r_df.set_index(rhs).index + expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) + got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) + + got["a"] = got["a"].astype(expected["a"].dtype) + + assert_join_results_equal(expected, got, how=how) + + +def test_index_join_exception_cases(): + l_df = cudf.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) + r_df = cudf.DataFrame( + {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} + ) + + # Join between two MultiIndex + lhs = ["a", "b"] + rhs = ["a", "c"] + level = "a" + how = "outer" + g_lhs = l_df.set_index(lhs).index + g_rhs = r_df.set_index(rhs).index + + with pytest.raises(TypeError): + g_lhs.join(g_rhs, level=level, how=how) + + # Improper level value, level should be an int or scalar value + level = ["a"] + rhs = ["a"] + g_lhs = l_df.set_index(lhs).index + g_rhs = r_df.set_index(rhs).index + with pytest.raises(ValueError): + g_lhs.join(g_rhs, level=level, how=how) + + +def test_typecast_on_join_indexes(): + join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype="int8") + join_data_r = cudf.Series([1, 2, 3, 4, 6], dtype="int32") + other_data = ["a", "b", "c", "d", "e"] + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + gdf_l = gdf_l.set_index("join_col") + gdf_r = gdf_r.set_index("join_col") + + exp_join_data = [1, 2, 3, 4] + exp_other_data = ["a", "b", "c", "d"] + + expect = cudf.DataFrame( + { + "join_col": exp_join_data, + "B_x": exp_other_data, + "B_y": exp_other_data, + } + ) + expect = expect.set_index("join_col") + + got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") + + assert_join_results_equal(expect, got, how="inner") + + +def test_typecast_on_join_multiindices(): + join_data_l_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int8") + join_data_l_1 = cudf.Series([2, 3, 4.1, 5.9, 6], dtype="float32") + join_data_l_2 = cudf.Series([7, 8, 9, 0, 1], dtype="float32") + + join_data_r_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int32") + join_data_r_1 = 
cudf.Series([2, 3, 4, 5, 6], dtype="int32") + join_data_r_2 = cudf.Series([7, 8, 9, 0, 0], dtype="float64") + + other_data = ["a", "b", "c", "d", "e"] + + gdf_l = cudf.DataFrame( + { + "join_col_0": join_data_l_0, + "join_col_1": join_data_l_1, + "join_col_2": join_data_l_2, + "B": other_data, + } + ) + gdf_r = cudf.DataFrame( + { + "join_col_0": join_data_r_0, + "join_col_1": join_data_r_1, + "join_col_2": join_data_r_2, + "B": other_data, + } + ) + + gdf_l = gdf_l.set_index(["join_col_0", "join_col_1", "join_col_2"]) + gdf_r = gdf_r.set_index(["join_col_0", "join_col_1", "join_col_2"]) + + exp_join_data_0 = cudf.Series([1, 2], dtype="int32") + exp_join_data_1 = cudf.Series([2, 3], dtype="float64") + exp_join_data_2 = cudf.Series([7, 8], dtype="float64") + exp_other_data = cudf.Series(["a", "b"]) + + expect = cudf.DataFrame( + { + "join_col_0": exp_join_data_0, + "join_col_1": exp_join_data_1, + "join_col_2": exp_join_data_2, + "B_x": exp_other_data, + "B_y": exp_other_data, + } + ) + expect = expect.set_index(["join_col_0", "join_col_1", "join_col_2"]) + got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") + + assert_join_results_equal(expect, got, how="inner") + + +def test_typecast_on_join_indexes_matching_categorical(): + join_data_l = cudf.Series(["a", "b", "c", "d", "e"], dtype="category") + join_data_r = cudf.Series(["a", "b", "c", "d", "e"], dtype="str") + other_data = [1, 2, 3, 4, 5] + + gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) + gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) + + gdf_l = gdf_l.set_index("join_col") + gdf_r = gdf_r.set_index("join_col") + + exp_join_data = ["a", "b", "c", "d", "e"] + exp_other_data = [1, 2, 3, 4, 5] + + expect = cudf.DataFrame( + { + "join_col": exp_join_data, + "B_x": exp_other_data, + "B_y": exp_other_data, + } + ) + expect = expect.set_index("join_col") + got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") + + assert_join_results_equal(expect, got, how="inner") + + +def test_join_multiindex_empty(): + lhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}, index=["a", "b", "c"]) + lhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) + rhs = pd.DataFrame(index=["a", "c", "d"]) + g_lhs = cudf.from_pandas(lhs) + g_rhs = cudf.from_pandas(rhs) + assert_exceptions_equal( + lfunc=lhs.join, + rfunc=g_lhs.join, + lfunc_args_and_kwargs=([rhs], {"how": "inner"}), + rfunc_args_and_kwargs=([g_rhs], {"how": "inner"}), + check_exception_type=False, + ) + + +def test_join_on_index_with_duplicate_names(): + # although index levels with duplicate names are poorly supported + # overall, we *should* be able to join on them: + lhs = pd.DataFrame({"a": [1, 2, 3]}) + rhs = pd.DataFrame({"b": [1, 2, 3]}) + lhs.index = pd.MultiIndex.from_tuples( + [(1, 1), (1, 2), (2, 1)], names=["x", "x"] + ) + rhs.index = pd.MultiIndex.from_tuples( + [(1, 1), (1, 3), (2, 1)], names=["x", "x"] + ) + expect = lhs.join(rhs, how="inner") + + lhs = cudf.from_pandas(lhs) + rhs = cudf.from_pandas(rhs) + got = lhs.join(rhs, how="inner") + + assert_join_results_equal(expect, got, how="inner") + + +def test_join_multiindex_index(): + # test joining a MultiIndex with an Index with overlapping name + lhs = ( + cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}) + .set_index(["a", "b"]) + .index + ) + rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index + expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner") + got = lhs.join(rhs, how="inner") + assert_join_results_equal(expect, got, how="inner") + + +def 
test_dataframe_join_on(): + """Verify that specifying the on parameter gives a NotImplementedError.""" + df = cudf.DataFrame({"a": [1, 2, 3]}) + with pytest.raises(NotImplementedError): + df.join(df, on="a") + + +def test_index_join_return_indexers_notimplemented(): + index = cudf.RangeIndex(start=0, stop=20, step=2) + other = cudf.Index([4, 4, 3, 3]) + with pytest.raises(NotImplementedError): + index.join(other, how="left", return_indexers=True) + + +@pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/57065", +) +@pytest.mark.parametrize("how", ["inner", "outer"]) +def test_index_join_names(how): + idx1 = cudf.Index([10, 1, 2, 4, 2, 1], name="a") + idx2 = cudf.Index([-10, 2, 3, 1, 2], name="b") + pidx1 = idx1.to_pandas() + pidx2 = idx2.to_pandas() + + expected = pidx1.join(pidx2, how=how) + actual = idx1.join(idx2, how=how) + assert_join_results_equal(actual, expected, how=how) + + +@pytest.mark.parametrize( + "left_data", + [ + {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}, + {"lkey": ["foo", "bar", "baz", "foo"], "value": [5, 3, 2, 1]}, + { + "lkey": ["foo", "bar", "baz", "foo"], + "value": [5, 3, 2, 1], + "extra_left": [1, 2, 3, 4], + }, + ], +) +@pytest.mark.parametrize( + "right_data", + [ + {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}, + {"rkey": ["foo", "bar", "baz", "foo"], "value": [8, 7, 6, 5]}, + { + "rkey": ["foo", "bar", "baz", "foo"], + "value": [8, 7, 6, 5], + "extra_right": [10, 2, 30, 4], + }, + ], +) +@pytest.mark.parametrize("sort", [True, False]) +def test_cross_join_overlapping(left_data, right_data, sort): + df1 = cudf.DataFrame(left_data) + df2 = cudf.DataFrame(right_data) + + pdf1 = df1.to_pandas() + pdf2 = df2.to_pandas() + expected = pdf1.join( + pdf2, how="cross", lsuffix="_x", rsuffix="_y", sort=sort + ) + result = df1.join(df2, how="cross", lsuffix="_x", rsuffix="_y", sort=sort) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/reshape/test_merge.py similarity index 53% rename from python/cudf/cudf/tests/test_joining.py rename to python/cudf/cudf/tests/reshape/test_merge.py index bb24111cfc3..f661eb4b587 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/reshape/test_merge.py @@ -1,19 +1,15 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. 
-from itertools import combinations, product, repeat import numpy as np import pandas as pd import pytest import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import assert_eq from cudf.testing._utils import ( - INTEGER_TYPES, NUMERIC_TYPES, - TIMEDELTA_TYPES, assert_exceptions_equal, expect_warning_if, ) @@ -35,28 +31,6 @@ def how(request): return request.param -rng = np.random.default_rng(seed=0) - - -@pytest.fixture( - params=[ - [[0, 0, 4, 5, 5], [0, 0, 2, 3, 5]], - [[0, 0, 1, 2, 3], [0, 1, 2, 2, 3]], - [rng.integers(0, 50, 100), rng.integers(0, 50, 100)], - [rng.random(50), rng.random(50)], - ] -) -def aa_bb(request): - return request.param - - -def pd_odd_joins(left, right, join_type): - if join_type == "leftanti": - return left[~left.index.isin(right.index)][left.columns] - elif join_type == "leftsemi": - return left[left.index.isin(right.index)][left.columns] - - def assert_join_results_equal(expect, got, how, **kwargs): if how == "right": got = got[expect.columns] @@ -86,193 +60,6 @@ def assert_join_results_equal(expect, got, how, **kwargs): raise ValueError(f"Not a join result: {type(expect).__name__}") -def test_dataframe_join_how(aa_bb, how): - aa, bb = aa_bb - df = cudf.DataFrame( - { - "a": aa, - "b": bb, - } - ) - - def work_pandas(df, how): - df1 = df.set_index("a") - df2 = df.set_index("b") - if how == "leftanti": - joined = pd_odd_joins(df1, df2, "leftanti") - elif how == "leftsemi": - joined = pd_odd_joins(df1, df2, "leftsemi") - else: - joined = df1.join(df2, how=how, sort=True) - return joined - - def work_gdf(df): - df1 = df.set_index("a") - df2 = df.set_index("b") - joined = df1.join(df2, how=how, sort=True) - return joined - - expect = work_pandas(df.to_pandas(), how) - got = work_gdf(df) - expecto = expect.copy() - goto = got.copy() - - expect = expect.astype(np.float64).fillna(np.nan)[expect.columns] - got = got.astype(np.float64).fillna(np.nan)[expect.columns] - - assert got.index.name is None - - assert list(expect.columns) == list(got.columns) - if how in {"left", "inner", "right", "leftanti", "leftsemi"}: - assert_eq(sorted(expect.index.values), sorted(got.index.values)) - if how != "outer": - # Newly introduced ambiguous ValueError thrown when - # an index and column have the same name. Rename the - # index so sorts work. - # TODO: What is the less hacky way? 
- expect.index.name = "bob" - got.index.name = "mary" - assert_join_results_equal(expect, got, how=how) - # if(how=='right'): - # _sorted_check_series(expect['a'], expect['b'], - # got['a'], got['b']) - # else: - # _sorted_check_series(expect['b'], expect['a'], got['b'], - # got['a']) - else: - magic = 0xDEADBEAF - for c in expecto.columns: - expect = expecto[c].fillna(-1) - got = goto[c].fillna(-1) - - direct_equal = np.all(expect.values == got.to_numpy()) - nanfilled_equal = np.all( - expect.fillna(magic).values == got.fillna(magic).to_numpy() - ) - msg = "direct_equal={}, nanfilled_equal={}".format( - direct_equal, nanfilled_equal - ) - assert direct_equal or nanfilled_equal, msg - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="bug in older version of pandas", -) -def test_dataframe_join_suffix(): - rng = np.random.default_rng(seed=0) - - df = cudf.DataFrame(rng.integers(0, 5, (5, 3)), columns=list("abc")) - - left = df.set_index("a") - right = df.set_index("c") - msg = ( - "there are overlapping columns but lsuffix and rsuffix are not defined" - ) - with pytest.raises(ValueError, match=msg): - left.join(right) - - got = left.join(right, lsuffix="_left", rsuffix="_right", sort=True) - expect = left.to_pandas().join( - right.to_pandas(), - lsuffix="_left", - rsuffix="_right", - sort=True, - ) - # TODO: Retain result index name - expect.index.name = None - assert_join_results_equal(expect, got, how="inner") - - -def test_dataframe_join_cats(): - lhs = cudf.DataFrame() - lhs["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) - lhs["b"] = bb = np.arange(len(lhs)) - lhs = lhs.set_index("a") - - rhs = cudf.DataFrame() - rhs["a"] = pd.Categorical(list("abcac"), categories=list("abc")) - rhs["c"] = cc = np.arange(len(rhs)) - rhs = rhs.set_index("a") - - got = lhs.join(rhs) - expect = lhs.to_pandas().join(rhs.to_pandas()) - - # Note: pandas make an object Index after joining - assert_join_results_equal(expect, got, how="inner") - - # Just do some rough checking here. 
- assert list(got.columns) == ["b", "c"] - assert len(got) > 0 - assert set(got.index.to_pandas()) & set("abc") - assert set(got["b"].to_numpy()) & set(bb) - assert set(got["c"].to_numpy()) & set(cc) - - -def test_dataframe_join_combine_cats(): - lhs = cudf.DataFrame({"join_index": ["a", "b", "c"], "data_x": [1, 2, 3]}) - rhs = cudf.DataFrame({"join_index": ["b", "c", "d"], "data_y": [2, 3, 4]}) - - lhs["join_index"] = lhs["join_index"].astype("category") - rhs["join_index"] = rhs["join_index"].astype("category") - - lhs = lhs.set_index("join_index") - rhs = rhs.set_index("join_index") - - lhs_pd = lhs.to_pandas() - rhs_pd = rhs.to_pandas() - - lhs_pd.index = lhs_pd.index.astype("object") - rhs_pd.index = rhs_pd.index.astype("object") - - expect = lhs_pd.join(rhs_pd, how="outer") - expect.index = expect.index.astype("category") - got = lhs.join(rhs, how="outer") - - assert_eq(expect.index.sort_values(), got.index.sort_values()) - - -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_dataframe_join_mismatch_cats(how): - pdf1 = pd.DataFrame( - { - "join_col": ["a", "b", "c", "d", "e"], - "data_col_left": [10, 20, 30, 40, 50], - } - ) - pdf2 = pd.DataFrame( - {"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]} - ) - - pdf1["join_col"] = pdf1["join_col"].astype("category") - pdf2["join_col"] = pdf2["join_col"].astype("category") - - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - - gdf1 = gdf1.set_index("join_col") - gdf2 = gdf2.set_index("join_col") - - pdf1 = pdf1.set_index("join_col") - pdf2 = pdf2.set_index("join_col") - join_gdf = gdf1.join(gdf2, how=how, sort=True) - join_pdf = pdf1.join(pdf2, how=how) - - got = join_gdf.fillna(-1).to_pandas() - expect = join_pdf.fillna(-1) # note: cudf join doesn't mask NA - - # We yield a categorical here whereas pandas gives Object. 
- expect.index = expect.index.astype("category") - # cudf creates the columns in different order than pandas for right join - if how == "right": - got = got[["data_col_left", "data_col_right"]] - - expect.data_col_right = expect.data_col_right.astype(np.int64) - expect.data_col_left = expect.data_col_left.astype(np.int64) - - assert_join_results_equal(expect, got, how=how, check_categorical=False) - - @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None]) def test_dataframe_merge_on(on): rng = np.random.default_rng(seed=0) @@ -421,47 +208,21 @@ def test_dataframe_merge_order(): "pairs", [ ("", ""), - ("", "a"), - ("", "ab"), ("", "abc"), - ("", "b"), - ("", "bcd"), - ("", "cde"), ("a", "a"), - ("a", "ab"), - ("a", "abc"), - ("a", "b"), - ("a", "bcd"), - ("a", "cde"), - ("ab", "ab"), - ("ab", "abc"), - ("ab", "b"), - ("ab", "bcd"), - ("ab", "cde"), - ("abc", "abc"), - ("abc", "b"), - ("abc", "bcd"), - ("abc", "cde"), - ("b", "b"), - ("b", "bcd"), - ("b", "cde"), - ("bcd", "bcd"), - ("bcd", "cde"), - ("cde", "cde"), ], ) -@pytest.mark.parametrize("max", [5, 1000]) -@pytest.mark.parametrize("rows", [1, 5, 100]) -@pytest.mark.parametrize("how", ["left", "inner", "outer"]) -def test_dataframe_pairs_of_triples(pairs, max, rows, how): +def test_dataframe_pairs_of_triples(pairs, how): + if how in {"leftsemi", "leftanti"}: + pytest.skip(f"{how} not implemented in pandas") rng = np.random.default_rng(seed=0) pdf_left = pd.DataFrame() pdf_right = pd.DataFrame() for left_column in pairs[0]: - pdf_left[left_column] = rng.integers(0, max, rows) + pdf_left[left_column] = rng.integers(0, 10, 10) for right_column in pairs[1]: - pdf_right[right_column] = rng.integers(0, max, rows) + pdf_right[right_column] = rng.integers(0, 10, 10) gdf_left = cudf.from_pandas(pdf_left) gdf_right = cudf.from_pandas(pdf_right) if not set(pdf_left.columns).intersection(pdf_right.columns): @@ -517,10 +278,12 @@ def test_safe_merging_with_left_empty(): assert len(pdf_result) == len(gdf_result) -@pytest.mark.parametrize("how", ["left", "inner", "outer"]) @pytest.mark.parametrize("left_empty", [True, False]) @pytest.mark.parametrize("right_empty", [True, False]) def test_empty_joins(how, left_empty, right_empty): + if how in {"leftsemi", "leftanti"}: + pytest.skip(f"{how} not implemented in pandas") + pdf = pd.DataFrame({"x": [1, 2, 3]}) if left_empty: @@ -676,24 +439,23 @@ def test_merge_left_right_index_left_right_on_kwargs2(kwargs): assert gd_merge.empty +@pytest.mark.parametrize("how", ["inner", "outer", "left"]) @pytest.mark.parametrize( - "hows", [{"how": "inner"}, {"how": "left"}, {"how": "outer"}] -) -@pytest.mark.parametrize( - "ons", + "on", [ - {"on": "a"}, - {"on": ["a", "b"]}, - {"on": ["b", "a"]}, - {"on": ["a", "aa", "b"]}, - {"on": ["b", "a", "aa"]}, + "a", + ["a", "b"], + ["b", "a"], + ["a", "aa", "b"], + ["b", "a", "aa"], ], ) -def test_merge_sort(ons, hows): - kwargs = {} - kwargs.update(hows) - kwargs.update(ons) - kwargs["sort"] = True +def test_merge_sort(on, how): + kwargs = { + "sort": True, + "how": how, + "on": on, + } a = [4, 6, 9, 5, 2, 4, 1, 8, 1] b = [9, 8, 7, 8, 3, 9, 7, 9, 2] aa = [8, 9, 2, 9, 3, 1, 2, 3, 4] @@ -761,27 +523,6 @@ def test_merge_sort_on_indexes(kwargs): assert gd_merge["a"].is_monotonic_increasing -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_join_datetimes_index(dtype): - datetimes = pd.Series(pd.date_range("20010101", "20010102", freq="12h")) - pdf_lhs = pd.DataFrame(index=[1, 0, 1, 2, 0, 0, 
1]) - pdf_rhs = pd.DataFrame({"d": datetimes}) - gdf_lhs = cudf.from_pandas(pdf_lhs) - gdf_rhs = cudf.from_pandas(pdf_rhs) - - gdf_rhs["d"] = gdf_rhs["d"].astype(dtype) - - pdf = pdf_lhs.join(pdf_rhs, sort=True) - gdf = gdf_lhs.join(gdf_rhs, sort=True) - - assert gdf["d"].dtype == cudf.dtype(dtype) - - assert_join_results_equal(pdf, gdf, how="inner", check_dtype=False) - - def test_join_with_different_names(): left = pd.DataFrame({"a": [0, 1, 2.0, 3, 4, 5, 9]}) right = pd.DataFrame({"b": [12, 5, 3, 9.0, 5], "c": [1, 2, 3, 4, 5.0]}) @@ -812,67 +553,6 @@ def test_join_empty_table_dtype(): assert_eq(pd_merge["a"].dtype, gd_merge["a"].dtype) -@pytest.mark.parametrize("how", ["outer", "inner", "left", "right"]) -@pytest.mark.parametrize( - "column_a", - [ - ( - pd.Series([None, 1, 2, 3, 4, 5, 6, 7], dtype=np.float64), - pd.Series([8, 9, 10, 11, 12, None, 14, 15], dtype=np.float64), - ) - ], -) -@pytest.mark.parametrize( - "column_b", - [ - ( - pd.Series([0, 1, 0, None, 1, 0, 0, 0], dtype=np.float64), - pd.Series([None, 1, 2, 1, 2, 2, 0, 0], dtype=np.float64), - ) - ], -) -@pytest.mark.parametrize( - "column_c", - [ - ( - pd.Series(["dog", "cat", "fish", "bug"] * 2), - pd.Series(["bird", "cat", "mouse", "snake"] * 2), - ), - ( - pd.Series(["dog", "cat", "fish", "bug"] * 2).astype("category"), - pd.Series(["bird", "cat", "mouse", "snake"] * 2).astype( - "category" - ), - ), - ], -) -def test_join_multi(how, column_a, column_b, column_c): - index = ["b", "c"] - df1 = pd.DataFrame() - df1["a1"] = column_a[0] - df1["b"] = column_b[0] - df1["c"] = column_c[0] - df1 = df1.set_index(index) - gdf1 = cudf.from_pandas(df1) - - df2 = pd.DataFrame() - df2["a2"] = column_a[1] - df2["b"] = column_b[1] - df2["c"] = column_c[1] - df2 = df2.set_index(index) - gdf2 = cudf.from_pandas(df2) - - gdf_result = gdf1.join(gdf2, how=how, sort=True) - pdf_result = df1.join(df2, how=how, sort=True) - - # Make sure columns are in the same order - columns = pdf_result.columns.values - gdf_result = gdf_result[columns] - pdf_result = pdf_result[columns] - - assert_join_results_equal(pdf_result, gdf_result, how="inner") - - @pytest.mark.parametrize( "kwargs", [ @@ -962,18 +642,25 @@ def test_merge_multi(kwargs): assert_join_results_equal(expect, got, how="left") -@pytest.mark.parametrize("dtype_l", INTEGER_TYPES) -@pytest.mark.parametrize("dtype_r", INTEGER_TYPES) -def test_typecast_on_join_int_to_int(dtype_l, dtype_r): +@pytest.fixture +def integer_types_as_str2(integer_types_as_str): + return integer_types_as_str + + +def test_typecast_on_join_int_to_int( + integer_types_as_str, integer_types_as_str2 +): other_data = ["a", "b", "c"] - join_data_l = cudf.Series([1, 2, 3], dtype=dtype_l) - join_data_r = cudf.Series([1, 2, 4], dtype=dtype_r) + join_data_l = cudf.Series([1, 2, 3], dtype=integer_types_as_str) + join_data_r = cudf.Series([1, 2, 4], dtype=integer_types_as_str2) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = find_common_type((np.dtype(dtype_l), np.dtype(dtype_r))) + exp_dtype = find_common_type( + (np.dtype(integer_types_as_str), np.dtype(integer_types_as_str2)) + ) exp_join_data = [1, 2] exp_other_data = ["a", "b"] @@ -992,20 +679,29 @@ def test_typecast_on_join_int_to_int(dtype_l, dtype_r): assert_join_results_equal(expect, got, how="inner") -@pytest.mark.parametrize("dtype_l", ["float32", "float64"]) -@pytest.mark.parametrize("dtype_r", ["float32", "float64"]) -def test_typecast_on_join_float_to_float(dtype_l, 
dtype_r): +@pytest.fixture +def float_types_as_str2(float_types_as_str): + return float_types_as_str + + +def test_typecast_on_join_float_to_float( + float_types_as_str, float_types_as_str2 +): other_data = ["a", "b", "c", "d", "e", "f"] - join_data_l = cudf.Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) - join_data_r = cudf.Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) + join_data_l = cudf.Series([1, 2, 3, 0.9, 4.5, 6], dtype=float_types_as_str) + join_data_r = cudf.Series( + [1, 2, 3, 0.9, 4.5, 7], dtype=float_types_as_str2 + ) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = find_common_type((np.dtype(dtype_l), np.dtype(dtype_r))) + exp_dtype = find_common_type( + (np.dtype(float_types_as_str), np.dtype(float_types_as_str2)) + ) - if dtype_l != dtype_r: + if float_types_as_str != float_types_as_str2: exp_join_data = [1, 2, 3, 4.5] exp_other_data = ["a", "b", "c", "e"] else: @@ -1027,24 +723,37 @@ def test_typecast_on_join_float_to_float(dtype_l, dtype_r): assert_join_results_equal(expect, got, how="inner") -@pytest.mark.parametrize("dtype_l", NUMERIC_TYPES) -@pytest.mark.parametrize("dtype_r", NUMERIC_TYPES) -def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r): +@pytest.fixture +def numeric_types_as_str2(numeric_types_as_str): + return numeric_types_as_str + + +def test_typecast_on_join_mixed_int_float( + numeric_types_as_str, numeric_types_as_str2 +): if ( - ("int" in dtype_l or "long" in dtype_l) - and ("int" in dtype_r or "long" in dtype_r) - ) or ("float" in dtype_l and "float" in dtype_r): + ("int" in numeric_types_as_str or "long" in numeric_types_as_str) + and ("int" in numeric_types_as_str2 or "long" in numeric_types_as_str2) + ) or ( + "float" in numeric_types_as_str and "float" in numeric_types_as_str2 + ): pytest.skip("like types not tested in this function") other_data = ["a", "b", "c", "d", "e", "f"] - join_data_l = cudf.Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) - join_data_r = cudf.Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) + join_data_l = cudf.Series( + [1, 2, 3, 0.9, 4.5, 6], dtype=numeric_types_as_str + ) + join_data_r = cudf.Series( + [1, 2, 3, 0.9, 4.5, 7], dtype=numeric_types_as_str2 + ) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = find_common_type((np.dtype(dtype_l), np.dtype(dtype_r))) + exp_dtype = find_common_type( + (np.dtype(numeric_types_as_str), np.dtype(numeric_types_as_str2)) + ) exp_join_data = [1, 2, 3] exp_other_data = ["a", "b", "c"] @@ -1275,27 +984,28 @@ def test_mixed_decimal_typecast(dtype_l, dtype_r): gdf_l.merge(gdf_r, on="join_col", how="inner") -@pytest.mark.parametrize( - "dtype_l", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -@pytest.mark.parametrize( - "dtype_r", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r): +@pytest.fixture +def datetime_types_as_str2(datetime_types_as_str): + return datetime_types_as_str + + +def test_typecast_on_join_dt_to_dt( + datetime_types_as_str, datetime_types_as_str2 +): other_data = ["a", "b", "c", "d", "e"] join_data_l = cudf.Series( ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01", "2019-08-15"] - ).astype(dtype_l) + ).astype(datetime_types_as_str) join_data_r = cudf.Series( ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01", "2019-08-16"] - ).astype(dtype_r) + 
).astype(datetime_types_as_str2) gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_dtype = max(np.dtype(dtype_l), np.dtype(dtype_r)) + exp_dtype = max( + np.dtype(datetime_types_as_str), np.dtype(datetime_types_as_str2) + ) exp_join_data = ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01"] exp_other_data = ["a", "b", "c", "d"] @@ -1552,725 +1262,6 @@ def test_categorical_typecast_outer_one_cat(dtype): assert result["key"].dtype == left["key"].dtype.categories.dtype -@pytest.mark.parametrize( - ("lhs", "rhs"), - [ - (["a", "b"], ["a"]), - (["a"], ["a", "b"]), - (["a", "b"], ["b"]), - (["b"], ["a", "b"]), - (["a"], ["a"]), - ], -) -@pytest.mark.parametrize("how", ["left", "right", "outer", "inner"]) -@pytest.mark.parametrize("level", ["a", "b", 0, 1]) -def test_index_join(lhs, rhs, how, level): - l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_pdf = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4]}) - l_df = cudf.from_pandas(l_pdf) - r_df = cudf.from_pandas(r_pdf) - p_lhs = l_pdf.set_index(lhs).index - p_rhs = r_pdf.set_index(rhs).index - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - - expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) - got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - - assert_join_results_equal(expected, got, how=how) - - -def test_index_join_corner_cases(): - l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_pdf = pd.DataFrame( - {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} - ) - l_df = cudf.from_pandas(l_pdf) - r_df = cudf.from_pandas(r_pdf) - - # Join when column name doesn't match with level - lhs = ["a", "b"] - # level and rhs don't match - rhs = ["c"] - level = "b" - how = "outer" - p_lhs = l_pdf.set_index(lhs).index - p_rhs = r_pdf.set_index(rhs).index - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) - got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - - assert_join_results_equal(expected, got, how=how) - - # sort is supported only in case of two non-MultiIndex join - # Join when column name doesn't match with level - lhs = ["a"] - # level and rhs don't match - rhs = ["a"] - level = "b" - how = "left" - p_lhs = l_pdf.set_index(lhs).index - p_rhs = r_pdf.set_index(rhs).index - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - expected = p_lhs.join(p_rhs, how=how, sort=True) - got = g_lhs.join(g_rhs, how=how, sort=True) - - assert_join_results_equal(expected, got, how=how) - - # Pandas Index.join on categorical column returns generic column - # but cudf will be returning a categorical column itself. 
- lhs = ["a", "b"] - rhs = ["a"] - level = "a" - how = "inner" - l_df["a"] = l_df["a"].astype("category") - r_df["a"] = r_df["a"].astype("category") - p_lhs = l_pdf.set_index(lhs).index - p_rhs = r_pdf.set_index(rhs).index - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) - got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - - got["a"] = got["a"].astype(expected["a"].dtype) - - assert_join_results_equal(expected, got, how=how) - - -def test_index_join_exception_cases(): - l_df = cudf.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_df = cudf.DataFrame( - {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} - ) - - # Join between two MultiIndex - lhs = ["a", "b"] - rhs = ["a", "c"] - level = "a" - how = "outer" - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - - with pytest.raises(TypeError): - g_lhs.join(g_rhs, level=level, how=how) - - # Improper level value, level should be an int or scalar value - level = ["a"] - rhs = ["a"] - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - with pytest.raises(ValueError): - g_lhs.join(g_rhs, level=level, how=how) - - -def test_typecast_on_join_indexes(): - join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype="int8") - join_data_r = cudf.Series([1, 2, 3, 4, 6], dtype="int32") - other_data = ["a", "b", "c", "d", "e"] - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - gdf_l = gdf_l.set_index("join_col") - gdf_r = gdf_r.set_index("join_col") - - exp_join_data = [1, 2, 3, 4] - exp_other_data = ["a", "b", "c", "d"] - - expect = cudf.DataFrame( - { - "join_col": exp_join_data, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - expect = expect.set_index("join_col") - - got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - - assert_join_results_equal(expect, got, how="inner") - - -def test_typecast_on_join_multiindices(): - join_data_l_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int8") - join_data_l_1 = cudf.Series([2, 3, 4.1, 5.9, 6], dtype="float32") - join_data_l_2 = cudf.Series([7, 8, 9, 0, 1], dtype="float32") - - join_data_r_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int32") - join_data_r_1 = cudf.Series([2, 3, 4, 5, 6], dtype="int32") - join_data_r_2 = cudf.Series([7, 8, 9, 0, 0], dtype="float64") - - other_data = ["a", "b", "c", "d", "e"] - - gdf_l = cudf.DataFrame( - { - "join_col_0": join_data_l_0, - "join_col_1": join_data_l_1, - "join_col_2": join_data_l_2, - "B": other_data, - } - ) - gdf_r = cudf.DataFrame( - { - "join_col_0": join_data_r_0, - "join_col_1": join_data_r_1, - "join_col_2": join_data_r_2, - "B": other_data, - } - ) - - gdf_l = gdf_l.set_index(["join_col_0", "join_col_1", "join_col_2"]) - gdf_r = gdf_r.set_index(["join_col_0", "join_col_1", "join_col_2"]) - - exp_join_data_0 = cudf.Series([1, 2], dtype="int32") - exp_join_data_1 = cudf.Series([2, 3], dtype="float64") - exp_join_data_2 = cudf.Series([7, 8], dtype="float64") - exp_other_data = cudf.Series(["a", "b"]) - - expect = cudf.DataFrame( - { - "join_col_0": exp_join_data_0, - "join_col_1": exp_join_data_1, - "join_col_2": exp_join_data_2, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - expect = expect.set_index(["join_col_0", "join_col_1", "join_col_2"]) - got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - - assert_join_results_equal(expect, got, how="inner") - - -def 
test_typecast_on_join_indexes_matching_categorical(): - join_data_l = cudf.Series(["a", "b", "c", "d", "e"], dtype="category") - join_data_r = cudf.Series(["a", "b", "c", "d", "e"], dtype="str") - other_data = [1, 2, 3, 4, 5] - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - gdf_l = gdf_l.set_index("join_col") - gdf_r = gdf_r.set_index("join_col") - - exp_join_data = ["a", "b", "c", "d", "e"] - exp_other_data = [1, 2, 3, 4, 5] - - expect = cudf.DataFrame( - { - "join_col": exp_join_data, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - expect = expect.set_index("join_col") - got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - - assert_join_results_equal(expect, got, how="inner") - - -@pytest.mark.parametrize( - "lhs", - [ - lambda: cudf.Series([1, 2, 3], name="a"), - lambda: cudf.DataFrame({"a": [2, 3, 4], "c": [4, 5, 6]}), - ], -) -@pytest.mark.parametrize( - "rhs", - [ - lambda: cudf.Series([1, 2, 3], name="b"), - lambda: cudf.DataFrame({"b": [2, 3, 4], "c": [4, 5, 6]}), - ], -) -@pytest.mark.parametrize( - "how", ["left", "inner", "outer", "leftanti", "leftsemi"] -) -@pytest.mark.parametrize( - "kwargs", - [ - {"left_on": "a", "right_on": "b"}, - {"left_index": True, "right_on": "b"}, - {"left_on": "a", "right_index": True}, - {"left_index": True, "right_index": True}, - ], -) -def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): - if how in ("leftsemi", "leftanti") and ( - kwargs.get("left_index") or kwargs.get("right_index") - ): - pytest.skip("Index joins not compatible with leftsemi and leftanti") - - lhs = lhs() - rhs = rhs() - check_lhs = lhs.copy() - check_rhs = rhs.copy() - if isinstance(lhs, cudf.Series): - check_lhs = lhs.to_frame() - if isinstance(rhs, cudf.Series): - check_rhs = rhs.to_frame() - - expect = cudf.merge(check_lhs, check_rhs, how=how, **kwargs) - got = cudf.merge(lhs, rhs, how=how, **kwargs) - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.xfail(reason="Cannot sort values of list dtype") -@pytest.mark.parametrize( - "how", ["left", "inner", "right", "leftanti", "leftsemi"] -) -def test_merge_with_lists(how): - pd_left = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [[1, 2, 3], [4, 5], None, [6], [7, 8, None], []], - "c": ["a", "b", "c", "d", "e", "f"], - } - ) - pd_right = pd.DataFrame( - { - "a": [4, 3, 2, 1, 0, -1], - "d": [[[1, 2], None], [], [[3, 4]], None, [[5], [6, 7]], [[8]]], - } - ) - - gd_left = cudf.from_pandas(pd_left) - gd_right = cudf.from_pandas(pd_right) - - expect = pd_left.merge(pd_right, on="a") - got = gd_left.merge(gd_right, on="a") - - assert_join_results_equal(expect, got, how=how) - - -def test_join_renamed_index(): - df = cudf.DataFrame( - {0: [1, 2, 3, 4, 5], 1: [1, 2, 3, 4, 5], "c": [1, 2, 3, 4, 5]} - ).set_index([0, 1]) - df.index.names = ["a", "b"] # doesn't actually change df._index._data - - expect = df.to_pandas().merge( - df.to_pandas(), left_index=True, right_index=True - ) - got = df.merge(df, left_index=True, right_index=True, how="inner") - assert_join_results_equal(expect, got, how="inner") - - -@pytest.mark.parametrize( - "lhs_col, lhs_idx, rhs_col, rhs_idx, on", - [ - (["A", "B"], "L0", ["B", "C"], "L0", ["B"]), - (["A", "B"], "L0", ["B", "C"], "L0", ["L0"]), - (["A", "B"], "L0", ["B", "C"], "L0", ["B", "L0"]), - (["A", "B"], "L0", ["C", "L0"], "A", ["A"]), - (["A", "B"], "L0", ["C", "L0"], "A", ["L0"]), - (["A", "B"], "L0", ["C", "L0"], "A", ["A", "L0"]), - ], -) 
-@pytest.mark.parametrize( - "how", ["left", "inner", "right", "outer", "leftanti", "leftsemi"] -) -def test_join_merge_with_on(lhs_col, lhs_idx, rhs_col, rhs_idx, on, how): - lhs_data = {col_name: [4, 5, 6] for col_name in lhs_col} - lhs_index = cudf.Index([0, 1, 2], name=lhs_idx) - - rhs_data = {col_name: [4, 5, 6] for col_name in rhs_col} - rhs_index = cudf.Index([2, 3, 4], name=rhs_idx) - - gd_left = cudf.DataFrame(lhs_data, lhs_index) - gd_right = cudf.DataFrame(rhs_data, rhs_index) - pd_left = gd_left.to_pandas() - pd_right = gd_right.to_pandas() - - expect = pd_left.merge(pd_right, on=on).sort_index(axis=1, ascending=False) - got = gd_left.merge(gd_right, on=on).sort_index(axis=1, ascending=False) - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "on", - ["A", "L0"], -) -@pytest.mark.parametrize( - "how", ["left", "inner", "right", "outer", "leftanti", "leftsemi"] -) -def test_join_merge_invalid_keys(on, how): - gd_left = cudf.DataFrame( - {"A": [1, 2, 3], "B": [4, 5, 6]}, index=cudf.Index([0, 1, 2], name="C") - ) - gd_right = cudf.DataFrame( - {"D": [2, 3, 4], "E": [7, 8, 0]}, index=cudf.Index([0, 2, 4], name="F") - ) - pd_left = gd_left.to_pandas() - pd_right = gd_right.to_pandas() - - with pytest.raises(KeyError): - pd_left.merge(pd_right, on=on) - gd_left.merge(gd_right, on=on) - - -@pytest.mark.parametrize( - "str_data", - [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]], -) -@pytest.mark.parametrize("num_keys", [1, 2, 3]) -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_key(str_data, num_keys, how): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_keys): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - if len(other_data) == 0: - pdf["a"] = pdf["a"].astype("str") - pdf2 = pdf.copy() - gdf2 = gdf.copy() - - expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) - got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] # reorder columns - - if how == "right": - got = got[expect.columns] # reorder columns - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "str_data_nulls", - [ - ["a", "b", "c"], - ["a", "b", "f", "g"], - ["f", "g", "h", "i", "j"], - ["f", "g", "h"], - [None, None, None, None, None], - [], - ], -) -def test_string_join_key_nulls(str_data_nulls): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["key"] = pd.Series(str_data, dtype="str") - gdf["key"] = cudf.Series(str_data, dtype="str") - pdf["vals"] = other_data - gdf["vals"] = other_data - - pdf2 = pd.DataFrame() - gdf2 = cudf.DataFrame() - pdf2["key"] = pd.Series(str_data_nulls, dtype="str") - gdf2["key"] = cudf.Series(str_data_nulls, dtype="str") - pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64") - - expect = pdf.merge(pdf2, on="key", how="left") - got = gdf.merge(gdf2, on="key", how="left") - got["vals_y"] = got["vals_y"].fillna(-1) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64") 
- - assert_join_results_equal(expect, got, how="left") - - -@pytest.mark.parametrize( - "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] -) -@pytest.mark.parametrize("num_cols", [1, 2, 3]) -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_non_key(str_data, num_cols, how): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_cols): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - if len(other_data) == 0: - pdf["a"] = pdf["a"].astype("str") - - pdf2 = pdf.copy() - gdf2 = gdf.copy() - - expect = pdf.merge(pdf2, on=["a"], how=how) - got = gdf.merge(gdf2, on=["a"], how=how) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - if how == "right": - got = got[expect.columns] # reorder columns - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "str_data_nulls", - [ - ["a", "b", "c"], - ["a", "b", "f", "g"], - ["f", "g", "h", "i", "j"], - ["f", "g", "h"], - [None, None, None, None, None], - [], - ], -) -def test_string_join_non_key_nulls(str_data_nulls): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["vals"] = pd.Series(str_data, dtype="str") - gdf["vals"] = cudf.Series(str_data, dtype="str") - pdf["key"] = other_data - gdf["key"] = other_data - - pdf2 = pd.DataFrame() - gdf2 = cudf.DataFrame() - pdf2["vals"] = pd.Series(str_data_nulls, dtype="str") - gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str") - pdf2["key"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64") - - expect = pdf.merge(pdf2, on="key", how="left") - got = gdf.merge(gdf2, on="key", how="left") - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - assert_join_results_equal(expect, got, how="left") - - -def test_string_join_values_nulls(): - left_dict = [ - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "LEFT NO MATCH 1", "a": -1.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "LEFT NO MATCH 2", "a": -2.0}, - {"b": "MATCH 3", "a": 3.0}, - {"b": "MATCH 3", "a": 3.0}, - ] - - right_dict = [ - {"b": "RIGHT NO MATCH 1", "c": -1.0}, - {"b": "MATCH 3", "c": 3.0}, - {"b": "MATCH 2", "c": 2.0}, - {"b": "RIGHT NO MATCH 2", "c": -2.0}, - {"b": "RIGHT NO MATCH 3", "c": -3.0}, - {"b": "MATCH 1", "c": 1.0}, - ] - - left_pdf = pd.DataFrame(left_dict) - right_pdf = pd.DataFrame(right_dict) - - left_gdf = cudf.DataFrame.from_pandas(left_pdf) - right_gdf = cudf.DataFrame.from_pandas(right_pdf) - - expect = left_pdf.merge(right_pdf, how="left", on="b") - got = left_gdf.merge(right_gdf, how="left", on="b") - - expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - - assert_join_results_equal(expect, got, how="left") - - -@pytest.mark.parametrize( - "left_on,right_on", - [ - *product(["a", "b", "c"], ["a", "b"]), - *zip(combinations(["a", "b", "c"], 2), repeat(["a", "b"])), - ], -) -def test_merge_mixed_index_columns(left_on, 
right_on): - left = pd.DataFrame({"a": [1, 2, 1, 2], "b": [2, 3, 3, 4]}).set_index("a") - right = pd.DataFrame({"a": [1, 2, 1, 3], "b": [2, 30, 3, 4]}).set_index( - "a" - ) - - left["c"] = 10 - - expect = left.merge(right, left_on=left_on, right_on=right_on, how="outer") - cleft = cudf.from_pandas(left) - cright = cudf.from_pandas(right) - got = cleft.merge(cright, left_on=left_on, right_on=right_on, how="outer") - assert_join_results_equal(expect, got, how="outer") - - -def test_merge_multiindex_columns(): - lhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) - lhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) - rhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) - rhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "z")]) - expect = lhs.merge(rhs, on=[("a", "x")], how="inner") - - lhs = cudf.from_pandas(lhs) - rhs = cudf.from_pandas(rhs) - got = lhs.merge(rhs, on=[("a", "x")], how="inner") - - assert_join_results_equal(expect, got, how="inner") - - -def test_join_multiindex_empty(): - lhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}, index=["a", "b", "c"]) - lhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) - rhs = pd.DataFrame(index=["a", "c", "d"]) - g_lhs = cudf.from_pandas(lhs) - g_rhs = cudf.from_pandas(rhs) - assert_exceptions_equal( - lfunc=lhs.join, - rfunc=g_lhs.join, - lfunc_args_and_kwargs=([rhs], {"how": "inner"}), - rfunc_args_and_kwargs=([g_rhs], {"how": "inner"}), - check_exception_type=False, - ) - - -def test_join_on_index_with_duplicate_names(): - # although index levels with duplicate names are poorly supported - # overall, we *should* be able to join on them: - lhs = pd.DataFrame({"a": [1, 2, 3]}) - rhs = pd.DataFrame({"b": [1, 2, 3]}) - lhs.index = pd.MultiIndex.from_tuples( - [(1, 1), (1, 2), (2, 1)], names=["x", "x"] - ) - rhs.index = pd.MultiIndex.from_tuples( - [(1, 1), (1, 3), (2, 1)], names=["x", "x"] - ) - expect = lhs.join(rhs, how="inner") - - lhs = cudf.from_pandas(lhs) - rhs = cudf.from_pandas(rhs) - got = lhs.join(rhs, how="inner") - - assert_join_results_equal(expect, got, how="inner") - - -def test_join_redundant_params(): - lhs = cudf.DataFrame( - {"a": [1, 2, 3], "c": [2, 3, 4]}, index=cudf.Index([0, 1, 2], name="c") - ) - rhs = cudf.DataFrame( - {"b": [1, 2, 3]}, index=cudf.Index([0, 1, 2], name="a") - ) - with pytest.raises(ValueError): - lhs.merge(rhs, on="a", left_index=True) - with pytest.raises(ValueError): - lhs.merge(rhs, left_on="a", left_index=True, right_index=True) - with pytest.raises(ValueError): - lhs.merge(rhs, right_on="a", left_index=True, right_index=True) - with pytest.raises(ValueError): - lhs.merge(rhs, left_on="c", right_on="b") - - -def test_join_multiindex_index(): - # test joining a MultiIndex with an Index with overlapping name - lhs = ( - cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}) - .set_index(["a", "b"]) - .index - ) - rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index - expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner") - got = lhs.join(rhs, how="inner") - assert_join_results_equal(expect, got, how="inner") - - -def test_dataframe_join_on(): - """Verify that specifying the on parameter gives a NotImplementedError.""" - df = cudf.DataFrame({"a": [1, 2, 3]}) - with pytest.raises(NotImplementedError): - df.join(df, on="a") - - -def test_index_join_return_indexers_notimplemented(): - index = cudf.RangeIndex(start=0, stop=20, step=2) - other = cudf.Index([4, 4, 3, 3]) - with pytest.raises(NotImplementedError): - index.join(other, how="left", 
return_indexers=True) - - -@pytest.mark.parametrize("how", ["inner", "outer"]) -def test_index_join_names(request, how): - idx1 = cudf.Index([10, 1, 2, 4, 2, 1], name="a") - idx2 = cudf.Index([-10, 2, 3, 1, 2], name="b") - request.applymarker( - pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/57065", - ) - ) - pidx1 = idx1.to_pandas() - pidx2 = idx2.to_pandas() - - expected = pidx1.join(pidx2, how=how) - actual = idx1.join(idx2, how=how) - assert_join_results_equal(actual, expected, how=how) - - -@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) -def test_join_datetime_timedelta_error(dtype): - df1 = cudf.DataFrame({"a": cudf.Series([10, 20, 30], dtype=dtype)}) - df2 = df1.astype("int") - - with pytest.raises(TypeError): - df1.merge(df2) - - -@pytest.mark.parametrize("dtype1", TIMEDELTA_TYPES) -@pytest.mark.parametrize("dtype2", TIMEDELTA_TYPES) -def test_merge_timedelta_types(dtype1, dtype2): - df1 = cudf.DataFrame({"a": cudf.Series([10, 20, 30], dtype=dtype1)}) - df2 = cudf.DataFrame({"a": cudf.Series([20, 500, 33240], dtype=dtype2)}) - - pdf1 = df1.to_pandas() - pdf2 = df2.to_pandas() - actual = df1.merge(df2) - expected = pdf1.merge(pdf2) - - # Pandas is materializing the index, which is unnecessary - # hence the special handling. - assert_eq( - actual, - expected, - check_index_type=False - if isinstance(actual.index, cudf.RangeIndex) - and isinstance(expected.index, pd.Index) - else True, - check_dtype=len(actual) > 0, - ) - - def test_merge_index_on_opposite_how_column_reset_index(): df = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 3, 5, 7, 9]) ser = pd.Series([1, 2], index=pd.Index([1, 2], name="a"), name="b") @@ -2319,59 +1310,41 @@ def test_merge_left_on_right_index_sort(): assert_eq(result, expected) -@pytest.mark.parametrize( - "left_data", - [ - {"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 5]}, - {"lkey": ["foo", "bar", "baz", "foo"], "value": [5, 3, 2, 1]}, - { - "lkey": ["foo", "bar", "baz", "foo"], - "value": [5, 3, 2, 1], - "extra_left": [1, 2, 3, 4], - }, - ], -) -@pytest.mark.parametrize( - "right_data", - [ - {"rkey": ["foo", "bar", "baz", "foo"], "value": [5, 6, 7, 8]}, - {"rkey": ["foo", "bar", "baz", "foo"], "value": [8, 7, 6, 5]}, - { - "rkey": ["foo", "bar", "baz", "foo"], - "value": [8, 7, 6, 5], - "extra_right": [10, 2, 30, 4], - }, - ], -) -@pytest.mark.parametrize("sort", [True, False]) -def test_cross_join_overlapping(left_data, right_data, sort): - df1 = cudf.DataFrame(left_data) - df2 = cudf.DataFrame(right_data) +def test_merge_renamed_index(): + df = cudf.DataFrame( + {0: [1, 2, 3, 4, 5], 1: [1, 2, 3, 4, 5], "c": [1, 2, 3, 4, 5]} + ).set_index([0, 1]) + df.index.names = ["a", "b"] # doesn't actually change df._index._data - pdf1 = df1.to_pandas() - pdf2 = df2.to_pandas() - expected = pdf1.join( - pdf2, how="cross", lsuffix="_x", rsuffix="_y", sort=sort + expect = df.to_pandas().merge( + df.to_pandas(), left_index=True, right_index=True ) - result = df1.join(df2, how="cross", lsuffix="_x", rsuffix="_y", sort=sort) - assert_eq(result, expected) + got = df.merge(df, left_index=True, right_index=True, how="inner") + assert_join_results_equal(expect, got, how="inner") -@pytest.mark.parametrize( - "suffixes", - [ - ("_left", "_right"), - ("", "_right"), - ("_left", ""), - ], -) -def test_cross_merge(suffixes): - df1 = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - pdf1 = df1.to_pandas() +def test_merge_redundant_params(): + lhs = cudf.DataFrame( + {"a": [1, 2, 3], "c": [2, 3, 4]}, 
index=cudf.Index([0, 1, 2], name="c") + ) + rhs = cudf.DataFrame( + {"b": [1, 2, 3]}, index=cudf.Index([0, 1, 2], name="a") + ) + with pytest.raises(ValueError): + lhs.merge(rhs, on="a", left_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, left_on="a", left_index=True, right_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, right_on="a", left_index=True, right_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, left_on="c", right_on="b") - df2 = cudf.DataFrame({"a": [11, 12, 13], "d": [40, 50, 60]}) - pdf2 = df2.to_pandas() - expected = pdf1.merge(pdf2, how="cross", suffixes=suffixes) - result = df1.merge(df2, how="cross", suffixes=suffixes) - assert_eq(result, expected) +def test_merge_datetime_timedelta_error(temporal_types_as_str): + df1 = cudf.DataFrame( + {"a": cudf.Series([10, 20, 30], dtype=temporal_types_as_str)} + ) + df2 = df1.astype("int") + + with pytest.raises(TypeError): + df1.merge(df2) From 28613267714446c29a28183522fea86db24cae1c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 6 Aug 2025 19:16:24 -0700 Subject: [PATCH 079/366] Compile `libcudf_kafka` and `cudf_kafka` with C++20 (#19543) Continuation of #19065, hopefully the last one. Discovered that the kafka components are not compiled using C++20, unlike the rest of libcudf/cuDF. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19543 --- cpp/libcudf_kafka/CMakeLists.txt | 11 ++++++----- cpp/libcudf_kafka/tests/CMakeLists.txt | 4 +++- python/cudf_kafka/CMakeLists.txt | 7 +++++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 06e03bb84d4..84fe208b4a0 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -29,6 +29,10 @@ project( # Set a default build type if none was specified rapids_cmake_build_type(Release) +# Set C++ standard globally to ensure all targets use C++20 +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + # For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the # version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with # gcc>=14. We can remove this once we upgrade to a newer sccache version. @@ -83,11 +87,8 @@ if(TARGET conda_env) endif() set_target_properties( - cudf_kafka - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" # set target compile options - CXX_STANDARD 20 - CXX_STANDARD_REQUIRED ON + cudf_kafka PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH + "\$ORIGIN" # set target compile options ) add_library(cudf_kafka::cudf_kafka ALIAS cudf_kafka) diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index b819cb6fc3b..05913aae110 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -28,6 +28,8 @@ function(ConfigureTest test_name) ${test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON ) target_link_libraries( ${test_name} PRIVATE GTest::gmock GTest::gmock_main GTest::gtest_main cudf_kafka diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt index dc352e0772a..07d436a0761 100644 --- a/python/cudf_kafka/CMakeLists.txt +++ b/python/cudf_kafka/CMakeLists.txt @@ -22,6 +22,13 @@ project( LANGUAGES CXX ) +# Set C++ standard to match other cudf Python packages +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Disable CMake's automatic module scanning for C++ files due to sccache bug with gcc>=14 +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + find_package(cudf_kafka "${RAPIDS_VERSION}" REQUIRED) if(NOT cudf_kafka_FOUND) From d1a00b19c7ea7c72940140415ce2bb0cb40ee0eb Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 7 Aug 2025 09:46:57 -0400 Subject: [PATCH 080/366] Additional gtests error checks for string/timestamp convert libcudf APIs (#19562) Converts generic `cudf::logic_error` exceptions to more appropriate `std::invalid_argument` exceptions and adds additional gtests for those cases. Also update the doxygen as appropriate. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19562 --- .../cudf/strings/convert/convert_datetime.hpp | 18 +++++++------ cpp/src/strings/convert/convert_datetime.cu | 25 +++++++++++-------- cpp/tests/strings/datetime_tests.cpp | 16 +++++++----- 3 files changed, 35 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index 04eba83925d..874cb9aa69e 100644 --- a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ namespace strings { * @brief Returns a new timestamp column converting a strings column into * timestamps using the provided format pattern. * - * The format pattern can include the following specifiers: "%Y,%y,%m,%d,%H,%I,%p,%M,%S,%f,%z" + * The format pattern can include the following specifiers: * * | Specifier | Description | * | :-------: | ----------- | @@ -94,7 +94,7 @@ std::unique_ptr to_timestamps( * @brief Verifies the given strings column can be parsed to timestamps using the provided format * pattern. * - * The format pattern can include the following specifiers: "%Y,%y,%m,%d,%H,%I,%p,%M,%S,%f,%z" + * The format pattern can include the following specifiers: * * | Specifier | Description | * | :-------: | ----------- | @@ -125,6 +125,9 @@ std::unique_ptr to_timestamps( * This will return a column of type BOOL8 where a `true` row indicates the corresponding * input string can be parsed correctly with the given format. 
* + * @throw std::invalid_argument if the `format` string is empty + * @throw std::invalid_argument if a specifier is not supported + * * @param input Strings instance for this operation * @param format String specifying the timestamp format in strings * @param stream CUDA stream used for device memory operations and kernel launches @@ -141,7 +144,7 @@ std::unique_ptr is_timestamp( * @brief Returns a new strings column converting a timestamp column into * strings using the provided format pattern. * - * The format pattern can include the following specifiers: "%Y,%y,%m,%d,%H,%I,%p,%M,%S,%f,%z,%Z" + * The format pattern can include the following specifiers: * * | Specifier | Description | * | :-------: | ----------- | @@ -230,9 +233,10 @@ std::unique_ptr is_timestamp( * } * @endcode * - * @throw cudf::logic_error if `timestamps` column parameter is not a timestamp type. - * @throw cudf::logic_error if the `format` string is empty - * @throw cudf::logic_error if `names.size()` is an invalid size. Must be 0 or 40 strings. + * @throw std::invalid_argument if `timestamps` column parameter is not a timestamp type. + * @throw std::invalid_argument if the `format` string is empty + * @throw std::invalid_argument if `names.size()` is an invalid size. Must be 0 or 40 strings. + * @throw std::invalid_argument if a specifier is not supported * * @param timestamps Timestamp values to convert * @param format The string specifying output format. diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index d3564d62efe..965b71e4a08 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -135,7 +135,7 @@ struct format_compiler { items.push_back(format_item::new_literal(ch)); continue; } - CUDF_EXPECTS(length > 0, "Unfinished specifier in timestamp format"); + CUDF_EXPECTS(length > 0, "Unfinished specifier in timestamp format", std::invalid_argument); ch = *str++; length--; @@ -145,7 +145,9 @@ struct format_compiler { continue; } if (ch >= '0' && ch <= '9') { - CUDF_EXPECTS(*str == 'f', "precision not supported for specifier: " + std::string(1, *str)); + CUDF_EXPECTS(*str == 'f', + "precision not supported for specifier: " + std::string(1, *str), + std::invalid_argument); specifiers[*str] = static_cast(ch - '0'); ch = *str++; length--; @@ -153,7 +155,8 @@ struct format_compiler { // check if the specifier found is supported CUDF_EXPECTS(specifiers.find(ch) != specifiers.end(), - "invalid format specifier: " + std::string(1, ch)); + "invalid format specifier: " + std::string(1, ch), + std::invalid_argument); // create the format item for this specifier items.push_back(format_item::new_specifier(ch, specifiers[ch])); @@ -428,7 +431,7 @@ struct dispatch_to_timestamps_fn { rmm::cuda_stream_view) const requires(not cudf::is_timestamp()) { - CUDF_FAIL("Only timestamps type are expected"); + CUDF_FAIL("Only timestamps type are expected", std::invalid_argument); } }; @@ -441,10 +444,9 @@ std::unique_ptr to_timestamps(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (input.is_empty()) - return make_empty_column(timestamp_type); // make_timestamp_column(timestamp_type, 0); + if (input.is_empty()) { return make_empty_column(timestamp_type); } - CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); + CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty.", std::invalid_argument); auto d_strings = 
column_device_view::create(input.parent(), stream); @@ -682,7 +684,7 @@ std::unique_ptr is_timestamp(strings_column_view const& input, size_type strings_count = input.size(); if (strings_count == 0) return make_empty_column(type_id::BOOL8); - CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); + CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty.", std::invalid_argument); auto d_strings = column_device_view::create(input.parent(), stream); @@ -1123,7 +1125,7 @@ struct dispatch_from_timestamps_fn { strings_children operator()(Args&&...) const requires(not cudf::is_timestamp()) { - CUDF_FAIL("Only timestamps type are expected"); + CUDF_FAIL("Only timestamps type are expected", std::invalid_argument); } }; @@ -1138,9 +1140,10 @@ std::unique_ptr from_timestamps(column_view const& timestamps, { if (timestamps.is_empty()) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); + CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty.", std::invalid_argument); CUDF_EXPECTS(names.is_empty() || names.size() == format_names_size, - "Invalid size for format names."); + "Invalid size for format names.", + std::invalid_argument); auto const d_names = column_device_view::create(names.parent(), stream); diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index 6a5f95a318e..5192e0e5269 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -625,18 +625,22 @@ TEST_F(StringsDatetimeTest, Errors) cudf::logic_error); EXPECT_THROW( cudf::strings::to_timestamps(view, cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}, ""), - cudf::logic_error); + std::invalid_argument); EXPECT_THROW( cudf::strings::to_timestamps(view, cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}, "%2Y"), - cudf::logic_error); + std::invalid_argument); EXPECT_THROW( cudf::strings::to_timestamps(view, cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}, "%g"), - cudf::logic_error); + std::invalid_argument); cudf::test::fixed_width_column_wrapper invalid_timestamps{1530705600}; - EXPECT_THROW(cudf::strings::from_timestamps(invalid_timestamps), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_timestamps(invalid_timestamps), std::invalid_argument); cudf::test::fixed_width_column_wrapper timestamps{ 1530705600}; - EXPECT_THROW(cudf::strings::from_timestamps(timestamps, ""), cudf::logic_error); - EXPECT_THROW(cudf::strings::from_timestamps(timestamps, "%A %B", view), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_timestamps(timestamps, ""), std::invalid_argument); + EXPECT_THROW(cudf::strings::from_timestamps(timestamps, "%B", view), std::invalid_argument); + + EXPECT_THROW(cudf::strings::is_timestamp(view, "%D"), std::invalid_argument); + EXPECT_THROW(cudf::strings::is_timestamp(view, "%p %"), std::invalid_argument); + EXPECT_THROW(cudf::strings::from_timestamps(timestamps, "%Y:%H", view), std::invalid_argument); } From 0456b16ae8e86d759c17b2c9b503fd4453ca7d8d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 7 Aug 2025 11:14:22 -0400 Subject: [PATCH 081/366] Replace sprintf with std::format in libcudf parquet tests (#19364) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes some build failures from the nightly tests https://github.com/rapidsai/cudf/actions/runs/16258662170/job/45899486553 ``` │ │ $SRC_DIR/cpp/tests/io/parquet_common.cpp:507:26: error: '%09d' 
directive writing between 9 and 11 bytes into a region of size 10 [-Werror=format-overflow=] │ │ 507 | sprintf(buf.data(), "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i)); ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19364 --- .../io/experimental/hybrid_scan_common.cpp | 11 ++-- cpp/tests/io/parquet_common.cpp | 24 ++++----- cpp/tests/io/parquet_v2_test.cpp | 54 +++++++------------ 3 files changed, 33 insertions(+), 56 deletions(-) diff --git a/cpp/tests/io/experimental/hybrid_scan_common.cpp b/cpp/tests/io/experimental/hybrid_scan_common.cpp index 1224d9d4988..c17541b3321 100644 --- a/cpp/tests/io/experimental/hybrid_scan_common.cpp +++ b/cpp/tests/io/experimental/hybrid_scan_common.cpp @@ -25,6 +25,9 @@ #include #include +#include +#include + cudf::host_span fetch_footer_bytes(cudf::host_span buffer) { using namespace cudf::io::parquet; @@ -87,11 +90,7 @@ cudf::test::strings_column_wrapper constant_strings(cudf::size_type value) { CUDF_EXPECTS(value >= 0 && value <= 9999, "String value must be between 0000 and 9999"); - auto elements = - thrust::make_transform_iterator(thrust::make_constant_iterator(value), [](auto i) { - std::array buf{}; - snprintf(buf.data(), buf.size(), "%04d", i); - return std::string(buf.data()); - }); + auto elements = thrust::make_transform_iterator(thrust::make_constant_iterator(value), + [](auto i) { return std::format("{:04d}", i); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index aa0f5fafe4e..f0be469e5ed 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -18,6 +18,9 @@ #include +#include +#include + // Global environment for temporary files cudf::test::TempDirTestEnvironment* const temp_env = static_cast( @@ -478,11 +481,8 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> ascending() { - auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - std::array buf{}; - snprintf(buf.data(), buf.size(), "%09d", i); - return std::string(buf.data()); - }); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return std::format("{:09d}", i); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -490,11 +490,8 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> descending() { - auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - std::array buf{}; - snprintf(buf.data(), buf.size(), "%09d", static_cast(num_ordered_rows - i)); - return std::string(buf.data()); - }); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return std::format("{:09d}", static_cast(num_ordered_rows - i)); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } @@ -502,11 +499,8 @@ template std::enable_if_t, cudf::test::strings_column_wrapper> unordered() { - auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - std::array buf{}; - snprintf(buf.data(), buf.size(), "%09d", (i % 2 == 0) ? i : (num_ordered_rows - i)); - return std::string(buf.data()); - }); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return std::format("{:09d}", (i % 2 == 0) ? 
i : (num_ordered_rows - i)); }); return cudf::test::strings_column_wrapper(elements, elements + num_ordered_rows); } diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index b165b6bfa82..c30495c4e70 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -24,6 +24,8 @@ #include #include +#include +#include using cudf::test::iterators::no_nulls; @@ -694,12 +696,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; // fixed length strings - auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - std::array buf; - snprintf(buf.data(), buf.size(), "%012d", i); - return std::string(buf.data()); - }); - auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); + auto str1_elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return std::format("{:012d}", i); }); + auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); auto col1_data = random_values(num_rows); auto col2_data = random_values(num_rows); @@ -716,12 +715,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) auto col6 = cudf::test::fixed_width_column_wrapper(col6_data.begin(), col6_data.end()); // mixed length strings - auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - std::array buf; - snprintf(buf.data(), buf.size(), "%d", i); - return std::string(buf.data()); - }); - auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); + auto str2_elements = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return std::format("{}", i); }); + auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); auto const expected = table_view{{col0, col1, col2, col3, col4, col5, col6, col7}}; @@ -788,12 +784,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; // fixed length strings - auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - std::array buf; - snprintf(buf.data(), buf.size(), "%012d", i); - return std::string(buf.data()); - }); - auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); + auto str1_elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return std::format("{:012d}", i); }); + auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); auto col1_data = random_values(num_rows); auto col2_data = random_values(num_rows); @@ -820,11 +813,8 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::test::fixed_width_column_wrapper(col6_data.begin(), col6_data.end(), valids); // mixed length strings - auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - std::array buf; - snprintf(buf.data(), buf.size(), "%d", i); - return std::string(buf.data()); - }); + auto str2_elements = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return std::format("{}", i); }); auto col7 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows, valids); auto expected = table_view{{col0, col1, col2, col3, col4, col5, col6, col7}}; @@ -898,12 +888,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) is_v2 ? 
cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; // fixed length strings - auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - std::array buf; - snprintf(buf.data(), buf.size(), "%012d", i); - return std::string(buf.data()); - }); - auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); + auto str1_elements = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return std::format("{:012d}", i); }); + auto col0 = cudf::test::strings_column_wrapper(str1_elements, str1_elements + num_rows); auto col1_data = random_values(num_rows); auto col2_data = random_values(num_rows); @@ -915,12 +902,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) auto col2 = cudf::test::fixed_width_column_wrapper(col2_data.begin(), col2_data.end()); // mixed length strings - auto str2_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - std::array buf; - snprintf(buf.data(), buf.size(), "%d", i); - return std::string(buf.data()); - }); - auto col3 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); + auto str2_elements = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return std::format("{}", i); }); + auto col3 = cudf::test::strings_column_wrapper(str2_elements, str2_elements + num_rows); auto expected = table_view{{col0, col1, col2, col3}}; From 37ea851987264ac8f20855e07f3cd7ba6c78cb31 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 7 Aug 2025 15:55:18 -0400 Subject: [PATCH 082/366] Update rapids_config to handle user defined branch name (#19623) rapids_config will use a user defined branch over `RAPIDS_BRANCH` contents Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/19623 --- cmake/RAPIDS.cmake | 2 +- cmake/rapids_config.cmake | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake index 40de7cefcd2..ddef819498d 100644 --- a/cmake/RAPIDS.cmake +++ b/cmake/RAPIDS.cmake @@ -18,7 +18,7 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) # Allow users to control which version is used -if(NOT rapids-cmake-branch OR NOT rapids-cmake-version) +if(NOT (rapids-cmake-branch OR rapids-cmake-version)) message( FATAL_ERROR "The CMake variable `rapids-cmake-branch` or `rapids-cmake-version` must be defined" ) diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index b706c926e7a..b2c54a3f27d 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -35,6 +35,10 @@ if(NOT _rapids_branch) ) endif() -set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") -set(rapids-cmake-branch "${_rapids_branch}") +if(NOT rapids-cmake-version) + set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +endif() +if(NOT rapids-cmake-branch) + set(rapids-cmake-branch "${_rapids_branch}") +endif() include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") From 2ae747440d5e3f4227041e35a3fd2f03c299478b Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 7 Aug 2025 13:48:01 -0700 Subject: [PATCH 083/366] Add nvtx ranges to public APIs of the experimental parquet reader (#19618) Contributes to #19469 This PR moves the NVTX ranges from detail to public APIs of the experimental Parquet reader. 
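For reference, the change is mechanical: each public entry point now opens an
NVTX range and forwards to the implementation. A representative example,
excerpted from this diff:

```
[[nodiscard]] FileMetaData hybrid_scan_reader::parquet_metadata() const
{
  CUDF_FUNC_RANGE();  // NVTX range named after the enclosing function

  return _impl->parquet_metadata();
}
```

The matching `CUDF_FUNC_RANGE()` calls are removed from the
`hybrid_scan_reader_impl` methods below, so each public call is ranged exactly
once.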
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19618 --- .../io/parquet/experimental/hybrid_scan.cpp | 28 +++++++++++++++++++ .../experimental/hybrid_scan_helpers.cpp | 3 ++ .../parquet/experimental/hybrid_scan_impl.cpp | 19 ------------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan.cpp b/cpp/src/io/parquet/experimental/hybrid_scan.cpp index f14ff508561..692c77a6fa1 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan.cpp @@ -16,6 +16,7 @@ #include "hybrid_scan_impl.hpp" +#include #include #include @@ -33,22 +34,30 @@ hybrid_scan_reader::~hybrid_scan_reader() = default; [[nodiscard]] text::byte_range_info hybrid_scan_reader::page_index_byte_range() const { + CUDF_FUNC_RANGE(); + return _impl->page_index_byte_range(); } [[nodiscard]] FileMetaData hybrid_scan_reader::parquet_metadata() const { + CUDF_FUNC_RANGE(); + return _impl->parquet_metadata(); } void hybrid_scan_reader::setup_page_index(cudf::host_span page_index_bytes) const { + CUDF_FUNC_RANGE(); + return _impl->setup_page_index(page_index_bytes); } std::vector hybrid_scan_reader::all_row_groups( parquet_reader_options const& options) const { + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(options.get_row_groups().size() <= 1, "Encountered invalid size of row group indices in parquet reader options"); @@ -61,6 +70,8 @@ std::vector hybrid_scan_reader::all_row_groups( size_type hybrid_scan_reader::total_rows_in_row_groups( cudf::host_span row_group_indices) const { + CUDF_FUNC_RANGE(); + if (row_group_indices.empty()) { return 0; } auto const input_row_group_indices = @@ -73,6 +84,8 @@ std::vector hybrid_scan_reader::filter_row_groups_with_stats( parquet_reader_options const& options, rmm::cuda_stream_view stream) const { + CUDF_FUNC_RANGE(); + // Temporary vector with row group indices from the first source auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; @@ -84,6 +97,7 @@ std::pair, std::vector hybrid_scan_reader::secondary_filters_byte_ranges( cudf::host_span row_group_indices, parquet_reader_options const& options) const { + CUDF_FUNC_RANGE(); // Temporary vector with row group indices from the first source auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; @@ -97,6 +111,8 @@ std::vector hybrid_scan_reader::filter_row_groups_with_dictiona parquet_reader_options const& options, rmm::cuda_stream_view stream) const { + CUDF_FUNC_RANGE(); + // Temporary vector with row group indices from the first source auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; @@ -113,6 +129,8 @@ std::vector hybrid_scan_reader::filter_row_groups_with_bloom_fi parquet_reader_options const& options, rmm::cuda_stream_view stream) const { + CUDF_FUNC_RANGE(); + // Temporary vector with row group indices from the first source auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; @@ -129,6 +147,8 @@ std::unique_ptr hybrid_scan_reader::build_row_mask_with_page_index rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { + CUDF_FUNC_RANGE(); + // Temporary vector with row group indices from the first source auto const input_row_group_indices = 
std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; @@ -140,6 +160,8 @@ std::unique_ptr hybrid_scan_reader::build_row_mask_with_page_index hybrid_scan_reader::filter_column_chunks_byte_ranges( cudf::host_span row_group_indices, parquet_reader_options const& options) const { + CUDF_FUNC_RANGE(); + // Temporary vector with row group indices from the first source auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; @@ -155,6 +177,8 @@ table_with_metadata hybrid_scan_reader::materialize_filter_columns( parquet_reader_options const& options, rmm::cuda_stream_view stream) const { + CUDF_FUNC_RANGE(); + // Temporary vector with row group indices from the first source auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; @@ -171,6 +195,8 @@ table_with_metadata hybrid_scan_reader::materialize_filter_columns( hybrid_scan_reader::payload_column_chunks_byte_ranges( cudf::host_span row_group_indices, parquet_reader_options const& options) const { + CUDF_FUNC_RANGE(); + auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; @@ -185,6 +211,8 @@ table_with_metadata hybrid_scan_reader::materialize_payload_columns( parquet_reader_options const& options, rmm::cuda_stream_view stream) const { + CUDF_FUNC_RANGE(); + // Temporary vector with row group indices from the first source auto const input_row_group_indices = std::vector>{{row_group_indices.begin(), row_group_indices.end()}}; diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp index 3932e9bc15c..af4ce719786 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.cpp @@ -20,6 +20,7 @@ #include "io/parquet/reader_impl_helpers.hpp" #include "io/utilities/row_selection.hpp" +#include #include #include @@ -72,6 +73,8 @@ namespace { metadata::metadata(cudf::host_span footer_bytes) { + CUDF_FUNC_RANGE(); + CompactProtocolReader cp(footer_bytes.data(), footer_bytes.size()); cp.read(this); CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 354eb2adde1..5c15d7b6f08 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -20,7 +20,6 @@ #include "hybrid_scan_helpers.hpp" #include "io/parquet/reader_impl_chunking_utils.cuh" -#include #include #include #include @@ -169,8 +168,6 @@ std::vector> hybrid_scan_reader_impl::filter_row_groups_w parquet_reader_options const& options, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); - CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); @@ -195,8 +192,6 @@ hybrid_scan_reader_impl::secondary_filters_byte_ranges( cudf::host_span const> row_group_indices, parquet_reader_options const& options) { - CUDF_FUNC_RANGE(); - CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Filter expression must not be empty"); @@ -230,8 +225,6 @@ hybrid_scan_reader_impl::filter_row_groups_with_dictionary_pages( parquet_reader_options const& options, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); - 
CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); @@ -302,8 +295,6 @@ std::vector> hybrid_scan_reader_impl::filter_row_groups_w CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); - CUDF_FUNC_RANGE(); - select_columns(read_columns_mode::FILTER_COLUMNS, options); table_metadata metadata; @@ -331,8 +322,6 @@ std::unique_ptr hybrid_scan_reader_impl::build_row_mask_with_page_ CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); - CUDF_FUNC_RANGE(); - select_columns(read_columns_mode::FILTER_COLUMNS, options); table_metadata metadata; @@ -355,8 +344,6 @@ std::pair, std::vector> hybrid_scan_reader_impl::get_input_column_chunk_byte_ranges( cudf::host_span const> row_group_indices) const { - CUDF_FUNC_RANGE(); - // Descriptors for all the chunks that make up the selected columns auto const num_input_columns = _input_columns.size(); auto const num_row_groups = @@ -438,8 +425,6 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( CUDF_EXPECTS(not row_group_indices.empty(), "Empty input row group indices encountered"); CUDF_EXPECTS(options.get_filter().has_value(), "Encountered empty converted filter expression"); - CUDF_FUNC_RANGE(); - reset_internal_state(); table_metadata metadata; @@ -477,8 +462,6 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( CUDF_EXPECTS(row_mask.null_count() == 0, "Row mask must not have any nulls when materializing payload column"); - CUDF_FUNC_RANGE(); - reset_internal_state(); initialize_options(row_group_indices, options, stream); @@ -728,8 +711,6 @@ table_with_metadata hybrid_scan_reader_impl::finalize_output( void hybrid_scan_reader_impl::set_pass_page_mask( cudf::host_span const> data_page_mask) { - CUDF_FUNC_RANGE(); - auto const& pass = _pass_itm_data; auto const& chunks = pass->chunks; From a415e0abc8f9264dc7f222b894c686ef77d4b40f Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:09:12 -0400 Subject: [PATCH 084/366] Add fast path for Parquet reading with predicate pushdown via AST filters (#19605) Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/19605 --- python/cudf/benchmarks/API/bench_io.py | 37 ++++++ python/cudf/cudf/io/parquet.py | 175 +++++++++++++++++++++++-- 2 files changed, 202 insertions(+), 10 deletions(-) create mode 100644 python/cudf/benchmarks/API/bench_io.py diff --git a/python/cudf/benchmarks/API/bench_io.py b/python/cudf/benchmarks/API/bench_io.py new file mode 100644 index 00000000000..8a05c61ee47 --- /dev/null +++ b/python/cudf/benchmarks/API/bench_io.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
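+#
+# These benchmarks drive ``cudf.read_parquet`` with DNF-style ``filters``,
+# which this change teaches the cudf engine to translate into a libcudf
+# AST for predicate pushdown (see the ``python/cudf/cudf/io/parquet.py``
+# hunks below).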
+ +"""Benchmarks for IO operations.""" + +import os +from tempfile import TemporaryDirectory + +import pytest +from config import NUM_ROWS + +import cudf + + +@pytest.mark.parametrize("num_rows", NUM_ROWS) +def bench_read_parquet_with_filters(benchmark, num_rows): + df = cudf.DataFrame( + { + "x": cudf.Series(range(num_rows), dtype="int32"), + "y": cudf.Series(range(num_rows, 2 * num_rows), dtype="float64"), + "z": cudf.Series( + ["a", "b", "c", "d"] * (num_rows // 4), dtype="str" + ), + } + ) + + with TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "filtered.parquet") + df.to_parquet(path) + + threshold = num_rows // 2 + + filters = [ + [("x", ">", threshold), ("z", "in", ["a", "b"])], + [("y", "<", threshold), ("z", "not in", ["c"])], + ] + + benchmark(cudf.read_parquet, path, filters=filters) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 7bcb80ffbdc..25a38a4709e 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,6 +1,7 @@ # Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations +import datetime import io import itertools import math @@ -18,6 +19,7 @@ import pandas as pd import pylibcudf as plc +from pylibcudf import expressions as plc_expr from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock @@ -534,6 +536,140 @@ def write_to_dataset( return metadata +_BIN_OPS = { + "==": plc_expr.ASTOperator.EQUAL, + "!=": plc_expr.ASTOperator.NOT_EQUAL, + "<": plc_expr.ASTOperator.LESS, + "<=": plc_expr.ASTOperator.LESS_EQUAL, + ">": plc_expr.ASTOperator.GREATER, + ">=": plc_expr.ASTOperator.GREATER_EQUAL, +} + + +def _raise_for_unsupported_scalar_types(val: Any): + if not ( + isinstance(val, (int, float)) + or isinstance(val, (datetime.date, datetime.timedelta)) + or isinstance(val, str) + ): + return val + raise NotImplementedError( + "Only numeric, string, or timestamp/duration scalars are accepted" + ) + + +def make_literal(v: Any) -> plc_expr.Literal: + return plc_expr.Literal( + plc.Scalar.from_py(_raise_for_unsupported_scalar_types(v)) + ) + + +def make_expr(col: str, op: str, val: Any) -> plc_expr.Expression: + col_ref = plc_expr.ColumnNameReference(col) + + match op: + case op if op in _BIN_OPS: + return plc_expr.Operation( + _BIN_OPS[op], + col_ref, + make_literal(val), + ) + + case "in": + equal_ops = [ + plc_expr.Operation( + plc_expr.ASTOperator.EQUAL, col_ref, make_literal(v) + ) + for v in val + ] + return reduce( + partial(plc_expr.Operation, plc_expr.ASTOperator.LOGICAL_OR), + equal_ops, + ) + + case "not in": + not_equal_ops = [ + plc_expr.Operation( + plc_expr.ASTOperator.NOT_EQUAL, col_ref, make_literal(v) + ) + for v in val + ] + return reduce( + partial(plc_expr.Operation, plc_expr.ASTOperator.LOGICAL_AND), + not_equal_ops, + ) + + case "is": + if val is None: + return plc_expr.Operation( + plc_expr.ASTOperator.IS_NULL, col_ref + ) + raise NotImplementedError("Only `is None` supported") + + case "is not": + if val is None: + return plc_expr.Operation( + plc_expr.ASTOperator.NOT, + plc_expr.Operation(plc_expr.ASTOperator.IS_NULL, col_ref), + ) + raise NotImplementedError("Only `is not None` supported") + + case _: + raise NotImplementedError(f"Unsupported op: {op}") + + +def translate_filters_to_ast( + filters: list[list[tuple[str, str, Any]]], +) -> plc_expr.Expression: + """ + Convert a filter expression in Disjunctive Normal Form (DNF) to a pylibcudf AST. 
+ + Parameters + ---------- + filters : list[list[tuple[str, str, Any]]] + A filter expression in DNF. + DNF is represented as a list of OR-ed clauses, where each clause is a list of + AND-ed predicates. Each predicate is a tuple of the form: + (column_name, operator, value) + + Returns + ------- + plc.expressions.Expression + A pylibcudf AST expression representing the filter. + + Notes + ----- + This function supports the following operators: + - Binary comparisons: ==, !=, <, <=, >, >= + - Membership tests: "in", "not in" + - Null checks: "is None", "is not None" + + The translation is performed by recursively reducing: + - Each clause's predicates into a conjunction (AND) + - All clauses into a disjunction (OR) of those conjunctions + """ + + clauses = [ + reduce( + partial(plc_expr.Operation, plc_expr.ASTOperator.LOGICAL_AND), + (make_expr(*pred) for pred in clause), + ) + for clause in filters + ] + + if not clauses: + raise ValueError("Empty filter expression") + + return ( + reduce( + partial(plc_expr.Operation, plc_expr.ASTOperator.LOGICAL_OR), + clauses, + ) + if len(clauses) > 1 + else clauses[0] + ) + + def _parse_metadata(meta) -> tuple[bool, Any, None | np.dtype]: file_is_range_index = False file_index_cols = None @@ -824,13 +960,26 @@ def read_parquet( # Normalize and validate filters filters = _normalize_filters(filters) + # Attempt to translate filters to a libcudf AST + ast_filter = None + if ( + engine == "cudf" + and filters is not None + and categorical_partitions is None + and dataset_kwargs is None + ): + try: + ast_filter = translate_filters_to_ast(filters) + except NotImplementedError: + pass + # Use pyarrow dataset to detect/process directory-partitioned # data and apply filters. Note that we can only support partitioned # data and filtering if the input is a single directory or list of # paths. partition_keys = [] partition_categories = {} - if fs and paths: + if ast_filter is None and fs and paths: ( paths, row_groups, @@ -848,7 +997,7 @@ def read_parquet( # Prepare remote-IO options prefetch_options = kwargs.pop("prefetch_options", {}) - if not ioutils._is_local_filesystem(fs): + if ast_filter is None and not ioutils._is_local_filesystem(fs): # The default prefetch method depends on the # `row_groups` argument. In most cases we will use # method="all" by default, because it is fastest @@ -912,7 +1061,7 @@ def read_parquet( # will be dropped almost immediately after IO. However, # we do NEED these columns for accurate filtering. projected_columns = None - if columns and filters: + if ast_filter is None and columns and filters: projected_columns = columns columns = sorted( set(v[0] for v in itertools.chain.from_iterable(filters)) @@ -933,10 +1082,13 @@ def read_parquet( nrows=nrows, skip_rows=skip_rows, allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, + filters=ast_filter, **kwargs, ) # Apply filters row-wise (if any are defined), and return - df = _apply_post_filters(df, filters) + if ast_filter is None: + df = _apply_post_filters(df, filters) + if projected_columns: # Elements of `projected_columns` may now be in the index. 
# We must filter these names from our projection @@ -1169,6 +1321,7 @@ def _read_parquet( nrows: int | None = None, skip_rows: int | None = None, allow_mismatched_pq_schemas: bool = False, + filters: plc_expr.Expression | None = None, *args, **kwargs, ) -> DataFrame: @@ -1176,7 +1329,7 @@ def _read_parquet( # cudf and pyarrow to read parquet data if engine == "cudf": if set(kwargs.keys()).difference( - set(("_chunk_read_limit", "_pass_read_limit")) + set(("_chunk_read_limit", "_pass_read_limit", "filters")) ): raise ValueError( "cudf engine doesn't support the " @@ -1192,10 +1345,11 @@ def _read_parquet( if skip_rows is None: skip_rows = 0 if get_option("io.parquet.low_memory"): - # Note: If this function ever takes accepts filters - # allow_range_index needs to be False when a filter is passed - # (see read_parquet) - allow_range_index = columns is not None and len(columns) != 0 + # If filters are used, we can't rely on a RangeIndex + # (row count may be reduced) + allow_range_index = ( + filters is None and columns is not None and len(columns) != 0 + ) options = ( plc.io.parquet.ParquetReaderOptions.builder( @@ -1213,6 +1367,8 @@ def _read_parquet( options.set_skip_rows(skip_rows) if columns is not None: options.set_columns(columns) + if filters is not None: + options.set_filter(filters) reader = plc.io.parquet.ChunkedParquetReader( options, @@ -1261,7 +1417,6 @@ def _read_parquet( return df else: allow_range_index = True - filters = kwargs.get("filters", None) if columns is not None and len(columns) == 0 or filters: allow_range_index = False From 65e55a7b885b5d1660687ad6c1e143a48bb1fa69 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 8 Aug 2025 11:13:57 -0400 Subject: [PATCH 085/366] Fix cudf::sequence() to throw exception for invalid scalar inputs (#19612) Fix the `cudf::sequence()` logic to throw an exception if given scalar parameters set with `valid=false`. This mostly affected some benchmark data generation logic. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/19612 --- cpp/benchmarks/common/generate_input.cu | 29 +++++++++---------------- cpp/include/cudf/filling.hpp | 14 +++++++----- cpp/src/filling/sequence.cu | 15 ++++++++----- cpp/tests/filling/sequence_tests.cpp | 21 ++++++++++++++---- 4 files changed, 45 insertions(+), 34 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index d8e868a4ae8..03fe04fe5d6 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -640,14 +640,7 @@ std::unique_ptr create_distinct_rows_column(data_profile const& pr auto valid_dist = random_value_fn( distribution_params{1.
- profile.get_null_probability().value_or(0)}); - cudf::data_type const dtype = [&]() { - if constexpr (cudf::is_fixed_point()) - return cudf::data_type{cudf::type_to_id(), 0}; - else - return cudf::data_type{cudf::type_to_id()}; - }(); - - auto init = cudf::make_default_constructed_scalar(dtype); + auto init = cudf::make_fixed_width_scalar(T{}); auto col = cudf::sequence(num_rows, *init); rmm::device_uvector null_mask(0, cudf::get_default_stream()); @@ -698,9 +691,8 @@ template <> std::unique_ptr create_distinct_rows_column( data_profile const& profile, thrust::minstd_rand& engine, cudf::size_type num_rows) { - auto col = create_random_column(profile, engine, num_rows); - auto int_col = cudf::sequence( - num_rows, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32})); + auto col = create_random_column(profile, engine, num_rows); + auto int_col = cudf::sequence(num_rows, *cudf::make_fixed_width_scalar(0)); auto int2strcol = cudf::strings::from_integers(int_col->view()); auto concat_col = cudf::strings::concatenate(cudf::table_view({col->view(), int2strcol->view()})); return std::move(cudf::sample(cudf::table_view({concat_col->view()}), num_rows)->release()[0]); @@ -793,8 +785,7 @@ std::unique_ptr create_distinct_rows_column( auto const dist_params = profile.get_distribution_params(); auto col = create_random_column(profile, engine, num_rows); std::vector> children; - children.push_back(cudf::sequence( - num_rows, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32}))); + children.push_back(cudf::sequence(num_rows, *cudf::make_fixed_width_scalar(0))); for (int lvl = dist_params.max_depth; lvl > 1; --lvl) { std::vector> parents; parents.push_back( @@ -891,12 +882,11 @@ std::unique_ptr create_distinct_rows_column( { auto const dist_params = profile.get_distribution_params(); auto col = create_random_column(profile, engine, num_rows); - auto child_column = cudf::sequence( - num_rows, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32})); + auto zero = cudf::make_fixed_width_scalar(0); + auto child_column = cudf::sequence(num_rows, *zero); for (int lvl = dist_params.max_depth; lvl > 0; --lvl) { - auto offsets_column = cudf::sequence( - num_rows + 1, *cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT32})); - auto list_column = cudf::make_lists_column( + auto offsets_column = cudf::sequence(num_rows + 1, *zero); + auto list_column = cudf::make_lists_column( num_rows, std::move(offsets_column), std::move(child_column), 0, rmm::device_buffer{}); std::swap(child_column, list_column); } @@ -1021,7 +1011,8 @@ std::unique_ptr create_sequence_table(std::vector co auto columns = std::vector>(dtype_ids.size()); std::transform(dtype_ids.begin(), dtype_ids.end(), columns.begin(), [&](auto dtype) mutable { auto init = cudf::make_default_constructed_scalar(cudf::data_type{dtype}); - auto col = cudf::sequence(num_rows.count, *init); + init->set_valid_async(true); + auto col = cudf::sequence(num_rows.count, *init); auto [mask, count] = create_random_null_mask(num_rows.count, null_probability, seed_dist(seed_engine)); col->set_null_mask(std::move(mask), count); diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 15a21b44f3b..8650dc9b0d9 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -166,9 +166,10 @@ std::unique_ptr repeat( * step = 2 * return = [0, 2, 4] * ``` - * @throws cudf::logic_error if @p init and @p step are not the same type. - * @throws cudf::logic_error if scalar types are not numeric. - * @throws cudf::logic_error if @p size is < 0. + * + * @throws std::invalid_argument if @p init and @p step are not the same type. + * @throws std::invalid_argument if scalar types are not numeric or invalid + * @throws std::invalid_argument if @p size is < 0. * * @param size Size of the output column * @param init First value in the sequence @@ -196,8 +197,9 @@ std::unique_ptr sequence( * init = 0 * return = [0, 1, 2] * ``` - * @throws cudf::logic_error if @p init is not numeric. - * @throws cudf::logic_error if @p size is < 0. + * + * @throws std::invalid_argument if @p init is not numeric or invalid + * @throws std::invalid_argument if @p size is < 0 * * @param size Size of the output column * @param init First value in the sequence diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu index 2793f381002..76aca186094 100644 --- a/cpp/src/filling/sequence.cu +++ b/cpp/src/filling/sequence.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -117,7 +118,7 @@ struct sequence_functor { std::unique_ptr operator()(Args&&...) requires(not cudf::is_numeric() or cudf::is_boolean()) { - CUDF_FAIL("Unsupported sequence scalar type"); + CUDF_FAIL("Unsupported sequence scalar type", cudf::data_type_error); } }; @@ -132,8 +133,11 @@ std::unique_ptr sequence(size_type size, CUDF_EXPECTS(cudf::have_same_types(init, step), "init and step must be of the same type.", cudf::data_type_error); - CUDF_EXPECTS(size >= 0, "size must be >= 0"); - CUDF_EXPECTS(is_numeric(init.type()), "Input scalar types must be numeric"); + CUDF_EXPECTS(size >= 0, "size must be >= 0", std::invalid_argument); + CUDF_EXPECTS( + is_numeric(init.type()), "Input scalar types must be numeric", std::invalid_argument); + CUDF_EXPECTS(init.is_valid(stream), "init must be a valid scalar", std::invalid_argument); + CUDF_EXPECTS(step.is_valid(stream), "step must be a valid scalar", std::invalid_argument); return type_dispatcher(init.type(), sequence_functor{}, size, init, step, stream, mr); } @@ -143,8 +147,9 @@ std::unique_ptr sequence(size_type size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(size >= 0, "size must be >= 0"); - CUDF_EXPECTS(is_numeric(init.type()), "init scalar type must be numeric"); + CUDF_EXPECTS(size >= 0, "size must be >= 0", std::invalid_argument); + CUDF_EXPECTS(is_numeric(init.type()), "init scalar type must be numeric", cudf::data_type_error); + CUDF_EXPECTS(init.is_valid(stream), "init must be a valid scalar", std::invalid_argument); return type_dispatcher(init.type(), sequence_functor{}, size, init, stream, mr); } diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index 53782c90c26..cf49d9f6c04 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -84,15 +84,15 @@ TEST_F(SequenceTestFixture, BadTypes) { cudf::string_scalar string_init("zero"); cudf::string_scalar string_step("???"); - EXPECT_THROW(cudf::sequence(10, string_init, string_step), cudf::logic_error); + EXPECT_THROW(cudf::sequence(10, string_init, string_step), std::invalid_argument); cudf::numeric_scalar bool_init(true); cudf::numeric_scalar bool_step(false); - EXPECT_THROW(cudf::sequence(10, bool_init, bool_step), cudf::logic_error); + EXPECT_THROW(cudf::sequence(10, bool_init, bool_step), cudf::data_type_error); cudf::timestamp_scalar ts_init(cudf::duration_s{10}, true); cudf::timestamp_scalar ts_step(cudf::duration_s{10}, true); - EXPECT_THROW(cudf::sequence(10, ts_init, ts_step), cudf::logic_error); + EXPECT_THROW(cudf::sequence(10, ts_init, ts_step), std::invalid_argument); } TEST_F(SequenceTestFixture, MismatchedInputs) @@ -110,6 +110,19 @@ TEST_F(SequenceTestFixture, MismatchedInputs) EXPECT_THROW(cudf::sequence(10, init3, step3), cudf::data_type_error); } +TEST_F(SequenceTestFixture, InvalidInput) +{ + cudf::numeric_scalar init(0, false); + EXPECT_THROW(cudf::sequence(10, init), std::invalid_argument); + cudf::numeric_scalar step1(1); + EXPECT_THROW(cudf::sequence(10, init, step1), std::invalid_argument); + + cudf::numeric_scalar zero(0); + cudf::numeric_scalar step(1, false); + EXPECT_THROW(cudf::sequence(10, zero, step), std::invalid_argument); + EXPECT_THROW(cudf::sequence(10, init, step), std::invalid_argument); +} + TYPED_TEST(SequenceTypedTestFixture, DefaultStep) { using T = TypeParam; From 3ff177cfd7d7772dbaf38b3f3edbd9083f717e18 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 8 Aug 2025 12:47:47 -0400 Subject: [PATCH 086/366] Add PDS-DS queries 2 through 10 to cudf-polars benchmarks (#19488) Contributes to #19125 by adding PDS-DS queries 2 through 10. And to https://github.com/rapidsai/cudf/issues/19533 by removing the CPU collect calls in Q6 and Q9. 
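A minimal sketch of the lazy-query pattern used throughout (hypothetical
table path and aggregation, not one of the actual queries): each
`polars_impl` builds and returns a `pl.LazyFrame`, leaving collection to
the benchmark runner rather than calling `.collect()` inside the query
body.

```python
import polars as pl


def polars_impl(run_config) -> pl.LazyFrame:
    # Build the query lazily; the benchmark runner decides when and on
    # which engine to collect, so no eager CPU collect happens here.
    store_sales = pl.scan_parquet("store_sales.parquet")  # hypothetical path
    return store_sales.group_by("ss_store_sk").agg(
        pl.col("ss_net_profit").sum().alias("profit")
    )
```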
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/19488 --- .../benchmarks/pdsds_queries/q10.py | 225 ++++++++ .../benchmarks/pdsds_queries/q2.py | 244 ++++++++ .../benchmarks/pdsds_queries/q3.py | 65 +++ .../benchmarks/pdsds_queries/q4.py | 359 ++++++++++++ .../benchmarks/pdsds_queries/q5.py | 462 +++++++++++++++ .../benchmarks/pdsds_queries/q6.py | 92 +++ .../benchmarks/pdsds_queries/q7.py | 79 +++ .../benchmarks/pdsds_queries/q8.py | 524 ++++++++++++++++++ .../benchmarks/pdsds_queries/q9.py | 137 +++++ 9 files changed, 2187 insertions(+) create mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q10.py create mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q2.py create mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q3.py create mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q4.py create mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q5.py create mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q6.py create mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q7.py create mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q8.py create mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q9.py diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q10.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q10.py new file mode 100644 index 00000000000..f4818511009 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q10.py @@ -0,0 +1,225 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 10.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 10.""" + return """ + SELECT cd_gender, + cd_marital_status, + cd_education_status, + Count(*) cnt1, + cd_purchase_estimate, + Count(*) cnt2, + cd_credit_rating, + Count(*) cnt3, + cd_dep_count, + Count(*) cnt4, + cd_dep_employed_count, + Count(*) cnt5, + cd_dep_college_count, + Count(*) cnt6 + FROM customer c, + customer_address ca, + customer_demographics + WHERE c.c_current_addr_sk = ca.ca_address_sk + AND ca_county IN ( 'Lycoming County', 'Sheridan County', + 'Kandiyohi County', + 'Pike County', + 'Greene County' ) + AND cd_demo_sk = c.c_current_cdemo_sk + AND EXISTS (SELECT * + FROM store_sales, + date_dim + WHERE c.c_customer_sk = ss_customer_sk + AND ss_sold_date_sk = d_date_sk + AND d_year = 2002 + AND d_moy BETWEEN 4 AND 4 + 3) + AND ( EXISTS (SELECT * + FROM web_sales, + date_dim + WHERE c.c_customer_sk = ws_bill_customer_sk + AND ws_sold_date_sk = d_date_sk + AND d_year = 2002 + AND d_moy BETWEEN 4 AND 4 + 3) + OR EXISTS (SELECT * + FROM catalog_sales, + date_dim + WHERE c.c_customer_sk = cs_ship_customer_sk + AND cs_sold_date_sk = d_date_sk + AND d_year = 2002 + AND d_moy BETWEEN 4 AND 4 + 3) ) + GROUP BY cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count + ORDER BY cd_gender, + cd_marital_status, + cd_education_status, + cd_purchase_estimate, + cd_credit_rating, + cd_dep_count, + cd_dep_employed_count, + cd_dep_college_count + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 10.""" + # Load required tables + customer = get_data(run_config.dataset_path, "customer", run_config.suffix) + customer_address = get_data( + run_config.dataset_path, "customer_address", run_config.suffix + ) + customer_demographics = get_data( + run_config.dataset_path, "customer_demographics", run_config.suffix + ) + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + + # Target counties and date range + target_counties = [ + "Lycoming County", + "Sheridan County", + "Kandiyohi County", + "Pike County", + "Greene County", + ] + + # Get customers with store sales in the target period (EXISTS condition 1) + store_customers = ( + store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .filter( + (pl.col("d_year") == 2002) + & (pl.col("d_moy").is_between(4, 7, closed="both")) + ) + .select("ss_customer_sk") + .unique() + ) + + # Get customers with web sales in the target period (EXISTS condition 2a) + web_customers = ( + web_sales.join(date_dim, left_on="ws_sold_date_sk", right_on="d_date_sk") + .filter( + (pl.col("d_year") == 2002) + & (pl.col("d_moy").is_between(4, 7, closed="both")) + ) + .select(pl.col("ws_bill_customer_sk").alias("customer_sk")) + .unique() + ) + + # Get customers with catalog sales in the target period (EXISTS condition 2b) + catalog_customers = ( + 
catalog_sales.join(date_dim, left_on="cs_sold_date_sk", right_on="d_date_sk") + .filter( + (pl.col("d_year") == 2002) + & (pl.col("d_moy").is_between(4, 7, closed="both")) + ) + .select(pl.col("cs_ship_customer_sk").alias("customer_sk")) + .unique() + ) + + # Combine web and catalog customers (OR condition) + web_or_catalog_customers = pl.concat([web_customers, catalog_customers]).unique() + + # Main query: join customer tables and apply filters + return ( + customer.join( + customer_address, left_on="c_current_addr_sk", right_on="ca_address_sk" + ) + .join( + customer_demographics, left_on="c_current_cdemo_sk", right_on="cd_demo_sk" + ) + .filter(pl.col("ca_county").is_in(target_counties)) + # Apply EXISTS conditions through joins + .join( + store_customers, + left_on="c_customer_sk", + right_on="ss_customer_sk", + how="inner", + ) + .join( + web_or_catalog_customers, + left_on="c_customer_sk", + right_on="customer_sk", + how="inner", + ) + .group_by( + [ + "cd_gender", + "cd_marital_status", + "cd_education_status", + "cd_purchase_estimate", + "cd_credit_rating", + "cd_dep_count", + "cd_dep_employed_count", + "cd_dep_college_count", + ] + ) + .agg( + [ + # Cast -> Int64 to match DuckDB + # TODO: We should plan to make these optional + pl.len().alias("cnt1").cast(pl.Int64), + pl.len().alias("cnt2").cast(pl.Int64), + pl.len().alias("cnt3").cast(pl.Int64), + pl.len().alias("cnt4").cast(pl.Int64), + pl.len().alias("cnt5").cast(pl.Int64), + pl.len().alias("cnt6").cast(pl.Int64), + ] + ) + .sort( + [ + "cd_gender", + "cd_marital_status", + "cd_education_status", + "cd_purchase_estimate", + "cd_credit_rating", + "cd_dep_count", + "cd_dep_employed_count", + "cd_dep_college_count", + ], + nulls_last=True, + ) + .limit(100) + .select( + [ + "cd_gender", + "cd_marital_status", + "cd_education_status", + "cnt1", + "cd_purchase_estimate", + "cnt2", + "cd_credit_rating", + "cnt3", + "cd_dep_count", + "cnt4", + "cd_dep_employed_count", + "cnt5", + "cd_dep_college_count", + "cnt6", + ] + ) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q2.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q2.py new file mode 100644 index 00000000000..76fe4c05571 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q2.py @@ -0,0 +1,244 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 2.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 2.""" + return """ + WITH wscs + AS (SELECT sold_date_sk, + sales_price + FROM (SELECT ws_sold_date_sk sold_date_sk, + ws_ext_sales_price sales_price + FROM web_sales) + UNION ALL + (SELECT cs_sold_date_sk sold_date_sk, + cs_ext_sales_price sales_price + FROM catalog_sales)), + wswscs + AS (SELECT d_week_seq, + Sum(CASE + WHEN ( d_day_name = 'Sunday' ) THEN sales_price + ELSE NULL + END) sun_sales, + Sum(CASE + WHEN ( d_day_name = 'Monday' ) THEN sales_price + ELSE NULL + END) mon_sales, + Sum(CASE + WHEN ( d_day_name = 'Tuesday' ) THEN sales_price + ELSE NULL + END) tue_sales, + Sum(CASE + WHEN ( d_day_name = 'Wednesday' ) THEN sales_price + ELSE NULL + END) wed_sales, + Sum(CASE + WHEN ( d_day_name = 'Thursday' ) THEN sales_price + ELSE NULL + END) thu_sales, + Sum(CASE + WHEN ( d_day_name = 'Friday' ) THEN sales_price + ELSE NULL + END) fri_sales, + Sum(CASE + WHEN ( d_day_name = 'Saturday' ) THEN sales_price + ELSE NULL + END) sat_sales + FROM wscs, + date_dim + WHERE d_date_sk = sold_date_sk + GROUP BY d_week_seq) + SELECT d_week_seq1, + Round(sun_sales1 / sun_sales2, 2), + Round(mon_sales1 / mon_sales2, 2), + Round(tue_sales1 / tue_sales2, 2), + Round(wed_sales1 / wed_sales2, 2), + Round(thu_sales1 / thu_sales2, 2), + Round(fri_sales1 / fri_sales2, 2), + Round(sat_sales1 / sat_sales2, 2) + FROM (SELECT wswscs.d_week_seq d_week_seq1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 + FROM wswscs, + date_dim + WHERE date_dim.d_week_seq = wswscs.d_week_seq + AND d_year = 1998) y, + (SELECT wswscs.d_week_seq d_week_seq2, + sun_sales sun_sales2, + mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + FROM wswscs, + date_dim + WHERE date_dim.d_week_seq = wswscs.d_week_seq + AND d_year = 1998 + 1) z + WHERE d_week_seq1 = d_week_seq2 - 53 + ORDER BY d_week_seq1; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 2.""" + # Load required tables + web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + # Step 1: Create wscs CTE equivalent (union of web and catalog sales) + wscs = pl.concat( + [ + web_sales.select( + [ + pl.col("ws_sold_date_sk").alias("sold_date_sk"), + pl.col("ws_ext_sales_price").alias("sales_price"), + ] + ), + catalog_sales.select( + [ + pl.col("cs_sold_date_sk").alias("sold_date_sk"), + pl.col("cs_ext_sales_price").alias("sales_price"), + ] + ), + ] + ) + # Step 2: Create wswscs CTE equivalent (aggregate by week and day of week) + # First join with date_dim to get day names + wscs_with_dates = wscs.join(date_dim, left_on="sold_date_sk", right_on="d_date_sk") + # Create separate aggregations for each day to better control null handling + days = ( + "Sunday", + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + ) + day_cols = ( + "sun_sales", + "mon_sales", + 
"tue_sales", + "wed_sales", + "thu_sales", + "fri_sales", + "sat_sales", + ) + # Start with all week sequences + all_weeks = wscs_with_dates.select("d_week_seq").unique() + wswscs = all_weeks + + wswscs = ( + wscs_with_dates.with_columns( + [ + pl.when(pl.col("d_day_name") == day) + .then(pl.col("sales_price")) + .otherwise(None) + .alias(name) + for day, name in zip(days, day_cols, strict=True) + ] + ) + .group_by("d_week_seq") + .agg( + *(pl.col(name).sum().alias(name) for name in day_cols), + *(pl.col(name).count().alias(f"{name}_count") for name in day_cols), + ) + .with_columns( + [ + pl.when(pl.col(f"{name}_count") > 0) + .then(pl.col(name)) + .otherwise(None) + .alias(name) + for name in day_cols + ] + ) + .select(["d_week_seq", *day_cols]) + ) + + # Step 3: Create year 1998 data (y subquery equivalent) + y_1998 = ( + wswscs.join(date_dim, left_on="d_week_seq", right_on="d_week_seq") + .filter(pl.col("d_year") == 1998) + .select( + [ + pl.col("d_week_seq").alias("d_week_seq1"), + pl.col("sun_sales").alias("sun_sales1"), + pl.col("mon_sales").alias("mon_sales1"), + pl.col("tue_sales").alias("tue_sales1"), + pl.col("wed_sales").alias("wed_sales1"), + pl.col("thu_sales").alias("thu_sales1"), + pl.col("fri_sales").alias("fri_sales1"), + pl.col("sat_sales").alias("sat_sales1"), + ] + ) + ) + # Step 4: Create year 1999 data (z subquery equivalent) + z_1999 = ( + wswscs.join(date_dim, left_on="d_week_seq", right_on="d_week_seq") + .filter(pl.col("d_year") == 1999) + .select( + [ + pl.col("d_week_seq").alias("d_week_seq2"), + pl.col("sun_sales").alias("sun_sales2"), + pl.col("mon_sales").alias("mon_sales2"), + pl.col("tue_sales").alias("tue_sales2"), + pl.col("wed_sales").alias("wed_sales2"), + pl.col("thu_sales").alias("thu_sales2"), + pl.col("fri_sales").alias("fri_sales2"), + pl.col("sat_sales").alias("sat_sales2"), + ] + ) + ) + # Step 5: Join the two years and calculate ratios + return ( + y_1998.join(z_1999, left_on="d_week_seq1", right_on=pl.col("d_week_seq2") - 53) + .select( + [ + pl.col("d_week_seq1"), + (pl.col("sun_sales1") / pl.col("sun_sales2")) + .round(2) + .alias("round((sun_sales1 / sun_sales2), 2)"), + (pl.col("mon_sales1") / pl.col("mon_sales2")) + .round(2) + .alias("round((mon_sales1 / mon_sales2), 2)"), + (pl.col("tue_sales1") / pl.col("tue_sales2")) + .round(2) + .alias("round((tue_sales1 / tue_sales2), 2)"), + (pl.col("wed_sales1") / pl.col("wed_sales2")) + .round(2) + .alias("round((wed_sales1 / wed_sales2), 2)"), + (pl.col("thu_sales1") / pl.col("thu_sales2")) + .round(2) + .alias("round((thu_sales1 / thu_sales2), 2)"), + (pl.col("fri_sales1") / pl.col("fri_sales2")) + .round(2) + .alias("round((fri_sales1 / fri_sales2), 2)"), + (pl.col("sat_sales1") / pl.col("sat_sales2")) + .round(2) + .alias("round((sat_sales1 / sat_sales2), 2)"), + ] + ) + .sort("d_week_seq1") + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q3.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q3.py new file mode 100644 index 00000000000..effb62d09ff --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q3.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 3.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 3.""" + return """ + SELECT dt.d_year, + item.i_brand_id brand_id, + item.i_brand brand, + Sum(ss_ext_discount_amt) sum_agg + FROM date_dim dt, + store_sales, + item + WHERE dt.d_date_sk = store_sales.ss_sold_date_sk + AND store_sales.ss_item_sk = item.i_item_sk + AND item.i_manufact_id = 427 + AND dt.d_moy = 11 + GROUP BY dt.d_year, + item.i_brand, + item.i_brand_id + ORDER BY dt.d_year, + sum_agg DESC, + brand_id + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 3.""" + # Load required tables + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + # Execute the query following the SQL logic + return ( + date_dim.join(store_sales, left_on="d_date_sk", right_on="ss_sold_date_sk") + .join(item, left_on="ss_item_sk", right_on="i_item_sk") + .filter((pl.col("i_manufact_id") == 427) & (pl.col("d_moy") == 11)) + .group_by(["d_year", "i_brand", "i_brand_id"]) + .agg([pl.col("ss_ext_discount_amt").sum().alias("sum_agg")]) + .select( + [ + pl.col("d_year"), + pl.col("i_brand_id").alias("brand_id"), + pl.col("i_brand").alias("brand"), + pl.col("sum_agg"), + ] + ) + .sort(["d_year", "sum_agg", "brand_id"], descending=[False, True, False]) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q4.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q4.py new file mode 100644 index 00000000000..357eb260c77 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q4.py @@ -0,0 +1,359 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 4.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 4.""" + return """ + WITH year_total + AS (SELECT c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + c_preferred_cust_flag customer_preferred_cust_flag + , + c_birth_country + customer_birth_country, + c_login customer_login, + c_email_address customer_email_address, + d_year dyear, + Sum(( ( ss_ext_list_price - ss_ext_wholesale_cost + - ss_ext_discount_amt + ) + + + ss_ext_sales_price ) / 2) year_total, + 's' sale_type + FROM customer, + store_sales, + date_dim + WHERE c_customer_sk = ss_customer_sk + AND ss_sold_date_sk = d_date_sk + GROUP BY c_customer_id, + c_first_name, + c_last_name, + c_preferred_cust_flag, + c_birth_country, + c_login, + c_email_address, + d_year + UNION ALL + SELECT c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + c_preferred_cust_flag + customer_preferred_cust_flag, + c_birth_country customer_birth_country + , + c_login + customer_login, + c_email_address customer_email_address + , + d_year dyear + , + Sum(( ( ( cs_ext_list_price + - cs_ext_wholesale_cost + - cs_ext_discount_amt + ) + + cs_ext_sales_price ) / 2 )) year_total, + 'c' sale_type + FROM customer, + catalog_sales, + date_dim + WHERE c_customer_sk = cs_bill_customer_sk + AND cs_sold_date_sk = d_date_sk + GROUP BY c_customer_id, + c_first_name, + c_last_name, + c_preferred_cust_flag, + c_birth_country, + c_login, + c_email_address, + d_year + UNION ALL + SELECT c_customer_id customer_id, + c_first_name customer_first_name, + c_last_name customer_last_name, + c_preferred_cust_flag + customer_preferred_cust_flag, + c_birth_country customer_birth_country + , + c_login + customer_login, + c_email_address customer_email_address + , + d_year dyear + , + Sum(( ( ( ws_ext_list_price + - ws_ext_wholesale_cost + - ws_ext_discount_amt + ) + + ws_ext_sales_price ) / 2 )) year_total, + 'w' sale_type + FROM customer, + web_sales, + date_dim + WHERE c_customer_sk = ws_bill_customer_sk + AND ws_sold_date_sk = d_date_sk + GROUP BY c_customer_id, + c_first_name, + c_last_name, + c_preferred_cust_flag, + c_birth_country, + c_login, + c_email_address, + d_year) + SELECT t_s_secyear.customer_id, + t_s_secyear.customer_first_name, + t_s_secyear.customer_last_name, + t_s_secyear.customer_preferred_cust_flag + FROM year_total t_s_firstyear, + year_total t_s_secyear, + year_total t_c_firstyear, + year_total t_c_secyear, + year_total t_w_firstyear, + year_total t_w_secyear + WHERE t_s_secyear.customer_id = t_s_firstyear.customer_id + AND t_s_firstyear.customer_id = t_c_secyear.customer_id + AND t_s_firstyear.customer_id = t_c_firstyear.customer_id + AND t_s_firstyear.customer_id = t_w_firstyear.customer_id + AND t_s_firstyear.customer_id = t_w_secyear.customer_id + AND t_s_firstyear.sale_type = 's' + AND t_c_firstyear.sale_type = 'c' + AND t_w_firstyear.sale_type = 'w' + AND t_s_secyear.sale_type = 's' + AND t_c_secyear.sale_type = 'c' + AND t_w_secyear.sale_type = 'w' + AND t_s_firstyear.dyear = 2001 + AND t_s_secyear.dyear = 2001 + 1 + AND t_c_firstyear.dyear = 2001 + AND t_c_secyear.dyear = 2001 + 1 + AND t_w_firstyear.dyear = 2001 + AND t_w_secyear.dyear = 2001 + 1 + 
AND t_s_firstyear.year_total > 0 + AND t_c_firstyear.year_total > 0 + AND t_w_firstyear.year_total > 0 + AND CASE + WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / + t_c_firstyear.year_total + ELSE NULL + END > CASE + WHEN t_s_firstyear.year_total > 0 THEN + t_s_secyear.year_total / + t_s_firstyear.year_total + ELSE NULL + END + AND CASE + WHEN t_c_firstyear.year_total > 0 THEN t_c_secyear.year_total / + t_c_firstyear.year_total + ELSE NULL + END > CASE + WHEN t_w_firstyear.year_total > 0 THEN + t_w_secyear.year_total / + t_w_firstyear.year_total + ELSE NULL + END + ORDER BY t_s_secyear.customer_id, + t_s_secyear.customer_first_name, + t_s_secyear.customer_last_name, + t_s_secyear.customer_preferred_cust_flag + LIMIT 100; + """ + + +def build_sales_subquery(  # noqa: D103 + sales_df: pl.LazyFrame, + date_df: pl.LazyFrame, + customer_df: pl.LazyFrame, + sold_date_key: str, + customer_key: str, + col_prefix: str, + *, + year_filter: bool = False, + include_customer_info: bool = False, +) -> pl.LazyFrame: + profit_expr = ( + ( + pl.col(f"{col_prefix}ext_list_price") + - pl.col(f"{col_prefix}ext_wholesale_cost") + - pl.col(f"{col_prefix}ext_discount_amt") + ) + + pl.col(f"{col_prefix}ext_sales_price") + ) / 2 + + df = ( + sales_df.join(date_df, left_on=sold_date_key, right_on="d_date_sk") + .join(customer_df, left_on=customer_key, right_on="c_customer_sk") + .group_by( + [ + "c_customer_id", + "c_first_name", + "c_last_name", + "c_preferred_cust_flag", + "c_birth_country", + "c_login", + "c_email_address", + "d_year", + ] + ) + .agg(profit_expr.sum().alias("year_total")) + ) + + if year_filter: + df = df.filter(pl.col("year_total") > 0) + + if include_customer_info: + return df.select( + [ + pl.col("c_customer_id").alias("customer_id"), + pl.col("c_first_name").alias("customer_first_name"), + pl.col("c_last_name").alias("customer_last_name"), + pl.col("c_preferred_cust_flag").alias("customer_preferred_cust_flag"), + pl.col("year_total"), + ] + ) + else: + return df.select( + [pl.col("c_customer_id").alias("customer_id"), pl.col("year_total")] + ) + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 4.""" + # Load required tables + customer = get_data(run_config.dataset_path, "customer", run_config.suffix) + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + date_2001 = date_dim.filter(pl.col("d_year") == 2001) + date_2002 = date_dim.filter(pl.col("d_year") == 2002) + + # Store sales - first year (2001) + t_s_firstyear = build_sales_subquery( + store_sales, + date_2001, + customer, + sold_date_key="ss_sold_date_sk", + customer_key="ss_customer_sk", + col_prefix="ss_", + year_filter=True, + include_customer_info=True, + ) + + # Store sales - second year (2002) + t_s_secyear = build_sales_subquery( + store_sales, + date_2002, + customer, + sold_date_key="ss_sold_date_sk", + customer_key="ss_customer_sk", + col_prefix="ss_", + year_filter=False, + include_customer_info=True, + ) + + # Catalog sales - first year (2001) + t_c_firstyear = build_sales_subquery( + catalog_sales, + date_2001, + customer, + sold_date_key="cs_sold_date_sk", + customer_key="cs_bill_customer_sk", + col_prefix="cs_", + year_filter=True, + include_customer_info=False, + ) + + # Catalog sales - second year (2002)
t_c_secyear = build_sales_subquery( + catalog_sales, + date_2002, + customer, + sold_date_key="cs_sold_date_sk", + customer_key="cs_bill_customer_sk", + col_prefix="cs_", + year_filter=False, + include_customer_info=False, + ) + + # Web sales - first year (2001) + t_w_firstyear = build_sales_subquery( + web_sales, + date_2001, + customer, + sold_date_key="ws_sold_date_sk", + customer_key="ws_bill_customer_sk", + col_prefix="ws_", + year_filter=True, + include_customer_info=False, + ) + + # Web sales - second year (2002) + t_w_secyear = build_sales_subquery( + web_sales, + date_2002, + customer, + sold_date_key="ws_sold_date_sk", + customer_key="ws_bill_customer_sk", + col_prefix="ws_", + year_filter=False, + include_customer_info=False, + ) + + # Perform the joins and filtering + sort_cols = [ + "customer_id", + "customer_first_name", + "customer_last_name", + "customer_preferred_cust_flag", + ] + return ( + t_s_secyear.join(t_s_firstyear, on="customer_id", suffix="_sf", how="inner") + .join(t_c_firstyear, on="customer_id", suffix="_cf", how="inner") + .join(t_c_secyear, on="customer_id", suffix="_cs", how="inner") + .join(t_w_firstyear, on="customer_id", suffix="_wf", how="inner") + .join(t_w_secyear, on="customer_id", suffix="_ws", how="inner") + .filter( + # All first year totals must be > 0 + (pl.col("year_total_sf") > 0) + & (pl.col("year_total_cf") > 0) + & (pl.col("year_total_wf") > 0) + & + # Catalog growth rate > Store growth rate + ( + pl.when(pl.col("year_total_cf") > 0) + .then(pl.col("year_total_cs") / pl.col("year_total_cf")) + .otherwise(None) + > pl.when(pl.col("year_total_sf") > 0) + .then(pl.col("year_total") / pl.col("year_total_sf")) + .otherwise(None) + ) + & + # Catalog growth rate > Web growth rate + ( + pl.when(pl.col("year_total_cf") > 0) + .then(pl.col("year_total_cs") / pl.col("year_total_cf")) + .otherwise(None) + > pl.when(pl.col("year_total_wf") > 0) + .then(pl.col("year_total_ws") / pl.col("year_total_wf")) + .otherwise(None) + ) + ) + .select(sort_cols) + .sort(sort_cols) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q5.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q5.py new file mode 100644 index 00000000000..f03d9e04b9f --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q5.py @@ -0,0 +1,462 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 5.""" + +from __future__ import annotations + +from datetime import date, timedelta +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 5.""" + return """ + WITH ssr AS + ( + SELECT s_store_id, + Sum(sales_price) AS sales, + Sum(profit) AS profit, + Sum(return_amt) AS returns1, + Sum(net_loss) AS profit_loss + FROM ( + SELECT ss_store_sk AS store_sk, + ss_sold_date_sk AS date_sk, + ss_ext_sales_price AS sales_price, + ss_net_profit AS profit, + Cast(0 AS DECIMAL(7,2)) AS return_amt, + Cast(0 AS DECIMAL(7,2)) AS net_loss + FROM store_sales + UNION ALL + SELECT sr_store_sk AS store_sk, + sr_returned_date_sk AS date_sk, + Cast(0 AS DECIMAL(7,2)) AS sales_price, + Cast(0 AS DECIMAL(7,2)) AS profit, + sr_return_amt AS return_amt, + sr_net_loss AS net_loss + FROM store_returns ) salesreturns, + date_dim, + store + WHERE date_sk = d_date_sk + AND d_date BETWEEN Cast('2002-08-22' AS DATE) AND ( + Cast('2002-08-22' AS DATE) + INTERVAL '14' day) + AND store_sk = s_store_sk + GROUP BY s_store_id) , csr AS + ( + SELECT cp_catalog_page_id, + sum(sales_price) AS sales, + sum(profit) AS profit, + sum(return_amt) AS returns1, + sum(net_loss) AS profit_loss + FROM ( + SELECT cs_catalog_page_sk AS page_sk, + cs_sold_date_sk AS date_sk, + cs_ext_sales_price AS sales_price, + cs_net_profit AS profit, + cast(0 AS decimal(7,2)) AS return_amt, + cast(0 AS decimal(7,2)) AS net_loss + FROM catalog_sales + UNION ALL + SELECT cr_catalog_page_sk AS page_sk, + cr_returned_date_sk AS date_sk, + cast(0 AS decimal(7,2)) AS sales_price, + cast(0 AS decimal(7,2)) AS profit, + cr_return_amount AS return_amt, + cr_net_loss AS net_loss + FROM catalog_returns ) salesreturns, + date_dim, + catalog_page + WHERE date_sk = d_date_sk + AND d_date BETWEEN cast('2002-08-22' AS date) AND ( + cast('2002-08-22' AS date) + INTERVAL '14' day) + AND page_sk = cp_catalog_page_sk + GROUP BY cp_catalog_page_id) , wsr AS + ( + SELECT web_site_id, + sum(sales_price) AS sales, + sum(profit) AS profit, + sum(return_amt) AS returns1, + sum(net_loss) AS profit_loss + FROM ( + SELECT ws_web_site_sk AS wsr_web_site_sk, + ws_sold_date_sk AS date_sk, + ws_ext_sales_price AS sales_price, + ws_net_profit AS profit, + cast(0 AS decimal(7,2)) AS return_amt, + cast(0 AS decimal(7,2)) AS net_loss + FROM web_sales + UNION ALL + SELECT ws_web_site_sk AS wsr_web_site_sk, + wr_returned_date_sk AS date_sk, + cast(0 AS decimal(7,2)) AS sales_price, + cast(0 AS decimal(7,2)) AS profit, + wr_return_amt AS return_amt, + wr_net_loss AS net_loss + FROM web_returns + LEFT OUTER JOIN web_sales + ON ( + wr_item_sk = ws_item_sk + AND wr_order_number = ws_order_number) ) salesreturns, + date_dim, + web_site + WHERE date_sk = d_date_sk + AND d_date BETWEEN cast('2002-08-22' AS date) AND ( + cast('2002-08-22' AS date) + INTERVAL '14' day) + AND wsr_web_site_sk = web_site_sk + GROUP BY web_site_id) + SELECT + channel , + id , + sum(sales) AS sales , + sum(returns1) AS returns1 , + sum(profit) AS profit + FROM ( + SELECT 'store channel' AS channel , + 'store' + || s_store_id AS id , + sales , + returns1 , + (profit - profit_loss) AS profit + FROM ssr + UNION ALL + SELECT 'catalog channel' AS channel , + 'catalog_page' + || cp_catalog_page_id AS id , + sales , + returns1 , + (profit - profit_loss) AS profit + 
FROM csr + UNION ALL + SELECT 'web channel' AS channel , + 'web_site' + || web_site_id AS id , + sales , + returns1 , + (profit - profit_loss) AS profit + FROM wsr ) x + GROUP BY rollup (channel, id) + ORDER BY channel , + id + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 5.""" + # Load required tables + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + store_returns = get_data( + run_config.dataset_path, "store_returns", run_config.suffix + ) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + catalog_returns = get_data( + run_config.dataset_path, "catalog_returns", run_config.suffix + ) + web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix) + web_returns = get_data(run_config.dataset_path, "web_returns", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + store = get_data(run_config.dataset_path, "store", run_config.suffix) + catalog_page = get_data(run_config.dataset_path, "catalog_page", run_config.suffix) + web_site = get_data(run_config.dataset_path, "web_site", run_config.suffix) + + # Date range filter - use actual date values + start_date = date(2002, 8, 22) + end_date = start_date + timedelta(days=14) + + # Step 1: Create ssr CTE (Store Sales and Returns) + # Filter sales and returns by date first, then transform + store_sales_data = ( + store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .filter(pl.col("d_date").is_between(start_date, end_date, closed="both")) + .select( + [ + pl.col("ss_store_sk").alias("store_sk"), + pl.col("ss_sold_date_sk").alias("date_sk"), + pl.col("ss_ext_sales_price").alias("sales_price"), + pl.col("ss_net_profit").alias("profit"), + pl.lit(0.0).alias("return_amt"), + pl.lit(0.0).alias("net_loss"), + ] + ) + ) + store_returns_data = ( + store_returns.join( + date_dim, left_on="sr_returned_date_sk", right_on="d_date_sk" + ) + .filter(pl.col("d_date").is_between(start_date, end_date, closed="both")) + .select( + [ + pl.col("sr_store_sk").alias("store_sk"), + pl.col("sr_returned_date_sk").alias("date_sk"), + pl.lit(0.0).alias("sales_price"), + pl.lit(0.0).alias("profit"), + pl.col("sr_return_amt").alias("return_amt"), + pl.col("sr_net_loss").alias("net_loss"), + ] + ) + ) + store_salesreturns = pl.concat([store_sales_data, store_returns_data]) + ssr = ( + store_salesreturns.join(store, left_on="store_sk", right_on="s_store_sk") + .group_by("s_store_id") + .agg( + [ + pl.col("sales_price").sum().alias("sales"), + pl.col("sales_price").count().alias("sales_count"), + pl.col("profit").sum().alias("profit"), + pl.col("profit").count().alias("profit_count"), + pl.col("return_amt").sum().alias("returns1"), + pl.col("return_amt").count().alias("returns1_count"), + pl.col("net_loss").sum().alias("profit_loss"), + pl.col("net_loss").count().alias("profit_loss_count"), + ] + ) + .with_columns( + [ + pl.when(pl.col("sales_count") > 0) + .then(pl.col("sales")) + .otherwise(None) + .alias("sales"), + pl.when(pl.col("profit_count") > 0) + .then(pl.col("profit")) + .otherwise(None) + .alias("profit"), + pl.when(pl.col("returns1_count") > 0) + .then(pl.col("returns1")) + .otherwise(None) + .alias("returns1"), + pl.when(pl.col("profit_loss_count") > 0) + .then(pl.col("profit_loss")) + .otherwise(None) + .alias("profit_loss"), + ] + ) + .drop(["sales_count", "profit_count", "returns1_count", "profit_loss_count"]) + ) + + # Step 2: Create csr CTE (Catalog 
Sales and Returns) + # Filter sales and returns by date first, then transform + catalog_sales_data = ( + catalog_sales.join(date_dim, left_on="cs_sold_date_sk", right_on="d_date_sk") + .filter(pl.col("d_date").is_between(start_date, end_date, closed="both")) + .select( + [ + pl.col("cs_catalog_page_sk").alias("page_sk"), + pl.col("cs_sold_date_sk").alias("date_sk"), + pl.col("cs_ext_sales_price").alias("sales_price"), + pl.col("cs_net_profit").alias("profit"), + pl.lit(0.0).alias("return_amt"), + pl.lit(0.0).alias("net_loss"), + ] + ) + ) + catalog_returns_data = ( + catalog_returns.join( + date_dim, left_on="cr_returned_date_sk", right_on="d_date_sk" + ) + .filter(pl.col("d_date").is_between(start_date, end_date, closed="both")) + .select( + [ + pl.col("cr_catalog_page_sk").alias("page_sk"), + pl.col("cr_returned_date_sk").alias("date_sk"), + pl.lit(0.0).alias("sales_price"), + pl.lit(0.0).alias("profit"), + pl.col("cr_return_amount").alias("return_amt"), + pl.col("cr_net_loss").alias("net_loss"), + ] + ) + ) + catalog_salesreturns = pl.concat([catalog_sales_data, catalog_returns_data]) + csr = ( + catalog_salesreturns.join( + catalog_page, left_on="page_sk", right_on="cp_catalog_page_sk" + ) + .group_by("cp_catalog_page_id") + .agg( + [ + pl.col("sales_price").sum().alias("sales"), + pl.col("sales_price").count().alias("sales_count"), + pl.col("profit").sum().alias("profit"), + pl.col("profit").count().alias("profit_count"), + pl.col("return_amt").sum().alias("returns1"), + pl.col("return_amt").count().alias("returns1_count"), + pl.col("net_loss").sum().alias("profit_loss"), + pl.col("net_loss").count().alias("profit_loss_count"), + ] + ) + .with_columns( + [ + pl.when(pl.col("sales_count") > 0) + .then(pl.col("sales")) + .otherwise(None) + .alias("sales"), + pl.when(pl.col("profit_count") > 0) + .then(pl.col("profit")) + .otherwise(None) + .alias("profit"), + pl.when(pl.col("returns1_count") > 0) + .then(pl.col("returns1")) + .otherwise(None) + .alias("returns1"), + pl.when(pl.col("profit_loss_count") > 0) + .then(pl.col("profit_loss")) + .otherwise(None) + .alias("profit_loss"), + ] + ) + .drop(["sales_count", "profit_count", "returns1_count", "profit_loss_count"]) + ) + + # Step 3: Create wsr CTE (Web Sales and Returns) + # Filter sales and returns by date first, then transform + web_sales_data = ( + web_sales.join(date_dim, left_on="ws_sold_date_sk", right_on="d_date_sk") + .filter(pl.col("d_date").is_between(start_date, end_date, closed="both")) + .select( + [ + pl.col("ws_web_site_sk").alias("wsr_web_site_sk"), + pl.col("ws_sold_date_sk").alias("date_sk"), + pl.col("ws_ext_sales_price").alias("sales_price"), + pl.col("ws_net_profit").alias("profit"), + pl.lit(0.0).alias("return_amt"), + pl.lit(0.0).alias("net_loss"), + ] + ) + ) + # For web returns, we need the LEFT OUTER JOIN with web_sales, then filter by date + web_returns_data = ( + web_returns.join(date_dim, left_on="wr_returned_date_sk", right_on="d_date_sk") + .filter(pl.col("d_date").is_between(start_date, end_date, closed="both")) + .join( + web_sales.select(["ws_item_sk", "ws_order_number", "ws_web_site_sk"]), + left_on=["wr_item_sk", "wr_order_number"], + right_on=["ws_item_sk", "ws_order_number"], + how="left", + ) + .select( + [ + pl.col("ws_web_site_sk").alias("wsr_web_site_sk"), + pl.col("wr_returned_date_sk").alias("date_sk"), + pl.lit(0.0).alias("sales_price"), + pl.lit(0.0).alias("profit"), + pl.col("wr_return_amt").alias("return_amt"), + pl.col("wr_net_loss").alias("net_loss"), + ] + ) + ) + web_salesreturns = 
pl.concat([web_sales_data, web_returns_data]) + wsr = ( + web_salesreturns.join( + web_site, left_on="wsr_web_site_sk", right_on="web_site_sk" + ) + .group_by("web_site_id") + .agg( + [ + pl.col("sales_price").sum().alias("sales"), + pl.col("sales_price").count().alias("sales_count"), + pl.col("profit").sum().alias("profit"), + pl.col("profit").count().alias("profit_count"), + pl.col("return_amt").sum().alias("returns1"), + pl.col("return_amt").count().alias("returns1_count"), + pl.col("net_loss").sum().alias("profit_loss"), + pl.col("net_loss").count().alias("profit_loss_count"), + ] + ) + .with_columns( + [ + pl.when(pl.col("sales_count") > 0) + .then(pl.col("sales")) + .otherwise(None) + .alias("sales"), + pl.when(pl.col("profit_count") > 0) + .then(pl.col("profit")) + .otherwise(None) + .alias("profit"), + pl.when(pl.col("returns1_count") > 0) + .then(pl.col("returns1")) + .otherwise(None) + .alias("returns1"), + pl.when(pl.col("profit_loss_count") > 0) + .then(pl.col("profit_loss")) + .otherwise(None) + .alias("profit_loss"), + ] + ) + .drop(["sales_count", "profit_count", "returns1_count", "profit_loss_count"]) + ) + + # Step 4: Create the union of all channels + store_channel = ssr.select( + [ + pl.lit("store channel").alias("channel"), + (pl.lit("store") + pl.col("s_store_id").cast(pl.Utf8)).alias("id"), + pl.col("sales"), + pl.col("returns1"), + (pl.col("profit") - pl.col("profit_loss")).alias("profit"), + ] + ) + catalog_channel = csr.select( + [ + pl.lit("catalog channel").alias("channel"), + (pl.lit("catalog_page") + pl.col("cp_catalog_page_id").cast(pl.Utf8)).alias( + "id" + ), + pl.col("sales"), + pl.col("returns1"), + (pl.col("profit") - pl.col("profit_loss")).alias("profit"), + ] + ) + web_channel = wsr.select( + [ + pl.lit("web channel").alias("channel"), + (pl.lit("web_site") + pl.col("web_site_id").cast(pl.Utf8)).alias("id"), + pl.col("sales"), + pl.col("returns1"), + (pl.col("profit") - pl.col("profit_loss")).alias("profit"), + ] + ) + all_channels = pl.concat([store_channel, catalog_channel, web_channel]) + + # Step 5: Group by channel and id (filter out NULL rollup rows) + return ( + all_channels.group_by(["channel", "id"]) + .agg( + [ + pl.col("sales").sum().alias("sales"), + pl.col("sales").count().alias("sales_count"), + pl.col("returns1").sum().alias("returns1"), + pl.col("returns1").count().alias("returns1_count"), + pl.col("profit").sum().alias("profit"), + pl.col("profit").count().alias("profit_count"), + ] + ) + .with_columns( + [ + pl.when(pl.col("sales_count") > 0) + .then(pl.col("sales")) + .otherwise(None) + .alias("sales"), + pl.when(pl.col("returns1_count") > 0) + .then(pl.col("returns1")) + .otherwise(None) + .alias("returns1"), + pl.when(pl.col("profit_count") > 0) + .then(pl.col("profit")) + .otherwise(None) + .alias("profit"), + ] + ) + .drop(["sales_count", "returns1_count", "profit_count"]) + .filter(pl.col("channel").is_not_null() & pl.col("id").is_not_null()) + .sort(["channel", "id"]) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q6.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q6.py new file mode 100644 index 00000000000..b597e7582a2 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q6.py @@ -0,0 +1,92 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 6.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 6.""" + return """ + SELECT a.ca_state state, + Count(*) cnt + FROM customer_address a, + customer c, + store_sales s, + date_dim d, + item i + WHERE a.ca_address_sk = c.c_current_addr_sk + AND c.c_customer_sk = s.ss_customer_sk + AND s.ss_sold_date_sk = d.d_date_sk + AND s.ss_item_sk = i.i_item_sk + AND d.d_month_seq = (SELECT DISTINCT ( d_month_seq ) + FROM date_dim + WHERE d_year = 1998 + AND d_moy = 7) + AND i.i_current_price > 1.2 * (SELECT Avg(j.i_current_price) + FROM item j + WHERE j.i_category = i.i_category) + GROUP BY a.ca_state + HAVING Count(*) >= 10 + --ORDER BY cnt + ORDER BY cnt, state + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 6.""" + # Load required tables + customer_address = get_data( + run_config.dataset_path, "customer_address", run_config.suffix + ) + customer = get_data(run_config.dataset_path, "customer", run_config.suffix) + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + + # Subquery 1: d_month_seq values for July 1998 + target_month_seq_table = ( + date_dim.filter((pl.col("d_year") == 1998) & (pl.col("d_moy") == 7)) + .select("d_month_seq") + .unique() + ) + + # Subquery 2: Calculate average price per category + avg_price_per_category = item.group_by("i_category").agg( + pl.col("i_current_price").mean().alias("avg_price") + ) + + return ( + customer_address.join( + customer, left_on="ca_address_sk", right_on="c_current_addr_sk" + ) + .join(store_sales, left_on="c_customer_sk", right_on="ss_customer_sk") + .join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(item, left_on="ss_item_sk", right_on="i_item_sk") + .join(avg_price_per_category, on="i_category") + .join(target_month_seq_table, on="d_month_seq", how="semi") + .filter(pl.col("i_current_price") > 1.2 * pl.col("avg_price")) + .group_by("ca_state") + .agg(pl.len().alias("cnt")) + .filter(pl.col("cnt") >= 10) + .sort(["cnt", "ca_state"], nulls_last=True) + .limit(100) + .select( + [ + pl.col("ca_state").alias("state"), + # Cast -> Int64 to match DuckDB + pl.col("cnt").cast(pl.Int64), + ] + ) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q7.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q7.py new file mode 100644 index 00000000000..7efef3cbb14 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q7.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 7.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 7.""" + return """ + SELECT i_item_id, + Avg(ss_quantity) agg1, + Avg(ss_list_price) agg2, + Avg(ss_coupon_amt) agg3, + Avg(ss_sales_price) agg4 + FROM store_sales, + customer_demographics, + date_dim, + item, + promotion + WHERE ss_sold_date_sk = d_date_sk + AND ss_item_sk = i_item_sk + AND ss_cdemo_sk = cd_demo_sk + AND ss_promo_sk = p_promo_sk + AND cd_gender = 'F' + AND cd_marital_status = 'W' + AND cd_education_status = '2 yr Degree' + AND ( p_channel_email = 'N' + OR p_channel_event = 'N' ) + AND d_year = 1998 + GROUP BY i_item_id + ORDER BY i_item_id + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 7.""" + # Load required tables + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + customer_demographics = get_data( + run_config.dataset_path, "customer_demographics", run_config.suffix + ) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + promotion = get_data(run_config.dataset_path, "promotion", run_config.suffix) + + return ( + store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(item, left_on="ss_item_sk", right_on="i_item_sk") + .join(customer_demographics, left_on="ss_cdemo_sk", right_on="cd_demo_sk") + .join(promotion, left_on="ss_promo_sk", right_on="p_promo_sk") + .filter(pl.col("cd_gender") == "F") + .filter(pl.col("cd_marital_status") == "W") + .filter(pl.col("cd_education_status") == "2 yr Degree") + .filter((pl.col("p_channel_email") == "N") | (pl.col("p_channel_event") == "N")) + .filter(pl.col("d_year") == 1998) + .group_by("i_item_id") + .agg( + [ + pl.col("ss_quantity").mean().alias("agg1"), + pl.col("ss_list_price").mean().alias("agg2"), + pl.col("ss_coupon_amt").mean().alias("agg3"), + pl.col("ss_sales_price").mean().alias("agg4"), + ] + ) + .sort("i_item_id", nulls_last=True) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q8.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q8.py new file mode 100644 index 00000000000..5a06a654ff8 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q8.py @@ -0,0 +1,524 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 8.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + +""" +warning!, one filter removed to prevent zero row results + +note: alternate zip code + '70069', # 93 preferred customers + '60069', # 87 preferred customers + '78877', # 87 preferred customers + '60169', # 87 preferred customers + '68252', # 86 preferred customers + '71087', # 84 preferred customers + '71711', # 84 preferred customers + '68877', # 84 preferred customers + '55709', # 82 preferred customers +""" + +TARGET_YEAR = 1996 +TARGET_QUARTER = 2 +TARGET_ZIPS = [ + "67436", + "26121", + "38443", + "63157", + "68856", + "19485", + "86425", + "26741", + "70991", + "60899", + "63573", + "47556", + "56193", + "93314", + "87827", + "62017", + "85067", + "95390", + "48091", + "10261", + "81845", + "41790", + "42853", + "24675", + "12840", + "60065", + "84430", + "57451", + "24021", + "91735", + "75335", + "71935", + "34482", + "56943", + "70695", + "52147", + "56251", + "28411", + "86653", + "23005", + "22478", + "29031", + "34398", + "15365", + "42460", + "33337", + "59433", + "73943", + "72477", + "74081", + "74430", + "64605", + "39006", + "11226", + "49057", + "97308", + "42663", + "18187", + "19768", + "43454", + "32147", + "76637", + "51975", + "11181", + "45630", + "33129", + "45995", + "64386", + "55522", + "26697", + "20963", + "35154", + "64587", + "49752", + "66386", + "30586", + "59286", + "13177", + "66646", + "84195", + "74316", + "36853", + "32927", + "12469", + "11904", + "36269", + "17724", + "55346", + "12595", + "53988", + "65439", + "28015", + "63268", + "73590", + "29216", + "82575", + "69267", + "13805", + "91678", + "79460", + "94152", + "14961", + "15419", + "48277", + "62588", + "55493", + "28360", + "14152", + "55225", + "18007", + "53705", + "56573", + "80245", + "71769", + "57348", + "36845", + "13039", + "17270", + "22363", + "83474", + "25294", + "43269", + "77666", + "15488", + "99146", + "64441", + "43338", + "38736", + "62754", + "48556", + "86057", + "23090", + "38114", + "66061", + "18910", + "84385", + "23600", + "19975", + "27883", + "65719", + "19933", + "32085", + "49731", + "40473", + "27190", + "46192", + "23949", + "44738", + "12436", + "64794", + "68741", + "15333", + "24282", + "49085", + "31844", + "71156", + "48441", + "17100", + "98207", + "44982", + "20277", + "71496", + "96299", + "37583", + "22206", + "89174", + "30589", + "61924", + "53079", + "10976", + "13104", + "42794", + "54772", + "15809", + "56434", + "39975", + "13874", + "30753", + "77598", + "78229", + "59478", + "12345", + "55547", + "57422", + "42600", + "79444", + "29074", + "29752", + "21676", + "32096", + "43044", + "39383", + "37296", + "36295", + "63077", + "16572", + "31275", + "18701", + "40197", + "48242", + "27219", + "49865", + "84175", + "30446", + "25165", + "13807", + "72142", + "70499", + "70464", + "71429", + "18111", + "70857", + "29545", + "36425", + "52706", + "36194", + "42963", + "75068", + "47921", + "74763", + "90990", + "89456", + "62073", + "88397", + "73963", + "75885", + "62657", + "12530", + "81146", + "57434", + "25099", + "41429", + "98441", + "48713", + "52552", + "31667", + "14072", + "13903", + "44709", + "85429", + "58017", + "38295", + "44875", + "73541", + "30091", + "12707", + "23762", + "62258", + "33247", + "78722", + "77431", + 
"14510", + "35656", + "72428", + "92082", + "35267", + "43759", + "24354", + "90952", + "11512", + "21242", + "22579", + "56114", + "32339", + "52282", + "41791", + "24484", + "95020", + "28408", + "99710", + "11899", + "43344", + "72915", + "27644", + "62708", + "74479", + "17177", + "32619", + "12351", + "91339", + "31169", + "57081", + "53522", + "16712", + "34419", + "71779", + "44187", + "46206", + "96099", + "61910", + "53664", + "12295", + "31837", + "33096", + "10813", + "63048", + "31732", + "79118", + "73084", + "72783", + "84952", + "46965", + "77956", + "39815", + "32311", + "75329", + "48156", + "30826", + "49661", + "13736", + "92076", + "74865", + "88149", + "92397", + "52777", + "68453", + "32012", + "21222", + "52721", + "24626", + "18210", + "42177", + "91791", + "75251", + "82075", + "44372", + "45542", + "20609", + "60115", + "17362", + "22750", + "90434", + "31852", + "54071", + "33762", + "14705", + "40718", + "56433", + "30996", + "40657", + "49056", + "23585", + "66455", + "41021", + "74736", + "72151", + "37007", + "21729", + "60177", + "84558", + "59027", + "93855", + "60022", + "86443", + "19541", + "86886", + "30532", + "39062", + "48532", + "34713", + "52077", + "22564", + "64638", + "15273", + "31677", + "36138", + "62367", + "60261", + "80213", + "42818", + "25113", + "72378", + "69802", + "69096", + "55443", + "28820", + "13848", + "78258", + "37490", + "30556", + "77380", + "28447", + "44550", + "26791", + "70609", + "82182", + "33306", + "43224", + "22322", + "86959", + "68519", + "14308", + "46501", + "81131", + "34056", + "61991", + "19896", + "87804", + "65774", + "92564", +] + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 8.""" + return f""" + -- start query 8 in stream 0 using template query8.tpl + SELECT s_store_name, + Sum(ss_net_profit) + FROM store_sales, + date_dim, + store, + (SELECT ca_zip + FROM (SELECT Substr(ca_zip, 1, 5) ca_zip + FROM customer_address + WHERE Substr(ca_zip, 1, 5) IN ({", ".join(f"'{zip}'" for zip in TARGET_ZIPS)}) + INTERSECT + SELECT ca_zip + FROM (SELECT Substr(ca_zip, 1, 5) ca_zip, + Count(*) cnt + FROM customer_address, + customer + WHERE ca_address_sk = c_current_addr_sk + AND c_preferred_cust_flag = 'Y' + GROUP BY ca_zip + HAVING Count(*) > 10)A1)A2) V1 + WHERE ss_store_sk = s_store_sk + AND ss_sold_date_sk = d_date_sk + AND d_qoy = {TARGET_QUARTER} + AND d_year = {TARGET_YEAR} + AND ( Substr(s_zip, 1, 2) = Substr(V1.ca_zip, 1, 2) ) + GROUP BY s_store_name + ORDER BY s_store_name + LIMIT 100; + + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 8.""" + # Load required tables + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + store = get_data(run_config.dataset_path, "store", run_config.suffix) + customer_address = get_data( + run_config.dataset_path, "customer_address", run_config.suffix + ) + customer = get_data(run_config.dataset_path, "customer", run_config.suffix) + + # First subquery: get first 5 chars of zip codes from target list + target_zips_5char = ( + customer_address.select(pl.col("ca_zip").str.slice(0, 5).alias("ca_zip")) + .filter(pl.col("ca_zip").is_in(TARGET_ZIPS)) + .unique() + ) + + # Second subquery: preferred customers by zip with count > 10 + preferred_customer_zips = ( + customer_address.join( + customer, left_on="ca_address_sk", right_on="c_current_addr_sk" + ) + .filter(pl.col("c_preferred_cust_flag") == "Y") + 
.group_by(pl.col("ca_zip").str.slice(0, 5).alias("ca_zip")) + .agg(pl.len().alias("cnt")) + .filter(pl.col("cnt") > 10) + .select("ca_zip") + ) + + # INTERSECT: Get common zip codes between target list and preferred customer zips + intersect_zips = target_zips_5char.join( + preferred_customer_zips, on="ca_zip", how="inner" + ).select("ca_zip") + + # Main query: join store_sales with date_dim, store, and filter by zip codes + return ( + store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(store, left_on="ss_store_sk", right_on="s_store_sk") + .join( + intersect_zips, + left_on=pl.col("s_zip").str.slice(0, 2), + right_on=pl.col("ca_zip").str.slice(0, 2), + ) + .filter(pl.col("d_qoy") == TARGET_QUARTER) + .filter(pl.col("d_year") == TARGET_YEAR) + .group_by("s_store_name") + .agg(pl.col("ss_net_profit").sum().alias("sum")) + .sort("s_store_name", nulls_last=True) + .limit(100) + .select([pl.col("s_store_name"), pl.col("sum").alias("sum(ss_net_profit)")]) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q9.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q9.py new file mode 100644 index 00000000000..6f4ae38dac1 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q9.py @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 9.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 9.""" + return """ + -- start query 9 in stream 0 using template query9.tpl + SELECT CASE + WHEN (SELECT Count(*) + FROM store_sales + WHERE ss_quantity BETWEEN 1 AND 20) > 3672 THEN + (SELECT Avg(ss_ext_list_price) + FROM store_sales + WHERE + ss_quantity BETWEEN 1 AND 20) + ELSE (SELECT Avg(ss_net_profit) + FROM store_sales + WHERE ss_quantity BETWEEN 1 AND 20) + END bucket1, + CASE + WHEN (SELECT Count(*) + FROM store_sales + WHERE ss_quantity BETWEEN 21 AND 40) > 3392 THEN + (SELECT Avg(ss_ext_list_price) + FROM store_sales + WHERE + ss_quantity BETWEEN 21 AND 40) + ELSE (SELECT Avg(ss_net_profit) + FROM store_sales + WHERE ss_quantity BETWEEN 21 AND 40) + END bucket2, + CASE + WHEN (SELECT Count(*) + FROM store_sales + WHERE ss_quantity BETWEEN 41 AND 60) > 32784 THEN + (SELECT Avg(ss_ext_list_price) + FROM store_sales + WHERE + ss_quantity BETWEEN 41 AND 60) + ELSE (SELECT Avg(ss_net_profit) + FROM store_sales + WHERE ss_quantity BETWEEN 41 AND 60) + END bucket3, + CASE + WHEN (SELECT Count(*) + FROM store_sales + WHERE ss_quantity BETWEEN 61 AND 80) > 26032 THEN + (SELECT Avg(ss_ext_list_price) + FROM store_sales + WHERE + ss_quantity BETWEEN 61 AND 80) + ELSE (SELECT Avg(ss_net_profit) + FROM store_sales + WHERE ss_quantity BETWEEN 61 AND 80) + END bucket4, + CASE + WHEN (SELECT Count(*) + FROM store_sales + WHERE ss_quantity BETWEEN 81 AND 100) > 23982 THEN + (SELECT Avg(ss_ext_list_price) + FROM store_sales + WHERE + ss_quantity BETWEEN 81 AND 100) + ELSE (SELECT Avg(ss_net_profit) + FROM store_sales + WHERE ss_quantity BETWEEN 81 AND 100) + END bucket5 + FROM reason + WHERE r_reason_sk = 1; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 9.""" + # Load required tables + store_sales = 
get_data(run_config.dataset_path, "store_sales", run_config.suffix) + reason = get_data(run_config.dataset_path, "reason", run_config.suffix) + + # Define bucket configurations: (min_qty, max_qty, count_threshold) + buckets = [ + (1, 20, 3672), + (21, 40, 3392), + (41, 60, 32784), + (61, 80, 26032), + (81, 100, 23982), + ] + + # Calculate each bucket summary + bucket_stats = [] + for i, (min_qty, max_qty, _) in enumerate(buckets, 1): + # Compute count, avg(ss_ext_list_price), avg(ss_net_profit) for each quantity range + stats = store_sales.filter( + pl.col("ss_quantity").is_between(min_qty, max_qty, closed="both") + ).select( + [ + pl.len().alias(f"count_{i}"), + pl.col("ss_ext_list_price").mean().alias(f"avg_price_{i}"), + pl.col("ss_net_profit").mean().alias(f"avg_profit_{i}"), + ] + ) + bucket_stats.append(stats) + + # Combine all bucket summaries into one row + combined_stats = pl.concat(bucket_stats, how="horizontal") + + # Select appropriate value per bucket based on count threshold + bucket_values = [] + for i, (_, _, threshold) in enumerate(buckets, 1): + bucket = ( + pl.when(pl.col(f"count_{i}") > threshold) + .then(pl.col(f"avg_price_{i}")) + .otherwise(pl.col(f"avg_profit_{i}")) + .alias(f"bucket{i}") + ) + bucket_values.append(bucket) + + # Create result DataFrame with one row (using reason table as in SQL) + return ( + reason.filter(pl.col("r_reason_sk") == 1) + .join(combined_stats, how="cross") + .select(bucket_values) + .limit(1) + ) From b7d8e9094da3379044a426e103eb5df47fa4b7f2 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 8 Aug 2025 16:43:28 -0400 Subject: [PATCH 087/366] Fix anchor naming conventions in dependencies.yaml (#19635) Replace hyphens with underscores, and remove any `-dep` suffix. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19635 --- dependencies.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index f214bde574e..7b240f5bc84 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -488,7 +488,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - &numba-cuda-dep numba-cuda>=0.14.0,<0.15.0a0 + - &numba_cuda numba-cuda>=0.14.0,<0.15.0a0 pyarrow_run: common: - output_types: [conda] @@ -653,14 +653,14 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-dep numba>=0.59.1,<0.62.0a0 + - &numba numba>=0.59.1,<0.62.0a0 - nvtx>=0.2.1 - packaging - rich - typing_extensions>=4.0.0 - output_types: [conda] packages: - - *numba-cuda-dep + - *numba_cuda - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -682,10 +682,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - &numba-cuda-cu12-dep numba-cuda[cu12]>=0.14.0,<0.15.0a0 + - &numba_cuda_cu12 numba-cuda[cu12]>=0.14.0,<0.15.0a0 - matrix: # Fallback for no matrix packages: - - *numba-cuda-cu12-dep + - *numba_cuda_cu12 - output_types: [requirements, pyproject] matrices: - matrix: From 2cefac9e7ccc4cf9a35c64b210abade5bffa1599 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 8 Aug 2025 14:42:04 -0700 Subject: [PATCH 088/366] Propagate exceptions thrown in async IO operations (#19628) closes #19586 In some places in libcudf, `std::future::wait()` is called to wait on asynchronous IO operations.
The issue with this is that wait does not propagate any exceptions that may be thrown. This PR replaces the `wait()` calls with `get()` and adds readers and writers tests to make sure exceptions are re-thrown. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Amin Aramoon (https://github.com/aminaramoon) - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19628 --- cpp/src/io/orc/writer_impl.cu | 4 +- cpp/src/io/parquet/bloom_filter_reader.cu | 2 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 6 +- .../parquet/reader_impl_preprocess_utils.cu | 2 +- cpp/src/io/parquet/writer_impl.cu | 4 +- cpp/tests/io/io_test_utils.hpp | 119 ++++++++++++++++++ cpp/tests/io/json/json_test.cpp | 56 +++++++++ cpp/tests/io/orc_test.cpp | 54 ++++++++ cpp/tests/io/parquet_reader_test.cpp | 55 ++++++++ 9 files changed, 293 insertions(+), 9 deletions(-) create mode 100644 cpp/tests/io/io_test_utils.hpp diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 612f0b69737..64c8c729e4a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2715,8 +2715,8 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, stripe.footerLength = bytes_written; } } - for (auto const& task : write_tasks) { - task.wait(); + for (auto& task : write_tasks) { + task.get(); } } diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index d2d7fcac959..7abceb3ad4d 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -391,7 +391,7 @@ void read_bloom_filter_data(host_span const> sources // Read task sync function for (auto& task : read_tasks) { - task.wait(); + task.get(); } } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 2343ade4784..87fb1950017 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -345,10 +345,10 @@ void reader_impl::read_compressed_data() auto& chunks = pass.chunks; - auto const [has_compressed_data, read_chunks_tasks] = read_column_chunks(); - pass.has_compressed_data = has_compressed_data; + auto [has_compressed_data, read_chunks_tasks] = read_column_chunks(); + pass.has_compressed_data = has_compressed_data; - read_chunks_tasks.wait(); + read_chunks_tasks.get(); // Process dataset chunk pages into output columns auto const total_pages = _has_page_index ? 
count_page_headers_with_pgidx(chunks, _stream) diff --git a/cpp/src/io/parquet/reader_impl_preprocess_utils.cu b/cpp/src/io/parquet/reader_impl_preprocess_utils.cu index 04b36abb37f..6029d7ff538 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess_utils.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess_utils.cu @@ -238,7 +238,7 @@ void generate_depth_remappings( } auto sync_fn = [](decltype(read_tasks) read_tasks) { for (auto& task : read_tasks) { - task.wait(); + task.get(); } }; return std::async(std::launch::deferred, sync_fn, std::move(read_tasks)); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 41fdbe7e6fc..621eadb0e5b 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -2497,8 +2497,8 @@ void writer::impl::write_parquet_data_to_sink( if (i == 0) { row_group.file_offset = chunk_offset; } } } - for (auto const& task : write_tasks) { - task.wait(); + for (auto& task : write_tasks) { + task.get(); } } diff --git a/cpp/tests/io/io_test_utils.hpp b/cpp/tests/io/io_test_utils.hpp new file mode 100644 index 00000000000..a19eb597e50 --- /dev/null +++ b/cpp/tests/io/io_test_utils.hpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include + +namespace cudf::test { + +/** + * @brief Custom exception for device read async testing + */ +class AsyncException : public std::exception {}; + +/** + * @brief Datasource that throws an exception in device_read_async for testing + */ +class ThrowingDeviceReadDatasource : public cudf::io::datasource { + private: + std::vector const& data_; + + public: + explicit ThrowingDeviceReadDatasource(std::vector const& data) : data_(data) {} + + std::unique_ptr host_read(size_t offset, size_t size) override + { + size = std::min(size, data_.size() - offset); + // Convert char data to bytes for the buffer + std::vector byte_data(size); + std::memcpy(byte_data.data(), data_.data() + offset, size); + return cudf::io::datasource::buffer::create(std::move(byte_data)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + auto const read_size = std::min(size, data_.size() - offset); + std::memcpy(dst, data_.data() + offset, read_size); + return read_size; + } + + [[nodiscard]] bool supports_device_read() const override { return true; } + + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override + { + // For testing, just copy the data from the host buffer into a new buffer + size = std::min(size, data_.size() - offset); + rmm::device_buffer out_data(size, stream); + cudaMemcpyAsync( + out_data.data(), data_.data() + offset, size, cudaMemcpyHostToDevice, stream.value()); + cudaStreamSynchronize(stream.value()); + return cudf::io::datasource::buffer::create(std::move(out_data)); + } + + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + // This datasource returns a future that throws a custom exception when accessed for testing + std::promise promise; + promise.set_exception(std::make_exception_ptr(AsyncException())); + return promise.get_future(); + } + + [[nodiscard]] size_t size() const override { return data_.size(); } +}; + +/** + * @brief Data sink that throws an exception in device_write_async for testing + */ +class ThrowingDeviceWriteDataSink : public cudf::io::data_sink { + private: + size_t buffer_size_ = 0; + + public: + void host_write(void const* data, size_t size) override { buffer_size_ += size; } + + [[nodiscard]] bool supports_device_write() const override { return true; } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + buffer_size_ += size; + } + + std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) override + { + // This data sink returns a future that throws a custom exception when accessed for testing + std::promise promise; + promise.set_exception(std::make_exception_ptr(AsyncException())); + return promise.get_future(); + } + + void flush() override {} + + size_t bytes_written() override { return buffer_size_; } +}; + +} // namespace cudf::test diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 8bdda1dc4bf..53e4d3881e1 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "../io_test_utils.hpp" + #include #include #include @@ -3591,4 +3593,58 @@ TEST_F(JsonBatchedReaderTest, EmptyLastBatch) cudf::test::strings_column_wrapper{{"b", "b", "b", "b"}}); } +TEST_F(JsonReaderTest, DeviceReadAsyncThrows) +{ + // Create simple JSON data + std::string json_string = R"({"a": 1} +{"a": 2} +{"a": 3} +{"a": 4} +{"a": 5})"; + + // Convert to char vector + std::vector json_data(json_string.begin(), json_string.end()); + + // Create our throwing datasource + auto throwing_source = std::make_unique(json_data); + cudf::io::source_info source_info(throwing_source.get()); + + // Try to read the JSON data - this should either succeed or propagate AsyncException + // from device_read_async. + cudf::io::json_reader_options read_args = + cudf::io::json_reader_options::builder(source_info).lines(true); + try { + cudf::io::read_json(read_args); + // Test passes if no exception is thrown + } catch (const cudf::test::AsyncException&) { + // Test passes if AsyncException is thrown (expected test exception) + } catch (const std::exception& e) { + // Test fails if any other exception is thrown + FAIL() << "Unexpected exception thrown: " << e.what(); + } +} + +TEST_F(JsonReaderTest, DeviceWriteAsyncThrows) +{ + // Create a simple table to write + auto col0 = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 4, 5}}; + auto table_to_write = cudf::table_view{{col0}}; + + auto throwing_sink = std::make_unique(); + + cudf::io::json_writer_options write_args = cudf::io::json_writer_options::builder( + cudf::io::sink_info{throwing_sink.get()}, table_to_write); + + // The write_json call should either succeed or throw AsyncException. + try { + cudf::io::write_json(write_args); + // Test passes if no exception is thrown + } catch (const cudf::test::AsyncException&) { + // Test passes if AsyncException is thrown (expected test exception) + } catch (const std::exception& e) { + // Test fails if any other exception is thrown + FAIL() << "Unexpected exception thrown: " << e.what(); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index b0b83f7f419..90b5ebbfd43 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -15,6 +15,7 @@ */ #include "compression_common.hpp" +#include "io_test_utils.hpp" #include #include @@ -2305,6 +2306,59 @@ TEST_F(OrcWriterTest, MultipleBlocksInStripeFooter) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TEST_F(OrcReaderTest, DeviceReadAsyncThrows) +{ + // Create a simple ORC file in memory + auto col0 = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 4, 5}}; + auto table_to_write = table_view{{col0}}; + + std::vector out_buffer; + cudf::io::orc_writer_options write_args = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, table_to_write); + cudf::io::write_orc(write_args); + + // Create our throwing datasource + auto throwing_source = std::make_unique(out_buffer); + cudf::io::source_info source_info(throwing_source.get()); + + // Try to read the ORC file - this should either succeed or propagate AsyncException + // from device_read_async. 
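+  // (In this test the exception originates from the promise that
+  // ThrowingDeviceReadDatasource::device_read_async seeds with AsyncException;
+  // it only surfaces when the reader actually takes the device-read path.)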
+ cudf::io::orc_reader_options read_args = cudf::io::orc_reader_options::builder(source_info); + try { + cudf::io::read_orc(read_args); + // Test passes if no exception is thrown + } catch (const cudf::test::AsyncException&) { + // Test passes if AsyncException is thrown (expected test exception) + } catch (const std::exception& e) { + // Test fails if any other exception is thrown + FAIL() << "Unexpected exception thrown: " << e.what(); + } +} + +TEST_F(OrcReaderTest, DeviceWriteAsyncThrows) +{ + // Create a simple table to write + auto col0 = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 4, 5}}; + auto table_to_write = table_view{{col0}}; + + auto throwing_sink = std::make_unique(); + + cudf::io::orc_writer_options write_args = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{throwing_sink.get()}, table_to_write); + + // The write_orc call should either succeed or throw AsyncException. + // Should only fail if a different exception is thrown. + try { + cudf::io::write_orc(write_args); + // Test passes if no exception is thrown + } catch (const cudf::test::AsyncException&) { + // Test passes if AsyncException is thrown (expected test exception) + } catch (const std::exception& e) { + // Test fails if any other exception is thrown + FAIL() << "Unexpected exception thrown: " << e.what(); + } +} + INSTANTIATE_TEST_CASE_P(Nvcomp, OrcCompressionTest, ::testing::Combine(::testing::Values("NVCOMP"), diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index a39c943f067..3213034b541 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -15,6 +15,7 @@ */ #include "compression_common.hpp" +#include "io_test_utils.hpp" #include "parquet_common.hpp" #include @@ -33,6 +34,7 @@ #include #include +#include using ParquetDecompressionTest = DecompressionTest; @@ -3165,3 +3167,56 @@ TEST_F(ParquetReaderTest, RowBoundsAndFilter) metadata.num_row_groups_after_stats_filter.value() == 3); // RGs: {},{0,1,4},{} } } + +TEST_F(ParquetReaderTest, DeviceReadAsyncThrows) +{ + // Create a simple parquet file in memory + auto col0 = cudf::test::fixed_width_column_wrapper{{1, 2, 3, 4, 5}}; + auto table_to_write = table_view{{col0}}; + + std::vector out_buffer; + cudf::io::parquet_writer_options write_args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&out_buffer}, table_to_write); + cudf::io::write_parquet(write_args); + + // Create our throwing datasource + auto throwing_source = std::make_unique(out_buffer); + cudf::io::source_info source_info(throwing_source.get()); + + // Try to read the parquet file - this should either succeed or propagate AsyncException + // from device_read_async. 
+ cudf::io::parquet_reader_options read_args = + cudf::io::parquet_reader_options::builder(source_info); + try { + cudf::io::read_parquet(read_args); + // Test passes if no exception is thrown + } catch (const cudf::test::AsyncException&) { + // Test passes if AsyncException is thrown (expected test exception) + } catch (const std::exception& e) { + // Test fails if any other exception is thrown + FAIL() << "Unexpected exception thrown: " << e.what(); + } +} + +TEST_F(ParquetReaderTest, DeviceWriteAsyncThrows) +{ + // Create a simple table to write + auto col0 = cudf::test::fixed_width_column_wrapper<int>{{1, 2, 3, 4, 5}}; + auto table_to_write = table_view{{col0}}; + + auto throwing_sink = std::make_unique<cudf::test::ThrowingDeviceWriteDataSink>(); + + cudf::io::parquet_writer_options write_args = cudf::io::parquet_writer_options::builder( + cudf::io::sink_info{throwing_sink.get()}, table_to_write); + + // The write_parquet call should either succeed or throw AsyncException. + try { + cudf::io::write_parquet(write_args); + // Test passes if no exception is thrown + } catch (const cudf::test::AsyncException&) { + // Test passes if AsyncException is thrown (expected test exception) + } catch (const std::exception& e) { + // Test fails if any other exception is thrown + FAIL() << "Unexpected exception thrown: " << e.what(); + } +} From 94b245fb5f7e5cf4d0cda78486a0f85853ff1782 Mon Sep 17 00:00:00 2001 From: Gary Shen Date: Mon, 11 Aug 2025 15:27:55 +0800 Subject: [PATCH 089/366] Use cudaDeviceGetAttribute to get ComputeMode for CUDA13 (#19645) CUDA 13 no longer supports reading the compute mode from cudaDeviceProp.computeMode, so the code is changed to use cudaDeviceGetAttribute instead. The macro check CUDART_VERSION >= 13000 detects a CUDA 13 build, and the original cudaDeviceProp path is kept for CUDA 12 in the else branch. closes #19644 Authors: - Gary Shen (https://github.com/GaryShen2008) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19645 --- java/src/main/native/src/CudaJni.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 1fca42a18c4..c5359c821ae 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -199,9 +199,19 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv* env cudf::jni::auto_set_device(env); int device; CUDF_CUDA_TRY(cudaGetDevice(&device)); + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 13000 + // CUDA 13.0+ removed computeMode from cudaDeviceProp + // Return computeMode from cudaDeviceGetAttribute + int compute_mode; + CUDF_CUDA_TRY(cudaDeviceGetAttribute(&compute_mode, cudaDevAttrComputeMode, device)); + return compute_mode; +#else + // CUDA 12.x and earlier cudaDeviceProp device_prop; CUDF_CUDA_TRY(cudaGetDeviceProperties(&device_prop, device)); return device_prop.computeMode; +#endif } CATCH_STD(env, -2); } From 78c8a953ed7d21a37cdc8a7e8fe6048af8e38a90 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 11 Aug 2025 08:17:37 -0400 Subject: [PATCH 090/366] Simplify cudf::scalar usage in reduce utility (#19608) Fixes the `cudf::reduction::detail::reduce` internal utility to use the returned `cudf::scalar` instances directly in the CUB calls to simplify the logic. This should help solve the issues for building/running #19119 -- the device-scalar ctors are no longer required.
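For illustration, the simplified pattern constructs the output scalar up front and has CUB reduce directly into its storage (a condensed sketch of the code in the diff below, not a complete function):

    using ScalarType = cudf::scalar_type_t<OutputType>;
    auto result = std::make_unique<ScalarType>(initial_value, true, stream, mr);
    cub::DeviceReduce::Reduce(d_temp_storage.data(), temp_storage_bytes, d_in,
                              result->data(),  // write the reduction result into the scalar
                              num_items, binary_op, initial_value, stream.value());
    return result;  // no temporary rmm::device_scalar and no extra scalar ctor call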
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19608 --- .../cudf/reduction/detail/reduction.cuh | 58 +++++++++---------- cpp/src/copying/get_element.cu | 20 +++---- cpp/src/scalar/scalar_factories.cpp | 19 +++--- 3 files changed, 41 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/reduction/detail/reduction.cuh b/cpp/include/cudf/reduction/detail/reduction.cuh index 32c618e3b60..6b15c8fb4c0 100644 --- a/cpp/include/cudf/reduction/detail/reduction.cuh +++ b/cpp/include/cudf/reduction/detail/reduction.cuh @@ -40,18 +40,17 @@ namespace detail { /** * @brief Compute the specified simple reduction over the input range of elements. * - * @param[in] d_in the begin iterator - * @param[in] num_items the number of items - * @param[in] op the reduction operator - * @param[in] init Optional initial value of the reduction - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * @param[in] mr Device memory resource used to allocate the returned scalar's device - * memory - * @returns Output scalar in device memory - * * @tparam Op the reduction operator with device binary operator * @tparam InputIterator the input column iterator * @tparam OutputType the output type of reduction + * + * @param d_in the begin iterator + * @param num_items the number of items + * @param op the reduction operator + * @param init Optional initial value of the reduction + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @returns Output scalar in device memory */ template reduce(InputIterator d_in, { auto const binary_op = cudf::detail::cast_functor(op.get_binary_op()); auto const initial_value = init.value_or(op.template get_identity()); - auto dev_result = rmm::device_scalar{initial_value, stream, mr}; + using ScalarType = cudf::scalar_type_t; + auto result = std::make_unique(initial_value, true, stream, mr); // Allocate temporary storage rmm::device_buffer d_temp_storage; @@ -74,7 +74,7 @@ std::unique_ptr reduce(InputIterator d_in, cub::DeviceReduce::Reduce(d_temp_storage.data(), temp_storage_bytes, d_in, - dev_result.data(), + result->data(), num_items, binary_op, initial_value, @@ -85,15 +85,12 @@ std::unique_ptr reduce(InputIterator d_in, cub::DeviceReduce::Reduce(d_temp_storage.data(), temp_storage_bytes, d_in, - dev_result.data(), + result->data(), num_items, binary_op, initial_value, stream.value()); - - // only for string_view, data is copied - auto s = new cudf::scalar_type_t(std::move(dev_result), true, stream, mr); - return std::unique_ptr(s); + return result; } template reduce(InputIterator d_in, initial_value, stream.value()); - using ScalarType = cudf::scalar_type_t; - auto s = new ScalarType(dev_result, true, stream, mr); // only for string_view, data is copied - return std::unique_ptr(s); + return std::make_unique(dev_result, true, stream, mr); } /** * @brief compute reduction by the compound operator (reduce and transform) * - * @param[in] d_in the begin iterator - * @param[in] num_items the number of items - * @param[in] op the reduction operator - * @param[in] valid_count Number of valid items - * @param[in] ddof Delta degrees of freedom used for standard deviation and variance - * @param[in] init Optional initial value of the 
reduction - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * @param[in] mr Device memory resource used to allocate the returned scalar's device - * memory - * @returns Output scalar in device memory - * * The reduction operator must have `intermediate::compute_result()` method. * This method performs reduction using binary operator `Op::Op` and transforms the * result to `OutputType` using `compute_result()` transform method. @@ -177,6 +161,16 @@ std::unique_ptr reduce(InputIterator d_in, * @tparam Op the reduction operator with device binary operator * @tparam InputIterator the input column iterator * @tparam OutputType the output type of reduction + * + * @param d_in the begin iterator + * @param num_items the number of items + * @param op the reduction operator + * @param valid_count Number of valid items + * @param ddof Delta degrees of freedom used for standard deviation and variance + * @param init Optional initial value of the reduction + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @returns Output scalar in device memory */ template reduce(InputIterator d_in, // compute the result value from intermediate value in device using ScalarType = cudf::scalar_type_t; - auto result = new ScalarType(OutputType{0}, true, stream, mr); + auto result = std::make_unique(OutputType{0}, true, stream, mr); thrust::for_each_n(rmm::exec_policy(stream), intermediate_result.data(), 1, [dres = result->data(), op, valid_count, ddof] __device__(auto i) { *dres = op.template compute_result(i, valid_count, ddof); }); - return std::unique_ptr(result); + return result; } } // namespace detail diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 80b0bd5242f..b492f774a46 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -108,9 +108,7 @@ struct get_element_functor { stream); if (!key_index_scalar.is_valid(stream)) { - auto null_result = make_default_constructed_scalar(dict_view.keys().type(), stream, mr); - null_result->set_valid_async(false, stream); - return null_result; + return make_default_constructed_scalar(dict_view.keys().type(), stream, mr); } // retrieve the key element using the key-index @@ -156,12 +154,12 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); - cudf::detail::device_scalar<Type> temp_data(stream, mr); - cudf::detail::device_scalar<bool> temp_valid(stream, mr); + auto result = std::make_unique<fixed_point_scalar<Type>>( + Type{}, numeric::scale_type{input.type().scale()}, false, stream, mr); device_single_thread( - [buffer = temp_data.data(), - validity = temp_valid.data(), + [buffer = result->data(), + validity = result->validity_data(), d_col = *device_col, index] __device__() mutable { *buffer = d_col.element<Type>(index); @@ -169,11 +167,7 @@ struct get_element_functor { }, stream); - return std::make_unique<fixed_point_scalar<Type>>(std::move(temp_data), - numeric::scale_type{input.type().scale()}, - temp_valid.value(stream), - stream, - mr); + return result; } template >* p = nullptr> diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index 9f242bdffe0..b31c47c6cff 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,25 +27,22 @@ namespace cudf { namespace { struct scalar_construction_helper { template <typename T, - typename ScalarType = scalar_type_t<T>, std::enable_if_t<is_fixed_width<T>() and not is_fixed_point<T>()>* = nullptr> std::unique_ptr<scalar> operator()(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - using Type = device_storage_type_t<T>; - auto s = new ScalarType(Type{}, false, stream, mr); - return std::unique_ptr<scalar>(s); + using Type = device_storage_type_t<T>; + using ScalarType = scalar_type_t<T>; + return std::make_unique<ScalarType>(Type{}, false, stream, mr); } - template <typename T, - typename ScalarType = scalar_type_t<T>, - std::enable_if_t<is_fixed_point<T>()>* = nullptr> + template <typename T, std::enable_if_t<is_fixed_point<T>()>* = nullptr> std::unique_ptr<scalar> operator()(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - using Type = device_storage_type_t<T>; - auto s = new ScalarType(Type{}, numeric::scale_type{0}, false, stream, mr); - return std::unique_ptr<scalar>(s); + using Type = device_storage_type_t<T>; + using ScalarType = scalar_type_t<T>; + return std::make_unique<ScalarType>(Type{}, numeric::scale_type{0}, false, stream, mr); } template <typename T, std::enable_if_t<not is_fixed_width<T>()>* = nullptr> From 314dcbc72feda715830815a74a0ad3f91ae493e1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 11 Aug 2025 11:25:40 -0500 Subject: [PATCH 091/366] Make `DataFrame.dtypes` not fallback to CPU always (#19627) Fixes: #19620 This PR returns the corresponding pandas dtypes for `CategoricalDtype` and `IntervalDtype` only in pandas compatibility mode so that correct dtypes are present with `cudf.pandas` enabled. This change introduces 9 low-priority failures that are now xfailed.
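A sketch of the intended user-visible behavior under `cudf.pandas` (illustrative only; any session details beyond the diff below are assumptions):

    # python -m cudf.pandas script.py, or %load_ext cudf.pandas in IPython
    import pandas as pd
    df = pd.DataFrame({"a": pd.Series(["x", "y"], dtype="category")})
    # df.dtypes is now computed on the GPU; cudf CategoricalDtype/IntervalDtype
    # entries are converted with .to_pandas() instead of falling back to CPU
    df.dtypes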
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19627 --- python/cudf/cudf/core/dataframe.py | 23 ++++++++++- python/cudf/cudf/pandas/_wrappers/pandas.py | 19 +++++++++- .../cudf/pandas/scripts/conftest-patch.py | 38 +++++++++++-------- 3 files changed, 63 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ff0fe0e0564..9833768a15a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -70,6 +70,7 @@ Decimal64Dtype, Decimal128Dtype, IntervalDtype, + ListDtype, StructDtype, ) from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template @@ -1252,7 +1253,27 @@ def dtypes(self) -> pd.Series: string object dtype: object """ - return pd.Series(dict(self._dtypes), dtype="object") + result_dict = dict(self._dtypes) + if cudf.get_option("mode.pandas_compatible"): + for key, value in result_dict.items(): + if isinstance( + value, + ( + ListDtype, + StructDtype, + Decimal32Dtype, + Decimal64Dtype, + Decimal128Dtype, + ), + ): + raise TypeError( + f"Column '{key}' has {type(value).__name__}, which is not supported in pandas." + ) + + result = pd.Series( + result_dict, index=self._data.to_pandas_index, dtype="object" + ) + return result @property def ndim(self) -> int: diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 6f35412282d..492334c9416 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -266,6 +266,23 @@ def ignore_ipython_canary_check(self, **kwargs): ) +def _DataFrame_dtypes_apply_func(value): + if isinstance(value, (cudf.CategoricalDtype, cudf.IntervalDtype)): + return value.to_pandas() + return value + + +def _DataFrame__dtypes(self): + result = _fast_slow_function_call( + lambda self: self.dtypes, + self, + )[0] + result = _maybe_wrap_result( + result._fsproxy_slow.apply(_DataFrame_dtypes_apply_func), None + ) + return result + + DataFrame = make_final_proxy_type( "DataFrame", cudf.DataFrame, @@ -280,7 +297,7 @@ def ignore_ipython_canary_check(self, **kwargs): "_constructor_sliced": _FastSlowAttribute("_constructor_sliced"), "_accessors": set(), "_ipython_canary_method_should_not_exist_": ignore_ipython_canary_check, - "dtypes": _FastSlowAttribute("dtypes", private=True), + "dtypes": property(_DataFrame__dtypes), }, ) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index c48e4058c02..c9795a643a9 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -88,21 +88,6 @@ def pytest_unconfigure(config): # TODO: Pass these tests with cudf.pandas enabled. 
NODEIDS_THAT_FAIL_WITH_CUDF_PANDAS = { - "tests/arrays/categorical/test_analytics.py::TestCategoricalAnalytics::test_searchsorted[False]", - "tests/arrays/categorical/test_analytics.py::TestCategoricalAnalytics::test_searchsorted[None]", - "tests/arrays/categorical/test_analytics.py::TestCategoricalAnalytics::test_searchsorted[True]", - "tests/arrays/test_datetimelike.py::test_searchsorted_datetimelike_with_listlike_invalid_dtype[arg0-values1]", - "tests/extension/test_categorical.py::TestCategorical::test_searchsorted[True]", - "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_nearest_decreasing[backfill-expected1]", - "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_nearest_decreasing[nearest-expected2]", - "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_nearest_decreasing[pad-expected0]", - "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_nearest_error", - "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_with_method_numeric_vs_bool[backfill]", - "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_with_method_numeric_vs_bool[nearest]", - "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_with_method_numeric_vs_bool[pad]", - "tests/indexes/ranges/test_indexing.py::TestGetIndexer::test_get_indexer_limit", - "tests/indexes/timedeltas/test_searchsorted.py::TestSearchSorted::test_searchsorted_invalid_argument_dtype[arg0]", - "tests/indexing/test_loc.py::TestLocBaseIndependent::test_loc_setitem_frame_with_inverted_slice", "tests/api/test_api.py::test_pandas_array_alias", "tests/apply/test_frame_apply.py::test_agg_transform[axis='columns']", "tests/apply/test_frame_apply.py::test_agg_transform[axis='index']", @@ -760,6 +745,9 @@ def pytest_unconfigure(config): "tests/arrays/boolean/test_reduction.py::test_reductions_return_types[count-True]", "tests/arrays/categorical/test_analytics.py::TestCategoricalAnalytics::test_memory_usage", "tests/arrays/categorical/test_analytics.py::TestCategoricalAnalytics::test_min_max_ordered[array]", + "tests/arrays/categorical/test_analytics.py::TestCategoricalAnalytics::test_searchsorted[False]", + "tests/arrays/categorical/test_analytics.py::TestCategoricalAnalytics::test_searchsorted[None]", + "tests/arrays/categorical/test_analytics.py::TestCategoricalAnalytics::test_searchsorted[True]", "tests/arrays/categorical/test_api.py::TestCategoricalAPI::test_rename_categories", "tests/arrays/categorical/test_api.py::TestCategoricalAPI::test_set_categories", "tests/arrays/categorical/test_api.py::TestPrivateCategoricalAPI::test_codes_immutable", @@ -1347,6 +1335,7 @@ def pytest_unconfigure(config): "tests/arrays/test_datetimelike.py::TestPeriodArray::test_array_interface[YE]", "tests/arrays/test_datetimelike.py::TestTimedeltaArray::test_array_interface", "tests/arrays/test_datetimelike.py::TestTimedeltaArray::test_searchsorted_castable_strings[pyarrow_numpy-series]", + "tests/arrays/test_datetimelike.py::test_searchsorted_datetimelike_with_listlike_invalid_dtype[arg0-values1]", "tests/arrays/test_datetimes.py::TestDatetimeArray::test_array_interface", "tests/arrays/test_datetimes.py::TestDatetimeArray::test_astype_copies[datetime64[ns]-datetime64[ns]]", "tests/arrays/test_datetimes.py::TestDatetimeArray::test_astype_to_same", @@ -3764,6 +3753,7 @@ def pytest_unconfigure(config): "tests/extension/test_categorical.py::TestCategorical::test_reduce_series_boolean[any-False]", 
"tests/extension/test_categorical.py::TestCategorical::test_reduce_series_boolean[any-True]", "tests/extension/test_categorical.py::TestCategorical::test_reindex_non_na_fill_value", + "tests/extension/test_categorical.py::TestCategorical::test_searchsorted[True]", "tests/extension/test_categorical.py::TestCategorical::test_series_constructor", "tests/extension/test_categorical.py::TestCategorical::test_series_constructor_scalar_na_with_index", "tests/extension/test_categorical.py::TestCategorical::test_setitem_frame_2d_values", @@ -8180,11 +8170,18 @@ def pytest_unconfigure(config): "tests/indexes/numeric/test_astype.py::TestAstype::test_cannot_cast_inf_to_int[nan-int]", "tests/indexes/numeric/test_indexing.py::TestContains::test_contains_float64_nans", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_invalid", + "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_nearest_decreasing[backfill-expected1]", + "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_nearest_decreasing[nearest-expected2]", + "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_nearest_decreasing[pad-expected0]", + "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_nearest_error", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_numeric_index_boolean_target[get_indexer-float64]", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_numeric_index_boolean_target[get_indexer-int64]", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_numeric_index_boolean_target[get_indexer-range]", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_numeric_index_boolean_target[get_indexer-uint64]", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_numeric_vs_bool", + "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_with_method_numeric_vs_bool[backfill]", + "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_with_method_numeric_vs_bool[nearest]", + "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_with_method_numeric_vs_bool[pad]", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_loc_masked_na_and_nan", "tests/indexes/numeric/test_numeric.py::TestFloatNumericIndex::test_equals_numeric", "tests/indexes/numeric/test_numeric.py::TestFloatNumericIndex::test_type_coercion_fail[int16]", @@ -8223,6 +8220,7 @@ def pytest_unconfigure(config): "tests/indexes/ranges/test_constructors.py::TestRangeIndexConstructors::test_constructor[args5-kwargs5-0-0-1-foo]", "tests/indexes/ranges/test_constructors.py::TestRangeIndexConstructors::test_constructor[args6-kwargs6-0-0-1-None]", "tests/indexes/ranges/test_constructors.py::TestRangeIndexConstructors::test_constructor[args6-kwargs6-0-0-1-foo]", + "tests/indexes/ranges/test_indexing.py::TestGetIndexer::test_get_indexer_limit", "tests/indexes/ranges/test_join.py::TestJoin::test_join_self[inner]", "tests/indexes/ranges/test_join.py::TestJoin::test_join_self[left]", "tests/indexes/ranges/test_join.py::TestJoin::test_join_self[outer]", @@ -8751,6 +8749,7 @@ def pytest_unconfigure(config): "tests/indexes/timedeltas/test_formats.py::TestTimedeltaIndexRendering::test_representation[__repr__]", "tests/indexes/timedeltas/test_formats.py::TestTimedeltaIndexRendering::test_representation[__str__]", "tests/indexes/timedeltas/test_indexing.py::TestContains::test_contains", + 
"tests/indexes/timedeltas/test_searchsorted.py::TestSearchSorted::test_searchsorted_invalid_argument_dtype[arg0]", "tests/indexes/timedeltas/test_setops.py::TestTimedeltaIndex::test_intersection_equal[False]", "tests/indexes/timedeltas/test_setops.py::TestTimedeltaIndex::test_intersection_equal[None]", "tests/indexes/timedeltas/test_setops.py::TestTimedeltaIndex::test_intersection_non_monotonic[None-rng2-expected2]", @@ -10010,6 +10009,7 @@ def pytest_unconfigure(config): "tests/indexing/test_loc.py::TestLocBaseIndependent::test_loc_setitem_consistency[val2]", "tests/indexing/test_loc.py::TestLocBaseIndependent::test_loc_setitem_consistency_dt64_to_float", "tests/indexing/test_loc.py::TestLocBaseIndependent::test_loc_setitem_empty_append_expands_rows_mixed_dtype", + "tests/indexing/test_loc.py::TestLocBaseIndependent::test_loc_setitem_frame_with_inverted_slice", "tests/indexing/test_loc.py::TestLocBaseIndependent::test_loc_uint64_disallow_negative", "tests/indexing/test_loc.py::TestLocBaseIndependent::test_setitem_new_key_tz[loc]", "tests/indexing/test_loc.py::TestLocBaseIndependent::test_setitem_new_key_tz[setitem]", @@ -14991,6 +14991,14 @@ def pytest_unconfigure(config): "tests/window/test_rolling_functions.py::test_rolling_max_resample[None]", "tests/window/test_rolling_functions.py::test_rolling_median_resample", "tests/window/test_rolling_functions.py::test_rolling_min_max_numeric_types[data_type0]", + "tests/window/test_rolling_functions.py::test_rolling_min_max_numeric_types[data_type2]", + "tests/window/test_rolling_functions.py::test_rolling_min_max_numeric_types[data_type3]", + "tests/window/test_rolling_functions.py::test_rolling_min_max_numeric_types[data_type4]", + "tests/window/test_rolling_functions.py::test_rolling_min_max_numeric_types[data_type5]", + "tests/window/test_rolling_functions.py::test_rolling_min_max_numeric_types[data_type6]", + "tests/window/test_rolling_functions.py::test_rolling_min_max_numeric_types[data_type7]", + "tests/window/test_rolling_functions.py::test_rolling_min_max_numeric_types[data_type8]", + "tests/window/test_rolling_functions.py::test_rolling_min_max_numeric_types[data_type9]", "tests/window/test_rolling_functions.py::test_rolling_min_resample[10]", "tests/window/test_rolling_functions.py::test_rolling_min_resample[1]", "tests/window/test_rolling_functions.py::test_rolling_min_resample[2]", From 3caed58d4d532f01c922deabf464ad6c934d9d49 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Aug 2025 13:55:32 -0700 Subject: [PATCH 092/366] Move test_replace.py to new cudf classic directory structure (#19629) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19629 --- .../cudf/tests/dataframe/methods/test_clip.py | 66 + .../tests/dataframe/methods/test_fillna.py | 76 + .../tests/dataframe/methods/test_replace.py | 172 ++ .../tests/dataframe/methods/test_where.py | 81 + .../cudf/tests/series/methods/test_clip.py | 46 + .../cudf/tests/series/methods/test_fillna.py | 496 +++++- .../cudf/tests/series/methods/test_replace.py | 527 ++++++ .../cudf/tests/series/methods/test_where.py | 47 + python/cudf/cudf/tests/test_replace.py | 1459 ----------------- 9 files changed, 1510 insertions(+), 1460 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_clip.py 
create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_fillna.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_replace.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_where.py create mode 100644 python/cudf/cudf/tests/series/methods/test_clip.py create mode 100644 python/cudf/cudf/tests/series/methods/test_replace.py delete mode 100644 python/cudf/cudf/tests/test_replace.py diff --git a/python/cudf/cudf/tests/dataframe/methods/test_clip.py b/python/cudf/cudf/tests/dataframe/methods/test_clip.py new file mode 100644 index 00000000000..3387368278e --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_clip.py @@ -0,0 +1,66 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + ("lower", "upper"), + [ + ([2, 7.4], [4, 7.9]), + ([2, 7.4], None), + ( + None, + [4, 7.9], + ), + ], +) +def test_dataframe_clip(lower, upper, inplace): + pdf = pd.DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} + ) + gdf = cudf.from_pandas(pdf) + + got = gdf.clip(lower=lower, upper=upper, inplace=inplace) + expect = pdf.clip(lower=lower, upper=upper, axis=1) + + if inplace is True: + assert_eq(expect, gdf) + else: + assert_eq(expect, got) + + +@pytest.mark.parametrize( + ("lower", "upper"), + [("b", "d"), ("b", None), (None, "c"), (None, None)], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_dataframe_category_clip(lower, upper, inplace): + data = ["a", "b", "c", "d", "e"] + pdf = pd.DataFrame({"a": data}) + gdf = cudf.from_pandas(pdf) + gdf["a"] = gdf["a"].astype("category") + + expect = pdf.clip(lower=lower, upper=upper) + got = gdf.clip(lower=lower, upper=upper, inplace=inplace) + + if inplace is True: + assert_eq(expect, gdf.astype("str")) + else: + assert_eq(expect, got.astype("str")) + + +@pytest.mark.parametrize( + ("lower", "upper"), + [([2, 7.4], [4, 7.9, "d"]), ([2, 7.4, "a"], [4, 7.9, "d"])], +) +def test_dataframe_exceptions_for_clip(lower, upper): + gdf = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} + ) + + with pytest.raises(ValueError): + gdf.clip(lower=lower, upper=upper) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_fillna.py b/python/cudf/cudf/tests/dataframe/methods/test_fillna.py new file mode 100644 index 00000000000..8a121cad33d --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_fillna.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]}), + pd.DataFrame( + {"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"] + ), + pd.DataFrame({"a": [1, 2, 3]}), + ], +) +@pytest.mark.parametrize( + "value", + [ + 10, + pd.Series([10, 20, 30]), + pd.Series([3, 4, 5]), + pd.Series([10, 20, 30], index=["z", "a", "p"]), + {"a": 5, "b": pd.Series([3, 4, 5])}, + {"a": 5001}, + {"b": pd.Series([11, 22, 33], index=["a", "p", "z"])}, + {"a": 5, "b": pd.Series([3, 4, 5], index=["a", "p", "z"])}, + {"c": 100}, + np.nan, + ], +) +def test_fillna_dataframe(pdf, value, inplace): + if inplace: + pdf = pdf.copy(deep=True) + gdf = cudf.from_pandas(pdf) + + fill_value_pd = value + if isinstance(fill_value_pd, (pd.Series, pd.DataFrame)): + fill_value_cudf = cudf.from_pandas(fill_value_pd) + elif isinstance(fill_value_pd, dict): + fill_value_cudf = {} + for key in fill_value_pd: + temp_val = fill_value_pd[key] + if isinstance(temp_val, pd.Series): + temp_val = cudf.from_pandas(temp_val) + fill_value_cudf[key] = temp_val + else: + fill_value_cudf = value + + expect = pdf.fillna(fill_value_pd, inplace=inplace) + got = gdf.fillna(fill_value_cudf, inplace=inplace) + + if inplace: + got = gdf + expect = pdf + + assert_eq(expect, got) + + +def test_fillna_columns_multiindex(): + columns = pd.MultiIndex.from_tuples([("a", "b"), ("d", "e")]) + pdf = pd.DataFrame( + {"0": [1, 2, None, 3, None], "1": [None, None, None, None, 4]} + ) + pdf.columns = columns + gdf = cudf.from_pandas(pdf) + + expected = pdf.fillna(10) + actual = gdf.fillna(10) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_replace.py b/python/cudf/cudf/tests/dataframe/methods/test_replace.py new file mode 100644 index 00000000000..9f6b9007de7 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_replace.py @@ -0,0 +1,172 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, + expect_warning_if, +) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning introduced in pandas-2.2.0", +) +@pytest.mark.parametrize( + "data, dtype", + [ + ( + { + "a": [0, 1, None, 2, 3], + "b": [3, 2, 2, 3, None], + "c": ["abc", "def", ".", None, None], + }, + None, + ), + ( + { + "a": ["one", "two", None, "three"], + "b": ["one", None, "two", "three"], + }, + "category", + ), + ( + { + "col one": [None, 10, 11, None, 1000, 500, 600], + "col two": ["abc", "def", "ghi", None, "pp", None, "a"], + "a": [0.324, 0.234, 324.342, 23.32, 9.9, None, None], + }, + None, + ), + ], +) +@pytest.mark.parametrize( + "to_replace,value", + [ + (0, 4), + ([0, 1], [4, 5]), + ([0, 1], 4), + ({"a": 0, "b": 0}, {"a": 4, "b": 5}), + ({"a": 0}, {"a": 4}), + ("abc", "---"), + ([".", "gh"], "hi"), + ([".", "def"], ["_", None]), + ({"c": 0}, {"a": 4, "b": 5}), + ({"a": 2}, {"c": "a"}), + ("two", "three"), + ([1, 2], pd.Series([10, 11])), + (pd.Series([10, 11], index=[3, 2]), None), + ( + pd.Series(["a+", "+c", "p", "---"], index=["abc", "gh", "l", "z"]), + None, + ), + ( + pd.Series([10, 11], index=[3, 2]), + {"a": [-10, -30], "l": [-111, -222]}, + ), + (pd.Series([10, 11], index=[3, 2]), 555), + ( + pd.Series([10, 11], index=["a", "b"]), + pd.Series([555, 1111], index=["a", "b"]), + ), + ({"a": "2", "b": "3", "zzz": "hi"}, None), + ({"a": 2, "b": 3, "zzz": "hi"}, 324353), + ( + {"a": 2, "b": 3, "zzz": "hi"}, + pd.Series([5, 6, 10], index=["a", "b", "col one"]), + ), + ], +) +def test_dataframe_replace(data, dtype, to_replace, value): + gdf = cudf.DataFrame(data, dtype=dtype) + pdf = gdf.to_pandas() + + pd_value = value + if isinstance(value, pd.Series): + gd_value = cudf.from_pandas(value) + else: + gd_value = value + + pd_to_replace = to_replace + if isinstance(to_replace, pd.Series): + gd_to_replace = cudf.from_pandas(to_replace) + else: + gd_to_replace = to_replace + + can_warn = ( + isinstance(gdf["a"].dtype, cudf.CategoricalDtype) + and isinstance(to_replace, str) + and to_replace == "two" + and isinstance(value, str) + and value == "three" + ) + with expect_warning_if(can_warn): + if pd_value is None: + expected = pdf.replace(to_replace=pd_to_replace) + else: + expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) + with expect_warning_if(can_warn): + actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) + + expected_sorted = expected.sort_values(by=list(expected.columns), axis=0) + actual_sorted = actual.sort_values(by=list(actual.columns), axis=0) + + assert_eq(expected_sorted, actual_sorted) + + +def test_dataframe_replace_with_nulls(): + # numerical + pdf1 = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]}) + gdf1 = cudf.from_pandas(pdf1) + pdf2 = pdf1.replace(0, 4) + gdf2 = gdf1.replace(0, None).fillna(4) + assert_eq(gdf2, pdf2) + + # list input + pdf6 = pdf1.replace([0, 1], [4, 5]) + gdf6 = gdf1.replace([0, 1], [4, None]).fillna(5) + assert_eq(gdf6, pdf6) + + pdf7 = pdf1.replace([0, 1], 4) + gdf7 = gdf1.replace([0, 1], None).fillna(4) + assert_eq(gdf7, pdf7) + + # dict input: + pdf8 = pdf1.replace({"a": 0, "b": 0}, {"a": 4, "b": 5}) + gdf8 = gdf1.replace({"a": 0, "b": 0}, {"a": None, "b": 5}).fillna(4) + assert_eq(gdf8, pdf8) + + gdf1 = cudf.DataFrame({"a": [0, 1, 2, 3], "b": 
[0, 1, 2, None]}) + gdf9 = gdf1.replace([0, 1], [4, 5]).fillna(3) + assert_eq(gdf9, pdf6) + + +def test_replace_df_error(): + pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}) + gdf = cudf.from_pandas(pdf) + + assert_exceptions_equal( + lfunc=pdf.replace, + rfunc=gdf.replace, + lfunc_args_and_kwargs=([], {"to_replace": -1, "value": []}), + rfunc_args_and_kwargs=([], {"to_replace": -1, "value": []}), + ) + + +def test_replace_multiple_rows(datadir): + path = datadir / "parquet" / "replace_multiple_rows.parquet" + pdf = pd.read_parquet(path) + gdf = cudf.read_parquet(path) + + pdf.replace([np.inf, -np.inf], np.nan, inplace=True) + gdf.replace([np.inf, -np.inf], np.nan, inplace=True) + + assert_eq(pdf, gdf, check_dtype=False) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_where.py b/python/cudf/cudf/tests/dataframe/methods/test_where.py new file mode 100644 index 00000000000..f7af9945272 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_where.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("fill_value", [[888, 999]]) +def test_dataframe_with_nulls_where_with_scalars(fill_value): + pdf = pd.DataFrame( + { + "A": [-1, 2, -3, None, 5, 6, -7, 0], + "B": [4, -2, 3, None, 7, 6, 8, 0], + } + ) + gdf = cudf.from_pandas(pdf) + + expect = pdf.where(pdf % 3 == 0, fill_value) + got = gdf.where(gdf % 3 == 0, fill_value) + + assert_eq(expect, got) + + +def test_dataframe_with_different_types(): + # Testing for int and float + pdf = pd.DataFrame( + {"A": [111, 22, 31, 410, 56], "B": [-10.12, 121.2, 45.7, 98.4, 87.6]} + ) + gdf = cudf.from_pandas(pdf) + expect = pdf.where(pdf > 50, -pdf) + got = gdf.where(gdf > 50, -gdf) + + assert_eq(expect, got) + + # Testing for string + pdf = pd.DataFrame({"A": ["a", "bc", "cde", "fghi"]}) + gdf = cudf.from_pandas(pdf) + pdf_mask = pd.DataFrame({"A": [True, False, True, False]}) + gdf_mask = cudf.from_pandas(pdf_mask) + expect = pdf.where(pdf_mask, ["cudf"]) + got = gdf.where(gdf_mask, ["cudf"]) + + assert_eq(expect, got) + + # Testing for categorical + pdf = pd.DataFrame({"A": ["a", "b", "b", "c"]}) + pdf["A"] = pdf["A"].astype("category") + gdf = cudf.from_pandas(pdf) + expect = pdf.where(pdf_mask, "c") + got = gdf.where(gdf_mask, ["c"]) + + assert_eq(expect, got) + + +def test_dataframe_where_with_different_options(): + pdf = pd.DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]}) + gdf = cudf.from_pandas(pdf) + + # numpy array + boolean_mask = np.array([[False, True], [True, False], [False, True]]) + + expect = pdf.where(boolean_mask, -pdf) + got = gdf.where(boolean_mask, -gdf) + + assert_eq(expect, got) + + # with single scalar + expect = pdf.where(boolean_mask, 8) + got = gdf.where(boolean_mask, 8) + + assert_eq(expect, got) + + # with multi scalar + expect = pdf.where(boolean_mask, [8, 9]) + got = gdf.where(boolean_mask, [8, 9]) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/series/methods/test_clip.py b/python/cudf/cudf/tests/series/methods/test_clip.py new file mode 100644 index 00000000000..0a3fc787ef1 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_clip.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION.
+ + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + ("data", "lower", "upper"), + [ + ([1, 2, 3, 4, 5], 2, 4), + ([1, 2, 3, 4, 5], 2, None), + ([1, 2, 3, 4, 5], None, 4), + ([1, 2, 3, 4, 5], None, None), + ([1, 2, 3, 4, 5], 4, 2), + ([1.0, 2.0, 3.0, 4.0, 5.0], 4, 2), + (pd.Series([1, 2, 3, 4, 5], dtype="int32"), 4, 2), + (["a", "b", "c", "d", "e"], "b", "d"), + (["a", "b", "c", "d", "e"], "b", None), + (["a", "b", "c", "d", "e"], None, "d"), + (["a", "b", "c", "d", "e"], "d", "b"), + ], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_series_clip(data, lower, upper, inplace): + psr = pd.Series(data) + gsr = cudf.from_pandas(psr) + + expect = psr.clip(lower=lower, upper=upper) + got = gsr.clip(lower=lower, upper=upper, inplace=inplace) + + if inplace is True: + assert_eq(expect, gsr) + else: + assert_eq(expect, got) + + +def test_series_exceptions_for_clip(): + with pytest.raises(ValueError): + cudf.Series([1, 2, 3, 4]).clip([1, 2], [2, 3]) + + with pytest.raises(NotImplementedError): + cudf.Series([1, 2, 3, 4]).clip(1, 2, axis=0) diff --git a/python/cudf/cudf/tests/series/methods/test_fillna.py b/python/cudf/cudf/tests/series/methods/test_fillna.py index e64fb209519..094b27c4fff 100644 --- a/python/cudf/cudf/tests/series/methods/test_fillna.py +++ b/python/cudf/cudf/tests/series/methods/test_fillna.py @@ -1,10 +1,19 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. +import decimal import numpy as np +import pandas as pd +import pyarrow as pa import pytest import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal @pytest.mark.parametrize( @@ -16,7 +25,6 @@ [np.nan, 1, 10, 393.32, np.nan], ], ) -@pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize("fill_value", [1.2, 332, np.nan]) def test_fillna_with_nan(data, nan_as_null, fill_value): gs = cudf.Series(data, dtype="float64", nan_as_null=nan_as_null) @@ -79,3 +87,489 @@ def test_timedelta_fillna(data, timedelta_types_as_str, fill_value): actual = actual.dropna() assert_eq(expected, actual) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize( + "data", + [ + [1, None, None, 2, 3, 4], + [None, None, 1, 2, None, 3, 4], + [1, 2, None, 3, 4, None, None], + [0] + [None] * 14, + [None] * 14 + [0], + ], +) +@pytest.mark.parametrize("container", [pd.Series, pd.DataFrame]) +@pytest.mark.parametrize("method", ["ffill", "bfill"]) +def test_fillna_method_numerical( + data, container, numeric_types_as_str, method, inplace +): + if container == pd.DataFrame: + data = {"a": data, "b": data, "c": data} + + pdata = container(data) + + data_dtype = numeric_types_as_str + if np.dtype(numeric_types_as_str).kind != "f": + data_dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[ + np.dtype(numeric_types_as_str) + ] + pdata = pdata.astype(data_dtype) + + # Explicitly using nan_as_null=True + gdata = cudf.from_pandas(pdata, nan_as_null=True) + + with pytest.warns(FutureWarning): + expected = pdata.fillna(method=method, inplace=inplace) + with pytest.warns(FutureWarning): + actual = gdata.fillna(method=method, inplace=inplace) + + if inplace: + expected = pdata + actual = gdata + + assert_eq(expected, actual, check_dtype=False) + + +@pytest.mark.parametrize( + "gsr_data, dtype", + [ + ( +
["2.34", "5.2", "7.47", None, "92.29", None], + cudf.Decimal64Dtype(7, 2), + ), + ( + ["-74.56", None, "-23.73", "34.55", "2.89", None], + cudf.Decimal32Dtype(7, 2), + ), + ( + ["85.955", np.nan, "-3.243", np.nan, "29.492", np.nan], + cudf.Decimal64Dtype(8, 3), + ), + ( + ["2.964", None, "57.432", "-989.330", None, "56.444"], + cudf.Decimal64Dtype(8, 3), + ), + ( + [np.nan, "55.2498", np.nan, "-5.2965", "-28.9423", np.nan], + cudf.Decimal64Dtype(10, 4), + ), + ( + ["2.964", None, "54347.432", "-989.330", None, "56.444"], + cudf.Decimal128Dtype(20, 7), + ), + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + lambda: 42, + lambda: -123, + lambda: decimal.Decimal("8.2"), + lambda: decimal.Decimal("-12.87"), + lambda: cudf.Series( + [None, -854, 9533, -274, -845, 7924], dtype="int32" + ), + lambda: cudf.Series( + ["-53.5", "13.4", "-64.3", None, "42.42", None] + ).astype(cudf.Decimal64Dtype(7, 2)), + lambda: cudf.Series( + ["57.45", np.nan, np.nan, "686.49", "-55.5", "73.24"], + ).astype(cudf.Decimal64Dtype(7, 2)), + ], +) +def test_fillna_decimal(gsr_data, dtype, fill_value, inplace): + gsr = cudf.Series(gsr_data).astype(dtype) + psr = gsr.to_pandas() + fill_value = fill_value() + if isinstance(fill_value, cudf.Series): + p_fill_value = fill_value.to_pandas() + else: + p_fill_value = fill_value + + expected = psr.fillna(p_fill_value, inplace=inplace) + got = gsr.fillna(fill_value, inplace=inplace) + + assert_eq(expected, got, check_dtype=False) + + +@pytest.mark.parametrize( + "psr", + [ + pd.Series(["a", "b", "a", None, "c", None], dtype="category"), + pd.Series( + ["a", "b", "a", None, "c", None], + dtype="category", + index=["q", "r", "z", "a", "b", "c"], + ), + pd.Series( + ["a", "b", "a", None, "c", None], + dtype="category", + index=["x", "t", "p", "q", "r", "z"], + ), + pd.Series(["a", "b", "a", np.nan, "c", np.nan], dtype="category"), + pd.Series( + [None, None, None, None, None, None, "a", "b", "c"], + dtype="category", + ), + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + "c", + pd.Series(["c", "c", "c", "c", "c", "a"], dtype="category"), + pd.Series( + ["a", "b", "a", None, "c", None], + dtype="category", + index=["x", "t", "p", "q", "r", "z"], + ), + pd.Series( + ["a", "b", "a", None, "c", None], + dtype="category", + index=["q", "r", "z", "a", "b", "c"], + ), + pd.Series(["a", "b", "a", None, "c", None], dtype="category"), + pd.Series(["a", "b", "a", np.nan, "c", np.nan], dtype="category"), + ], +) +def test_fillna_categorical(psr, fill_value, inplace): + if inplace: + psr = psr.copy(deep=True) + gsr = cudf.from_pandas(psr) + + if isinstance(fill_value, pd.Series): + fill_value_cudf = cudf.from_pandas(fill_value) + else: + fill_value_cudf = fill_value + + if ( + isinstance(fill_value_cudf, cudf.Series) + and gsr.dtype != fill_value_cudf.dtype + ): + assert_exceptions_equal( + lfunc=psr.fillna, + rfunc=gsr.fillna, + lfunc_args_and_kwargs=([fill_value], {"inplace": inplace}), + rfunc_args_and_kwargs=([fill_value_cudf], {"inplace": inplace}), + ) + else: + expected = psr.fillna(fill_value, inplace=inplace) + got = gsr.fillna(fill_value_cudf, inplace=inplace) + + if inplace: + expected = psr + got = gsr + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "psr", + [ + pd.Series( + pd.date_range( + "2010-01-01", + "2020-01-10", + freq="1YE" if PANDAS_GE_220 else "1y", + ) + ), + pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), + pd.Series( + [ + None, + None, + None, + None, + None, + None, + "2011-10-10", + "2010-01-01", + "2010-01-02", + 
"2010-01-04", + "2010-11-01", + ], + dtype="datetime64[ns]", + ), + pd.Series( + [ + None, + None, + None, + None, + None, + None, + "2011-10-10", + "2010-01-01", + "2010-01-02", + "2010-01-04", + "2010-11-01", + ], + dtype="datetime64[ns]", + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + ), + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + pd.Timestamp("2010-01-02"), + pd.Series( + pd.date_range( + "2010-01-01", + "2020-01-10", + freq="1YE" if PANDAS_GE_220 else "1y", + ) + ) + + pd.Timedelta("1d"), + pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), + pd.Series( + [ + None, + None, + None, + None, + None, + None, + "2011-10-10", + "2010-01-01", + "2010-01-02", + "2010-01-04", + "2010-11-01", + ], + dtype="datetime64[ns]", + ), + pd.Series( + [ + None, + None, + None, + None, + None, + None, + "2011-10-10", + "2010-01-01", + "2010-01-02", + "2010-01-04", + "2010-11-01", + ], + dtype="datetime64[ns]", + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], + ), + ], +) +def test_fillna_datetime(psr, fill_value, inplace): + if inplace: + psr = psr.copy(deep=True) + gsr = cudf.from_pandas(psr) + + if isinstance(fill_value, pd.Series): + fill_value_cudf = cudf.from_pandas(fill_value) + else: + fill_value_cudf = fill_value + + expected = psr.fillna(fill_value, inplace=inplace) + got = gsr.fillna(fill_value_cudf, inplace=inplace) + + if inplace: + got = gsr + expected = psr + + assert_eq(expected, got) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize( + "data", + [ + # Categorical + pd.Categorical([1, 2, None, None, 3, 4]), + pd.Categorical([None, None, 1, None, 3, 4]), + pd.Categorical([1, 2, None, 3, 4, None, None]), + pd.Categorical(["1", "20", None, None, "3", "40"]), + pd.Categorical([None, None, "10", None, "30", "4"]), + pd.Categorical(["1", "20", None, "30", "4", None, None]), + # Datetime + np.array( + [ + "2020-01-01 08:00:00", + "2020-01-01 09:00:00", + None, + "2020-01-01 10:00:00", + None, + "2020-01-01 10:00:00", + ], + dtype="datetime64[ns]", + ), + np.array( + [ + None, + None, + "2020-01-01 09:00:00", + "2020-01-01 10:00:00", + None, + "2020-01-01 10:00:00", + ], + dtype="datetime64[ns]", + ), + np.array( + [ + "2020-01-01 09:00:00", + None, + None, + "2020-01-01 10:00:00", + None, + None, + ], + dtype="datetime64[ns]", + ), + # Timedelta + np.array( + [10, 100, 1000, None, None, 10, 100, 1000], dtype="datetime64[ns]" + ), + np.array( + [None, None, 10, None, 1000, 100, 10], dtype="datetime64[ns]" + ), + np.array( + [10, 100, None, None, 1000, None, None], dtype="datetime64[ns]" + ), + # String + np.array( + ["10", "100", "1000", None, None, "10", "100", "1000"], + dtype="object", + ), + np.array( + [None, None, "1000", None, "10", "100", "10"], dtype="object" + ), + np.array( + ["10", "100", None, None, "1000", None, None], dtype="object" + ), + ], +) +@pytest.mark.parametrize("container", [pd.Series, pd.DataFrame]) +@pytest.mark.parametrize("method", ["ffill", "bfill"]) +def test_fillna_method_fixed_width_non_num(data, container, method, inplace): + if container == pd.DataFrame: + data = {"a": data, "b": data, "c": data} + + pdata = container(data) + + # Explicitly using nans_as_nulls=True + gdata = cudf.from_pandas(pdata, nan_as_null=True) + + with pytest.warns(FutureWarning): + expected = pdata.fillna(method=method, inplace=inplace) + with pytest.warns(FutureWarning): + actual = 
gdata.fillna(method=method, inplace=inplace) + + if inplace: + expected = pdata + actual = gdata + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "psr", + [ + pd.Series(["a", "b", "c", "d"]), + pd.Series([None] * 4, dtype="object"), + pd.Series(["z", None, "z", None]), + pd.Series(["x", "y", None, None, None]), + pd.Series([None, None, None, "i", "P"]), + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + "a", + pd.Series(["a", "b", "c", "d"]), + pd.Series(["z", None, "z", None]), + pd.Series([None] * 4, dtype="object"), + pd.Series(["x", "y", None, None, None]), + pd.Series([None, None, None, "i", "P"]), + ], +) +def test_fillna_string(psr, fill_value, inplace): + if inplace: + psr = psr.copy(deep=True) + gsr = cudf.from_pandas(psr) + + if isinstance(fill_value, pd.Series): + fill_value_cudf = cudf.from_pandas(fill_value) + else: + fill_value_cudf = fill_value + + expected = psr.fillna(fill_value, inplace=inplace) + got = gsr.fillna(fill_value_cudf, inplace=inplace) + + if inplace: + expected = psr + got = gsr + + assert_eq(expected, got) + + +def test_series_fillna_invalid_dtype(integer_types_as_str): + gdf = cudf.Series([1, 2, None, 3], dtype=integer_types_as_str) + fill_value = 2.5 + msg = ( + f"Cannot safely cast non-equivalent" + f" {type(fill_value).__name__} to {gdf.dtype.type.__name__}" + ) + with pytest.raises(TypeError, match=msg): + gdf.fillna(fill_value) + + +@pytest.mark.parametrize( + "data", [[1, 2.0, 3, 4, None, 1, None, 10, None], ["a", "b", "c"]] +) +@pytest.mark.parametrize( + "index", + [ + None, + [1, 2, 3], + ["a", "b", "z"], + ["a", "b", "c", "d", "e", "f", "g", "l", "m"], + ], +) +@pytest.mark.parametrize("value", [[1, 2, 3, 4, None, 1, None, 10, None]]) +def test_series_fillna(data, index, value): + psr = pd.Series( + data, + index=index if index is not None and len(index) == len(data) else None, + ) + gsr = cudf.Series( + data, + index=index if index is not None and len(index) == len(data) else None, + ) + + expect = psr.fillna(pd.Series(value)) + got = gsr.fillna(cudf.Series(value)) + assert_eq(expect, got) + + +def test_series_fillna_error(): + psr = pd.Series([1, 2, None, 3, None]) + gsr = cudf.from_pandas(psr) + + assert_exceptions_equal( + psr.fillna, + gsr.fillna, + ([pd.DataFrame({"a": [1, 2, 3]})],), + ([cudf.DataFrame({"a": [1, 2, 3]})],), + ) + + +def test_fillna_nan_and_null(): + ser = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False) + result = ser.fillna(2.2) + expected = cudf.Series([2.2, 2.2, 1.1]) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/series/methods/test_replace.py b/python/cudf/cudf/tests/series/methods/test_replace.py new file mode 100644 index 00000000000..7bdad916cab --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_replace.py @@ -0,0 +1,527 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import re + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_GT_214, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, + expect_warning_if, +) + + +@pytest.mark.parametrize( + "gsr_data, dtype", + [ + [[5, 1, 2, 3, None, 243, None, 4], None], + [["one", "two", "three", None, "one"], "category"], + [[*list(range(400)), None], None], + ], +) +@pytest.mark.parametrize( + "to_replace,value", + [ + (0, 5), + ("one", "two"), + ("one", "five"), + ("abc", "hello"), + ([0, 1], [5, 6]), + ([22, 323, 27, 0], -1), + ([1, 2, 3], cudf.Series([10, 11, 12])), + (cudf.Series([1, 2, 3]), None), + ({1: 10, 2: 22}, None), + (np.inf, 4), + ], +) +def test_series_replace_all(gsr_data, dtype, to_replace, value): + gsr = cudf.Series(gsr_data, dtype=dtype) + psr = gsr.to_pandas() + + gd_to_replace = to_replace + if isinstance(to_replace, cudf.Series): + pd_to_replace = to_replace.to_pandas() + else: + pd_to_replace = to_replace + + gd_value = value + if isinstance(value, cudf.Series): + pd_value = value.to_pandas() + else: + pd_value = value + + expect_warn = ( + isinstance(gsr.dtype, cudf.CategoricalDtype) + and isinstance(gd_to_replace, str) + and gd_to_replace == "one" + ) + with expect_warning_if(expect_warn): + actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) + with expect_warning_if(expect_warn and PANDAS_GE_220): + if pd_value is None: + # TODO: Remove this workaround once cudf + # introduces `no_default` values + expected = psr.replace(to_replace=pd_to_replace) + else: + expected = psr.replace(to_replace=pd_to_replace, value=pd_value) + + assert_eq( + expected.sort_values().reset_index(drop=True), + actual.sort_values().reset_index(drop=True), + ) + + +def test_series_replace(): + a1 = np.array([0, 1, 2, 3, 4]) + + # Numerical + a2 = np.array([5, 1, 2, 3, 4]) + sr1 = cudf.Series(a1) + sr2 = sr1.replace(0, 5) + assert_eq(a2, sr2.to_numpy()) + + # Categorical + psr3 = pd.Series(["one", "two", "three"], dtype="category") + with expect_warning_if(PANDAS_GE_220, FutureWarning): + psr4 = psr3.replace("one", "two") + sr3 = cudf.from_pandas(psr3) + with pytest.warns(FutureWarning): + sr4 = sr3.replace("one", "two") + assert_eq( + psr4.sort_values().reset_index(drop=True), + sr4.sort_values().reset_index(drop=True), + ) + with expect_warning_if(PANDAS_GE_220, FutureWarning): + psr5 = psr3.replace("one", "five") + with pytest.warns(FutureWarning): + sr5 = sr3.replace("one", "five") + + assert_eq(psr5, sr5) + + # List input + a6 = np.array([5, 6, 2, 3, 4]) + sr6 = sr1.replace([0, 1], [5, 6]) + assert_eq(a6, sr6.to_numpy()) + + assert_eq( + sr1.replace([0, 1], [5.5, 6.5]), + sr1.to_pandas().replace([0, 1], [5.5, 6.5]), + ) + + # Series input + a8 = np.array([5, 5, 5, 3, 4]) + sr8 = sr1.replace(sr1[:3].to_numpy(), 5) + assert_eq(a8, sr8.to_numpy()) + + # large input containing null + sr9 = cudf.Series([*list(range(400)), None]) + sr10 = sr9.replace([22, 323, 27, 0], None) + assert sr10.null_count == 5 + assert len(sr10.dropna().to_numpy()) == (401 - 5) + + sr11 = sr9.replace([22, 323, 27, 0], -1) + assert sr11.null_count == 1 + assert len(sr11.dropna().to_numpy()) == (401 - 1) + + # large input not containing nulls + sr9 = sr9.fillna(-11) + sr12 = sr9.replace([22, 323, 27, 0], None) + assert sr12.null_count == 4 + assert len(sr12.dropna().to_numpy()) == (401 - 4) + + sr13 = sr9.replace([22, 323, 27, 0], -1) + assert 
sr13.null_count == 0 + assert len(sr13.to_numpy()) == 401 + + +def test_series_replace_with_nulls(): + a1 = np.array([0, 1, 2, 3, 4]) + + # Numerical + a2 = np.array([-10, 1, 2, 3, 4]) + sr1 = cudf.Series(a1) + sr2 = sr1.replace(0, None).fillna(-10) + assert_eq(a2, sr2.to_numpy()) + + # List input + a6 = np.array([-10, 6, 2, 3, 4]) + sr6 = sr1.replace([0, 1], [None, 6]).fillna(-10) + assert_eq(a6, sr6.to_numpy()) + + sr1 = cudf.Series([0, 1, 2, 3, 4, None]) + assert_eq( + sr1.replace([0, 1], [5.5, 6.5]).fillna(-10), + sr1.to_pandas().replace([0, 1], [5.5, 6.5]).fillna(-10), + ) + + # Series input + a8 = np.array([-10, -10, -10, 3, 4, -10]) + sr8 = sr1.replace(cudf.Series([-10] * 3, index=sr1[:3]), None).fillna(-10) + assert_eq(a8, sr8.to_numpy()) + + a9 = np.array([-10, 6, 2, 3, 4, -10]) + sr9 = sr1.replace([0, 1], [None, 6]).fillna(-10) + assert_eq(a9, sr9.to_numpy()) + + +@pytest.mark.parametrize( + "psr", + [ + pd.Series([0, 1, None, 2, None], dtype=pd.Int8Dtype()), + pd.Series([0, 1, np.nan, 2, np.nan]), + ], +) +@pytest.mark.parametrize("fill_value", [10, pd.Series([10, 20, 30, 40, 50])]) +def test_series_fillna_numerical( + psr, numeric_types_as_str, fill_value, inplace +): + if inplace: + psr = psr.copy(deep=True) + # TODO: These tests should use Pandas' nullable int type + # when we support a recent enough version of Pandas + # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html + if np.dtype(numeric_types_as_str).kind != "f" and psr.dtype.kind == "i": + psr = psr.astype( + cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[ + np.dtype(numeric_types_as_str) + ] + ) + + gsr = cudf.from_pandas(psr) + + if isinstance(fill_value, pd.Series): + fill_value_cudf = cudf.from_pandas(fill_value) + else: + fill_value_cudf = fill_value + + expected = psr.fillna(fill_value, inplace=inplace) + actual = gsr.fillna(fill_value_cudf, inplace=inplace) + + if inplace: + expected = psr + actual = gsr + + # TODO: Remove check_dtype when we have support + # to compare with pandas nullable dtypes + assert_eq(expected, actual, check_dtype=False) + + +def test_series_multiple_times_with_nulls(): + sr = cudf.Series([1, 2, 3, None]) + expected = cudf.Series([None, None, None, None], dtype=np.int64) + + for i in range(3): + got = sr.replace([1, 2, 3], None) + assert_eq(expected, got) + # BUG: #2695 + # The following series will acquire a chunk of memory and update with + # values, but these values may still linger even after the memory + # gets released. This memory space might get used for replace in + # subsequent calls and the memory used for mask may have junk values. + # So, if it is not updated properly, the result would be wrong. + # So, this will help verify that scenario. 
+ cudf.Series([1, 1, 1, None]) + + +@pytest.mark.parametrize( + "replacement", [128, 128.0, 128.5, 32769, 32769.0, 32769.5] +) +def test_numeric_series_replace_dtype( + request, numeric_types_as_str, replacement +): + request.applymarker( + pytest.mark.xfail( + condition=PANDAS_GT_214 + and ( + ( + numeric_types_as_str == "int8" + and replacement in {128, 128.0, 32769, 32769.0} + ) + or ( + numeric_types_as_str == "int16" + and replacement in {32769, 32769.0} + ) + ), + reason="Pandas throws an AssertionError for these " + "cases and asks us to log a bug, they are trying to " + "avoid a RecursionError which cudf will not run into", + ) + ) + psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=numeric_types_as_str) + sr = cudf.from_pandas(psr) + + expect = psr.replace(1, replacement) + got = sr.replace(1, replacement) + + assert_eq(expect, got) + + # to_replace is a list, replacement is a scalar + expect = psr.replace([2, 3], replacement) + got = sr.replace([2, 3], replacement) + + assert_eq(expect, got) + + # If to_replace is a scalar and replacement is a list + with pytest.raises(TypeError): + sr.replace(0, [replacement, 2]) + + # Both list of unequal length + with pytest.raises(ValueError): + sr.replace([0, 1], [replacement]) + + # Both lists of equal length + expect = psr.replace([2, 3], [replacement, replacement]) + got = sr.replace([2, 3], [replacement, replacement]) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "pframe, replace_args", + [ + ( + pd.Series([5, 1, 2, 3, 4]), + {"to_replace": 5, "value": 0, "inplace": True}, + ), + ( + pd.Series([5, 1, 2, 3, 4]), + {"to_replace": {5: 0, 3: -5}, "inplace": True}, + ), + (pd.Series([5, 1, 2, 3, 4]), {}), + pytest.param( + pd.Series(["one", "two", "three"], dtype="category"), + {"to_replace": "one", "value": "two", "inplace": True}, + marks=pytest.mark.xfail( + condition=PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/43232" + "https://github.com/pandas-dev/pandas/issues/53358", + ), + ), + ( + pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9]}), + {"to_replace": 5, "value": 0, "inplace": True}, + ), + ( + pd.Series([1, 2, 3, 45]), + { + "to_replace": np.array([]).astype(int), + "value": 77, + "inplace": True, + }, + ), + ( + pd.Series([1, 2, 3, 45]), + { + "to_replace": np.array([]).astype(int), + "value": 77, + "inplace": False, + }, + ), + ( + pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}), + {"to_replace": {"a": 2}, "value": {"a": -33}, "inplace": True}, + ), + ( + pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}), + { + "to_replace": {"a": [2, 5]}, + "value": {"a": [9, 10]}, + "inplace": True, + }, + ), + ( + pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}), + {"to_replace": [], "value": [], "inplace": True}, + ), + ], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning not given on older versions of pandas", +) +def test_replace_inplace(pframe, replace_args): + gpu_frame = cudf.from_pandas(pframe) + pandas_frame = pframe.copy() + + gpu_copy = gpu_frame.copy() + cpu_copy = pandas_frame.copy() + + assert_eq(gpu_frame, pandas_frame) + assert_eq(gpu_copy, cpu_copy) + with expect_warning_if(len(replace_args) == 0): + gpu_frame.replace(**replace_args) + with expect_warning_if(len(replace_args) == 0): + pandas_frame.replace(**replace_args) + assert_eq(gpu_frame, pandas_frame) + assert_eq(gpu_copy, cpu_copy) + + +def test_series_replace_errors(): + gsr = cudf.Series([1, 2, None, 3, None]) + psr = gsr.to_pandas() + + with pytest.raises( + TypeError, + 
match=re.escape( + "to_replace and value should be of same types," + "got to_replace dtype: int64 and " + "value dtype: object" + ), + ): + gsr.replace(1, "a") + + gsr = cudf.Series(["a", "b", "c"]) + with pytest.raises( + TypeError, + match=re.escape( + "to_replace and value should be of same types," + "got to_replace dtype: int64 and " + "value dtype: object" + ), + ): + gsr.replace([1, 2], ["a", "b"]) + + assert_exceptions_equal( + psr.replace, + gsr.replace, + ([{"a": 1}, 1],), + ([{"a": 1}, 1],), + ) + + assert_exceptions_equal( + lfunc=psr.replace, + rfunc=gsr.replace, + lfunc_args_and_kwargs=([[1, 2], [1]],), + rfunc_args_and_kwargs=([[1, 2], [1]],), + ) + + assert_exceptions_equal( + lfunc=psr.replace, + rfunc=gsr.replace, + lfunc_args_and_kwargs=([object(), [1]],), + rfunc_args_and_kwargs=([object(), [1]],), + ) + + assert_exceptions_equal( + lfunc=psr.replace, + rfunc=gsr.replace, + lfunc_args_and_kwargs=([{"a": 1}, object()],), + rfunc_args_and_kwargs=([{"a": 1}, object()],), + ) + + +@pytest.mark.parametrize( + "gsr,old,new,expected", + [ + ( + lambda: cudf.Series(["a", "b", "c", None]), + None, + "a", + lambda: cudf.Series(["a", "b", "c", "a"]), + ), + ( + lambda: cudf.Series(["a", "b", "c", None]), + [None, "a", "a"], + ["c", "b", "d"], + lambda: cudf.Series(["d", "b", "c", "c"]), + ), + ( + lambda: cudf.Series(["a", "b", "c", None]), + [None, "a"], + ["b", None], + lambda: cudf.Series([None, "b", "c", "b"]), + ), + ( + lambda: cudf.Series(["a", "b", "c", None]), + [None, None], + [None, None], + lambda: cudf.Series(["a", "b", "c", None]), + ), + ( + lambda: cudf.Series([1, 2, None, 3]), + None, + 10, + lambda: cudf.Series([1, 2, 10, 3]), + ), + ( + lambda: cudf.Series([1, 2, None, 3]), + [None, 1, 1], + [3, 2, 4], + lambda: cudf.Series([4, 2, 3, 3]), + ), + ( + lambda: cudf.Series([1, 2, None, 3]), + [None, 1], + [2, None], + lambda: cudf.Series([None, 2, 2, 3]), + ), + ( + lambda: cudf.Series(["a", "q", "t", None], dtype="category"), + None, + "z", + lambda: cudf.Series(["a", "q", "t", "z"], dtype="category"), + ), + ( + lambda: cudf.Series(["a", "q", "t", None], dtype="category"), + [None, "a", "q"], + ["z", None, None], + lambda: cudf.Series([None, None, "t", "z"], dtype="category"), + ), + ( + lambda: cudf.Series(["a", None, "t", None], dtype="category"), + [None, "t"], + ["p", None], + lambda: cudf.Series(["a", "p", None, "p"], dtype="category"), + ), + ], +) +def test_replace_nulls(gsr, old, new, expected): + gsr = gsr() + with expect_warning_if(isinstance(gsr.dtype, cudf.CategoricalDtype)): + actual = gsr.replace(old, new) + assert_eq( + expected().sort_values().reset_index(drop=True), + actual.sort_values().reset_index(drop=True), + ) + + +def test_replace_with_index_objects(): + result = cudf.Series([1, 2]).replace(cudf.Index([1]), cudf.Index([2])) + expected = pd.Series([1, 2]).replace(pd.Index([1]), pd.Index([2])) + assert_eq(result, expected) + + +def test_replace_datetime_series(): + pd_series = pd.Series(pd.date_range("20210101", periods=5)) + pd_result = pd_series.replace( + pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-10") + ) + + cudf_series = cudf.Series(pd.date_range("20210101", periods=5)) + cudf_result = cudf_series.replace( + pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-10") + ) + + assert_eq(pd_result, cudf_result) + + +def test_replace_timedelta_series(): + pd_series = pd.Series(pd.timedelta_range("1 days", periods=5)) + pd_result = pd_series.replace( + pd.Timedelta("2 days"), pd.Timedelta("10 days") + ) + + cudf_series = 
cudf.Series(pd.timedelta_range("1 days", periods=5)) + cudf_result = cudf_series.replace( + pd.Timedelta("2 days"), pd.Timedelta("10 days") + ) + + assert_eq(pd_result, cudf_result) diff --git a/python/cudf/cudf/tests/series/methods/test_where.py b/python/cudf/cudf/tests/series/methods/test_where.py index e0f01fd3cb8..9cb12df2f0c 100644 --- a/python/cudf/cudf/tests/series/methods/test_where.py +++ b/python/cudf/cudf/tests/series/methods/test_where.py @@ -1,9 +1,12 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import operator import re +import pandas as pd import pytest import cudf +from cudf.testing import assert_eq def test_series_where_mixed_dtypes_error(): @@ -23,3 +26,47 @@ def test_series_where_mixed_bool_dtype(): s = cudf.Series([True, False, True]) with pytest.raises(TypeError): s.where(~s, 10) + + +@pytest.mark.parametrize("fill_value", [100, 100.0, 128.5]) +@pytest.mark.parametrize("op", [operator.gt, operator.eq, operator.lt]) +def test_series_where(numeric_types_as_str, fill_value, op): + psr = pd.Series(list(range(10)), dtype=numeric_types_as_str) + sr = cudf.from_pandas(psr) + + try: + scalar_fits = sr.dtype.type(fill_value) == fill_value + except OverflowError: + scalar_fits = False + + if not scalar_fits: + with pytest.raises(TypeError): + sr.where(op(sr, 0), fill_value) + else: + # Cast back to original dtype as pandas automatically upcasts + expect = psr.where(op(psr, 0), fill_value) + got = sr.where(op(sr, 0), fill_value) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + check_dtype=expect.dtype.kind != "f", + ) + + +@pytest.mark.parametrize("fill_value", [100, 100.0, 100.5]) +def test_series_with_nulls_where(fill_value): + psr = pd.Series([None] * 3 + list(range(5))) + sr = cudf.from_pandas(psr) + + expect = psr.where(psr > 0, fill_value) + got = sr.where(sr > 0, fill_value) + assert_eq(expect, got) + + expect = psr.where(psr < 0, fill_value) + got = sr.where(sr < 0, fill_value) + assert_eq(expect, got) + + expect = psr.where(psr == 0, fill_value) + got = sr.where(sr == 0, fill_value) + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py deleted file mode 100644 index b1efcef5d1e..00000000000 --- a/python/cudf/cudf/tests/test_replace.py +++ /dev/null @@ -1,1459 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
- -import operator -import re -from decimal import Decimal - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_GT_214, - PANDAS_VERSION, -) -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing import assert_eq -from cudf.testing._utils import ( - INTEGER_TYPES, - NUMERIC_TYPES, - assert_exceptions_equal, - expect_warning_if, -) - - -@pytest.mark.parametrize( - "gsr_data, dtype", - [ - [[5, 1, 2, 3, None, 243, None, 4], None], - [["one", "two", "three", None, "one"], "category"], - [[*list(range(400)), None], None], - ], -) -@pytest.mark.parametrize( - "to_replace,value", - [ - (0, 5), - ("one", "two"), - ("one", "five"), - ("abc", "hello"), - ([0, 1], [5, 6]), - ([22, 323, 27, 0], -1), - ([1, 2, 3], cudf.Series([10, 11, 12])), - (cudf.Series([1, 2, 3]), None), - ({1: 10, 2: 22}, None), - (np.inf, 4), - ], -) -def test_series_replace_all(gsr_data, dtype, to_replace, value): - gsr = cudf.Series(gsr_data, dtype=dtype) - psr = gsr.to_pandas() - - gd_to_replace = to_replace - if isinstance(to_replace, cudf.Series): - pd_to_replace = to_replace.to_pandas() - else: - pd_to_replace = to_replace - - gd_value = value - if isinstance(value, cudf.Series): - pd_value = value.to_pandas() - else: - pd_value = value - - expect_warn = ( - isinstance(gsr.dtype, cudf.CategoricalDtype) - and isinstance(gd_to_replace, str) - and gd_to_replace == "one" - ) - with expect_warning_if(expect_warn): - actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) - with expect_warning_if(expect_warn and PANDAS_GE_220): - if pd_value is None: - # TODO: Remove this workaround once cudf - # introduces `no_default` values - expected = psr.replace(to_replace=pd_to_replace) - else: - expected = psr.replace(to_replace=pd_to_replace, value=pd_value) - - assert_eq( - expected.sort_values().reset_index(drop=True), - actual.sort_values().reset_index(drop=True), - ) - - -def test_series_replace(): - a1 = np.array([0, 1, 2, 3, 4]) - - # Numerical - a2 = np.array([5, 1, 2, 3, 4]) - sr1 = cudf.Series(a1) - sr2 = sr1.replace(0, 5) - assert_eq(a2, sr2.to_numpy()) - - # Categorical - psr3 = pd.Series(["one", "two", "three"], dtype="category") - with expect_warning_if(PANDAS_GE_220, FutureWarning): - psr4 = psr3.replace("one", "two") - sr3 = cudf.from_pandas(psr3) - with pytest.warns(FutureWarning): - sr4 = sr3.replace("one", "two") - assert_eq( - psr4.sort_values().reset_index(drop=True), - sr4.sort_values().reset_index(drop=True), - ) - with expect_warning_if(PANDAS_GE_220, FutureWarning): - psr5 = psr3.replace("one", "five") - with pytest.warns(FutureWarning): - sr5 = sr3.replace("one", "five") - - assert_eq(psr5, sr5) - - # List input - a6 = np.array([5, 6, 2, 3, 4]) - sr6 = sr1.replace([0, 1], [5, 6]) - assert_eq(a6, sr6.to_numpy()) - - assert_eq( - sr1.replace([0, 1], [5.5, 6.5]), - sr1.to_pandas().replace([0, 1], [5.5, 6.5]), - ) - - # Series input - a8 = np.array([5, 5, 5, 3, 4]) - sr8 = sr1.replace(sr1[:3].to_numpy(), 5) - assert_eq(a8, sr8.to_numpy()) - - # large input containing null - sr9 = cudf.Series([*list(range(400)), None]) - sr10 = sr9.replace([22, 323, 27, 0], None) - assert sr10.null_count == 5 - assert len(sr10.dropna().to_numpy()) == (401 - 5) - - sr11 = sr9.replace([22, 323, 27, 0], -1) - assert sr11.null_count == 1 - assert len(sr11.dropna().to_numpy()) == (401 - 1) - - # large input not containing nulls - sr9 = sr9.fillna(-11) - 
sr12 = sr9.replace([22, 323, 27, 0], None) - assert sr12.null_count == 4 - assert len(sr12.dropna().to_numpy()) == (401 - 4) - - sr13 = sr9.replace([22, 323, 27, 0], -1) - assert sr13.null_count == 0 - assert len(sr13.to_numpy()) == 401 - - -def test_series_replace_with_nulls(): - a1 = np.array([0, 1, 2, 3, 4]) - - # Numerical - a2 = np.array([-10, 1, 2, 3, 4]) - sr1 = cudf.Series(a1) - sr2 = sr1.replace(0, None).fillna(-10) - assert_eq(a2, sr2.to_numpy()) - - # List input - a6 = np.array([-10, 6, 2, 3, 4]) - sr6 = sr1.replace([0, 1], [None, 6]).fillna(-10) - assert_eq(a6, sr6.to_numpy()) - - sr1 = cudf.Series([0, 1, 2, 3, 4, None]) - assert_eq( - sr1.replace([0, 1], [5.5, 6.5]).fillna(-10), - sr1.to_pandas().replace([0, 1], [5.5, 6.5]).fillna(-10), - ) - - # Series input - a8 = np.array([-10, -10, -10, 3, 4, -10]) - sr8 = sr1.replace(cudf.Series([-10] * 3, index=sr1[:3]), None).fillna(-10) - assert_eq(a8, sr8.to_numpy()) - - a9 = np.array([-10, 6, 2, 3, 4, -10]) - sr9 = sr1.replace([0, 1], [None, 6]).fillna(-10) - assert_eq(a9, sr9.to_numpy()) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning introduced in pandas-2.2.0", -) -@pytest.mark.parametrize( - "data, dtype", - [ - ( - { - "a": [0, 1, None, 2, 3], - "b": [3, 2, 2, 3, None], - "c": ["abc", "def", ".", None, None], - }, - None, - ), - ( - { - "a": ["one", "two", None, "three"], - "b": ["one", None, "two", "three"], - }, - "category", - ), - ( - { - "col one": [None, 10, 11, None, 1000, 500, 600], - "col two": ["abc", "def", "ghi", None, "pp", None, "a"], - "a": [0.324, 0.234, 324.342, 23.32, 9.9, None, None], - }, - None, - ), - ], -) -@pytest.mark.parametrize( - "to_replace,value", - [ - (0, 4), - ([0, 1], [4, 5]), - ([0, 1], 4), - ({"a": 0, "b": 0}, {"a": 4, "b": 5}), - ({"a": 0}, {"a": 4}), - ("abc", "---"), - ([".", "gh"], "hi"), - ([".", "def"], ["_", None]), - ({"c": 0}, {"a": 4, "b": 5}), - ({"a": 2}, {"c": "a"}), - ("two", "three"), - ([1, 2], pd.Series([10, 11])), - (pd.Series([10, 11], index=[3, 2]), None), - ( - pd.Series(["a+", "+c", "p", "---"], index=["abc", "gh", "l", "z"]), - None, - ), - ( - pd.Series([10, 11], index=[3, 2]), - {"a": [-10, -30], "l": [-111, -222]}, - ), - (pd.Series([10, 11], index=[3, 2]), 555), - ( - pd.Series([10, 11], index=["a", "b"]), - pd.Series([555, 1111], index=["a", "b"]), - ), - ({"a": "2", "b": "3", "zzz": "hi"}, None), - ({"a": 2, "b": 3, "zzz": "hi"}, 324353), - ( - {"a": 2, "b": 3, "zzz": "hi"}, - pd.Series([5, 6, 10], index=["a", "b", "col one"]), - ), - ], -) -def test_dataframe_replace(data, dtype, to_replace, value): - gdf = cudf.DataFrame(data, dtype=dtype) - pdf = gdf.to_pandas() - - pd_value = value - if isinstance(value, pd.Series): - gd_value = cudf.from_pandas(value) - else: - gd_value = value - - pd_to_replace = to_replace - if isinstance(to_replace, pd.Series): - gd_to_replace = cudf.from_pandas(to_replace) - else: - gd_to_replace = to_replace - - can_warn = ( - isinstance(gdf["a"].dtype, cudf.CategoricalDtype) - and isinstance(to_replace, str) - and to_replace == "two" - and isinstance(value, str) - and value == "three" - ) - with expect_warning_if(can_warn): - if pd_value is None: - expected = pdf.replace(to_replace=pd_to_replace) - else: - expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) - with expect_warning_if(can_warn): - actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) - - expected_sorted = expected.sort_values(by=list(expected.columns), axis=0) - actual_sorted = 
actual.sort_values(by=list(actual.columns), axis=0) - - assert_eq(expected_sorted, actual_sorted) - - -def test_dataframe_replace_with_nulls(): - # numerical - pdf1 = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]}) - gdf1 = cudf.from_pandas(pdf1) - pdf2 = pdf1.replace(0, 4) - gdf2 = gdf1.replace(0, None).fillna(4) - assert_eq(gdf2, pdf2) - - # list input - pdf6 = pdf1.replace([0, 1], [4, 5]) - gdf6 = gdf1.replace([0, 1], [4, None]).fillna(5) - assert_eq(gdf6, pdf6) - - pdf7 = pdf1.replace([0, 1], 4) - gdf7 = gdf1.replace([0, 1], None).fillna(4) - assert_eq(gdf7, pdf7) - - # dict input: - pdf8 = pdf1.replace({"a": 0, "b": 0}, {"a": 4, "b": 5}) - gdf8 = gdf1.replace({"a": 0, "b": 0}, {"a": None, "b": 5}).fillna(4) - assert_eq(gdf8, pdf8) - - gdf1 = cudf.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, None]}) - gdf9 = gdf1.replace([0, 1], [4, 5]).fillna(3) - assert_eq(gdf9, pdf6) - - -@pytest.mark.parametrize( - "psr", - [ - pd.Series([0, 1, None, 2, None], dtype=pd.Int8Dtype()), - pd.Series([0, 1, np.nan, 2, np.nan]), - ], -) -@pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("fill_value", [10, pd.Series([10, 20, 30, 40, 50])]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_fillna_numerical(psr, data_dtype, fill_value, inplace): - test_psr = psr.copy(deep=True) - # TODO: These tests should use Pandas' nullable int type - # when we support a recent enough version of Pandas - # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html - if np.dtype(data_dtype).kind not in ("f") and test_psr.dtype.kind == "i": - test_psr = test_psr.astype( - cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[np.dtype(data_dtype)] - ) - - gsr = cudf.from_pandas(test_psr) - - if isinstance(fill_value, pd.Series): - fill_value_cudf = cudf.from_pandas(fill_value) - else: - fill_value_cudf = fill_value - - expected = test_psr.fillna(fill_value, inplace=inplace) - actual = gsr.fillna(fill_value_cudf, inplace=inplace) - - if inplace: - expected = test_psr - actual = gsr - - # TODO: Remove check_dtype when we have support - # to compare with pandas nullable dtypes - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - [1, None, None, 2, 3, 4], - [None, None, 1, 2, None, 3, 4], - [1, 2, None, 3, 4, None, None], - [0] + [None] * 14, - [None] * 14 + [0], - ], -) -@pytest.mark.parametrize("container", [pd.Series, pd.DataFrame]) -@pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("method", ["ffill", "bfill"]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_method_numerical(data, container, data_dtype, method, inplace): - if container == pd.DataFrame: - data = {"a": data, "b": data, "c": data} - - pdata = container(data) - - if np.dtype(data_dtype).kind not in ("f"): - data_dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[ - np.dtype(data_dtype) - ] - pdata = pdata.astype(data_dtype) - - # Explicitly using nans_as_nulls=True - gdata = cudf.from_pandas(pdata, nan_as_null=True) - - with pytest.warns(FutureWarning): - expected = pdata.fillna(method=method, inplace=inplace) - with pytest.warns(FutureWarning): - actual = gdata.fillna(method=method, inplace=inplace) - - if inplace: - expected = pdata - actual = gdata - - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "gsr_data, dtype", - [ - (["2.34", "5.2", 
"7.47", None, "92.29", None], Decimal64Dtype(7, 2)), - ( - ["-74.56", None, "-23.73", "34.55", "2.89", None], - Decimal32Dtype(7, 2), - ), - ( - ["85.955", np.nan, "-3.243", np.nan, "29.492", np.nan], - Decimal64Dtype(8, 3), - ), - ( - ["2.964", None, "57.432", "-989.330", None, "56.444"], - Decimal64Dtype(8, 3), - ), - ( - [np.nan, "55.2498", np.nan, "-5.2965", "-28.9423", np.nan], - Decimal64Dtype(10, 4), - ), - ( - ["2.964", None, "54347.432", "-989.330", None, "56.444"], - Decimal128Dtype(20, 7), - ), - ], -) -@pytest.mark.parametrize( - "fill_value", - [ - 42, - -123, - Decimal("8.2"), - Decimal("-12.87"), - cudf.Series([None, -854, 9533, -274, -845, 7924], dtype="int32"), - cudf.Series(["-53.5", "13.4", "-64.3", None, "42.42", None]).astype( - Decimal64Dtype(7, 2) - ), - cudf.Series( - ["57.45", np.nan, np.nan, "686.49", "-55.5", "73.24"], - ).astype(Decimal64Dtype(7, 2)), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_decimal(gsr_data, dtype, fill_value, inplace): - gsr = cudf.Series(gsr_data).astype(dtype) - psr = gsr.to_pandas() - - if isinstance(fill_value, cudf.Series): - p_fill_value = fill_value.to_pandas() - else: - p_fill_value = fill_value - - expected = psr.fillna(p_fill_value, inplace=inplace) - got = gsr.fillna(fill_value, inplace=inplace) - - assert_eq(expected, got, check_dtype=False) - - -@pytest.mark.parametrize( - "psr_data", - [ - pd.Series(["a", "b", "a", None, "c", None], dtype="category"), - pd.Series( - ["a", "b", "a", None, "c", None], - dtype="category", - index=["q", "r", "z", "a", "b", "c"], - ), - pd.Series( - ["a", "b", "a", None, "c", None], - dtype="category", - index=["x", "t", "p", "q", "r", "z"], - ), - pd.Series(["a", "b", "a", np.nan, "c", np.nan], dtype="category"), - pd.Series( - [None, None, None, None, None, None, "a", "b", "c"], - dtype="category", - ), - ], -) -@pytest.mark.parametrize( - "fill_value", - [ - "c", - pd.Series(["c", "c", "c", "c", "c", "a"], dtype="category"), - pd.Series( - ["a", "b", "a", None, "c", None], - dtype="category", - index=["x", "t", "p", "q", "r", "z"], - ), - pd.Series( - ["a", "b", "a", None, "c", None], - dtype="category", - index=["q", "r", "z", "a", "b", "c"], - ), - pd.Series(["a", "b", "a", None, "c", None], dtype="category"), - pd.Series(["a", "b", "a", np.nan, "c", np.nan], dtype="category"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_categorical(psr_data, fill_value, inplace): - psr = psr_data.copy(deep=True) - gsr = cudf.from_pandas(psr) - - if isinstance(fill_value, pd.Series): - fill_value_cudf = cudf.from_pandas(fill_value) - else: - fill_value_cudf = fill_value - - if ( - isinstance(fill_value_cudf, cudf.Series) - and gsr.dtype != fill_value_cudf.dtype - ): - assert_exceptions_equal( - lfunc=psr.fillna, - rfunc=gsr.fillna, - lfunc_args_and_kwargs=([fill_value], {"inplace": inplace}), - rfunc_args_and_kwargs=([fill_value_cudf], {"inplace": inplace}), - ) - else: - expected = psr.fillna(fill_value, inplace=inplace) - got = gsr.fillna(fill_value_cudf, inplace=inplace) - - if inplace: - expected = psr - got = gsr - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "psr_data", - [ - pd.Series( - pd.date_range( - "2010-01-01", - "2020-01-10", - freq="1YE" if PANDAS_GE_220 else "1y", - ) - ), - pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), - pd.Series( - [ - None, - None, - None, - None, - None, - None, - "2011-10-10", - "2010-01-01", - "2010-01-02", - "2010-01-04", - "2010-11-01", - ], - 
dtype="datetime64[ns]", - ), - pd.Series( - [ - None, - None, - None, - None, - None, - None, - "2011-10-10", - "2010-01-01", - "2010-01-02", - "2010-01-04", - "2010-11-01", - ], - dtype="datetime64[ns]", - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], - ), - ], -) -@pytest.mark.parametrize( - "fill_value", - [ - pd.Timestamp("2010-01-02"), - pd.Series( - pd.date_range( - "2010-01-01", - "2020-01-10", - freq="1YE" if PANDAS_GE_220 else "1y", - ) - ) - + pd.Timedelta("1d"), - pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), - pd.Series( - [ - None, - None, - None, - None, - None, - None, - "2011-10-10", - "2010-01-01", - "2010-01-02", - "2010-01-04", - "2010-11-01", - ], - dtype="datetime64[ns]", - ), - pd.Series( - [ - None, - None, - None, - None, - None, - None, - "2011-10-10", - "2010-01-01", - "2010-01-02", - "2010-01-04", - "2010-11-01", - ], - dtype="datetime64[ns]", - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], - ), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_datetime(psr_data, fill_value, inplace): - psr = psr_data.copy(deep=True) - gsr = cudf.from_pandas(psr) - - if isinstance(fill_value, pd.Series): - fill_value_cudf = cudf.from_pandas(fill_value) - else: - fill_value_cudf = fill_value - - expected = psr.fillna(fill_value, inplace=inplace) - got = gsr.fillna(fill_value_cudf, inplace=inplace) - - if inplace: - got = gsr - expected = psr - - assert_eq(expected, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - # Categorical - pd.Categorical([1, 2, None, None, 3, 4]), - pd.Categorical([None, None, 1, None, 3, 4]), - pd.Categorical([1, 2, None, 3, 4, None, None]), - pd.Categorical(["1", "20", None, None, "3", "40"]), - pd.Categorical([None, None, "10", None, "30", "4"]), - pd.Categorical(["1", "20", None, "30", "4", None, None]), - # Datetime - np.array( - [ - "2020-01-01 08:00:00", - "2020-01-01 09:00:00", - None, - "2020-01-01 10:00:00", - None, - "2020-01-01 10:00:00", - ], - dtype="datetime64[ns]", - ), - np.array( - [ - None, - None, - "2020-01-01 09:00:00", - "2020-01-01 10:00:00", - None, - "2020-01-01 10:00:00", - ], - dtype="datetime64[ns]", - ), - np.array( - [ - "2020-01-01 09:00:00", - None, - None, - "2020-01-01 10:00:00", - None, - None, - ], - dtype="datetime64[ns]", - ), - # Timedelta - np.array( - [10, 100, 1000, None, None, 10, 100, 1000], dtype="datetime64[ns]" - ), - np.array( - [None, None, 10, None, 1000, 100, 10], dtype="datetime64[ns]" - ), - np.array( - [10, 100, None, None, 1000, None, None], dtype="datetime64[ns]" - ), - # String - np.array( - ["10", "100", "1000", None, None, "10", "100", "1000"], - dtype="object", - ), - np.array( - [None, None, "1000", None, "10", "100", "10"], dtype="object" - ), - np.array( - ["10", "100", None, None, "1000", None, None], dtype="object" - ), - ], -) -@pytest.mark.parametrize("container", [pd.Series, pd.DataFrame]) -@pytest.mark.parametrize("method", ["ffill", "bfill"]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_method_fixed_width_non_num(data, container, method, inplace): - if container == pd.DataFrame: - data = {"a": data, "b": data, "c": data} - - pdata = container(data) - - # Explicitly using nans_as_nulls=True - gdata = cudf.from_pandas(pdata, nan_as_null=True) - - with pytest.warns(FutureWarning): - expected = pdata.fillna(method=method, inplace=inplace) - with 
-        actual = gdata.fillna(method=method, inplace=inplace)
-
-    if inplace:
-        expected = pdata
-        actual = gdata
-
-    assert_eq(expected, actual)
-
-
-@pytest.mark.parametrize(
-    "df",
-    [
-        pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]}),
-        pd.DataFrame(
-            {"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"]
-        ),
-        pd.DataFrame({"a": [1, 2, 3]}),
-    ],
-)
-@pytest.mark.parametrize(
-    "value",
-    [
-        10,
-        pd.Series([10, 20, 30]),
-        pd.Series([3, 4, 5]),
-        pd.Series([10, 20, 30], index=["z", "a", "p"]),
-        {"a": 5, "b": pd.Series([3, 4, 5])},
-        {"a": 5001},
-        {"b": pd.Series([11, 22, 33], index=["a", "p", "z"])},
-        {"a": 5, "b": pd.Series([3, 4, 5], index=["a", "p", "z"])},
-        {"c": 100},
-        np.nan,
-    ],
-)
-@pytest.mark.parametrize("inplace", [True, False])
-def test_fillna_dataframe(df, value, inplace):
-    pdf = df.copy(deep=True)
-    gdf = cudf.from_pandas(pdf)
-
-    fill_value_pd = value
-    if isinstance(fill_value_pd, (pd.Series, pd.DataFrame)):
-        fill_value_cudf = cudf.from_pandas(fill_value_pd)
-    elif isinstance(fill_value_pd, dict):
-        fill_value_cudf = {}
-        for key in fill_value_pd:
-            temp_val = fill_value_pd[key]
-            if isinstance(temp_val, pd.Series):
-                temp_val = cudf.from_pandas(temp_val)
-            fill_value_cudf[key] = temp_val
-    else:
-        fill_value_cudf = value
-
-    expect = pdf.fillna(fill_value_pd, inplace=inplace)
-    got = gdf.fillna(fill_value_cudf, inplace=inplace)
-
-    if inplace:
-        got = gdf
-        expect = pdf
-
-    assert_eq(expect, got)
-
-
-@pytest.mark.parametrize(
-    "ps_data",
-    [
-        pd.Series(["a", "b", "c", "d"]),
-        pd.Series([None] * 4, dtype="object"),
-        pd.Series(["z", None, "z", None]),
-        pd.Series(["x", "y", None, None, None]),
-        pd.Series([None, None, None, "i", "P"]),
-    ],
-)
-@pytest.mark.parametrize(
-    "fill_value",
-    [
-        "a",
-        pd.Series(["a", "b", "c", "d"]),
-        pd.Series(["z", None, "z", None]),
-        pd.Series([None] * 4, dtype="object"),
-        pd.Series(["x", "y", None, None, None]),
-        pd.Series([None, None, None, "i", "P"]),
-    ],
-)
-@pytest.mark.parametrize("inplace", [True, False])
-def test_fillna_string(ps_data, fill_value, inplace):
-    psr = ps_data.copy(deep=True)
-    gsr = cudf.from_pandas(psr)
-
-    if isinstance(fill_value, pd.Series):
-        fill_value_cudf = cudf.from_pandas(fill_value)
-    else:
-        fill_value_cudf = fill_value
-
-    expected = psr.fillna(fill_value, inplace=inplace)
-    got = gsr.fillna(fill_value_cudf, inplace=inplace)
-
-    if inplace:
-        expected = psr
-        got = gsr
-
-    assert_eq(expected, got)
-
-
-@pytest.mark.parametrize("data_dtype", INTEGER_TYPES)
-def test_series_fillna_invalid_dtype(data_dtype):
-    gdf = cudf.Series([1, 2, None, 3], dtype=data_dtype)
-    fill_value = 2.5
-    msg = (
-        f"Cannot safely cast non-equivalent"
-        f" {type(fill_value).__name__} to {gdf.dtype.type.__name__}"
-    )
-    with pytest.raises(TypeError, match=msg):
-        gdf.fillna(fill_value)
-
-
-@pytest.mark.parametrize("data_dtype", NUMERIC_TYPES)
-@pytest.mark.parametrize("fill_value", [100, 100.0, 128.5])
-@pytest.mark.parametrize("op", [operator.gt, operator.eq, operator.lt])
-def test_series_where(data_dtype, fill_value, op):
-    psr = pd.Series(list(range(10)), dtype=data_dtype)
-    sr = cudf.from_pandas(psr)
-
-    try:
-        scalar_fits = sr.dtype.type(fill_value) == fill_value
-    except OverflowError:
-        scalar_fits = False
-
-    if not scalar_fits:
-        with pytest.raises(TypeError):
-            sr.where(op(sr, 0), fill_value)
-    else:
-        # Cast back to original dtype as pandas automatically upcasts
-        expect = psr.where(op(psr, 0), fill_value)
-        got = sr.where(op(sr, 0), fill_value)
-        # pandas returns 'float16' dtype, which is not supported in cudf
-        assert_eq(
-            expect,
-            got,
-            check_dtype=expect.dtype.kind not in ("f"),
-        )
-
-
-@pytest.mark.parametrize("fill_value", [100, 100.0, 100.5])
-def test_series_with_nulls_where(fill_value):
-    psr = pd.Series([None] * 3 + list(range(5)))
-    sr = cudf.from_pandas(psr)
-
-    expect = psr.where(psr > 0, fill_value)
-    got = sr.where(sr > 0, fill_value)
-    assert_eq(expect, got)
-
-    expect = psr.where(psr < 0, fill_value)
-    got = sr.where(sr < 0, fill_value)
-    assert_eq(expect, got)
-
-    expect = psr.where(psr == 0, fill_value)
-    got = sr.where(sr == 0, fill_value)
-    assert_eq(expect, got)
-
-
-@pytest.mark.parametrize("fill_value", [[888, 999]])
-def test_dataframe_with_nulls_where_with_scalars(fill_value):
-    pdf = pd.DataFrame(
-        {
-            "A": [-1, 2, -3, None, 5, 6, -7, 0],
-            "B": [4, -2, 3, None, 7, 6, 8, 0],
-        }
-    )
-    gdf = cudf.from_pandas(pdf)
-
-    expect = pdf.where(pdf % 3 == 0, fill_value)
-    got = gdf.where(gdf % 3 == 0, fill_value)
-
-    assert_eq(expect, got)
-
-
-def test_dataframe_with_different_types():
-    # Testing for int and float
-    pdf = pd.DataFrame(
-        {"A": [111, 22, 31, 410, 56], "B": [-10.12, 121.2, 45.7, 98.4, 87.6]}
-    )
-    gdf = cudf.from_pandas(pdf)
-    expect = pdf.where(pdf > 50, -pdf)
-    got = gdf.where(gdf > 50, -gdf)
-
-    assert_eq(expect, got)
-
-    # Testing for string
-    pdf = pd.DataFrame({"A": ["a", "bc", "cde", "fghi"]})
-    gdf = cudf.from_pandas(pdf)
-    pdf_mask = pd.DataFrame({"A": [True, False, True, False]})
-    gdf_mask = cudf.from_pandas(pdf_mask)
-    expect = pdf.where(pdf_mask, ["cudf"])
-    got = gdf.where(gdf_mask, ["cudf"])
-
-    assert_eq(expect, got)
-
-    # Testing for categorical
-    pdf = pd.DataFrame({"A": ["a", "b", "b", "c"]})
-    pdf["A"] = pdf["A"].astype("category")
-    gdf = cudf.from_pandas(pdf)
-    expect = pdf.where(pdf_mask, "c")
-    got = gdf.where(gdf_mask, ["c"])
-
-    assert_eq(expect, got)
-
-
-def test_dataframe_where_with_different_options():
-    pdf = pd.DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
-    gdf = cudf.from_pandas(pdf)
-
-    # numpy array
-    boolean_mask = np.array([[False, True], [True, False], [False, True]])
-
-    expect = pdf.where(boolean_mask, -pdf)
-    got = gdf.where(boolean_mask, -gdf)
-
-    assert_eq(expect, got)
-
-    # with single scalar
-    expect = pdf.where(boolean_mask, 8)
-    got = gdf.where(boolean_mask, 8)
-
-    assert_eq(expect, got)
-
-    # with multi scalar
-    expect = pdf.where(boolean_mask, [8, 9])
-    got = gdf.where(boolean_mask, [8, 9])
-
-    assert_eq(expect, got)
-
-
-def test_series_multiple_times_with_nulls():
-    sr = cudf.Series([1, 2, 3, None])
-    expected = cudf.Series([None, None, None, None], dtype=np.int64)
-
-    for i in range(3):
-        got = sr.replace([1, 2, 3], None)
-        assert_eq(expected, got)
-        # BUG: #2695
-        # The following series will acquire a chunk of memory and update with
-        # values, but these values may still linger even after the memory
-        # gets released. This memory space might get used for replace in
-        # subsequent calls and the memory used for mask may have junk values.
-        # So, if it is not updated properly, the result would be wrong.
-        # So, this will help verify that scenario.
- cudf.Series([1, 1, 1, None]) - - -@pytest.mark.parametrize("series_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize( - "replacement", [128, 128.0, 128.5, 32769, 32769.0, 32769.5] -) -def test_numeric_series_replace_dtype(request, series_dtype, replacement): - request.applymarker( - pytest.mark.xfail( - condition=PANDAS_GT_214 - and ( - ( - series_dtype == "int8" - and replacement in {128, 128.0, 32769, 32769.0} - ) - or ( - series_dtype == "int16" and replacement in {32769, 32769.0} - ) - ), - reason="Pandas throws an AssertionError for these " - "cases and asks us to log a bug, they are trying to " - "avoid a RecursionError which cudf will not run into", - ) - ) - psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype) - sr = cudf.from_pandas(psr) - - expect = psr.replace(1, replacement) - got = sr.replace(1, replacement) - - assert_eq(expect, got) - - # to_replace is a list, replacement is a scalar - expect = psr.replace([2, 3], replacement) - got = sr.replace([2, 3], replacement) - - assert_eq(expect, got) - - # If to_replace is a scalar and replacement is a list - with pytest.raises(TypeError): - sr.replace(0, [replacement, 2]) - - # Both list of unequal length - with pytest.raises(ValueError): - sr.replace([0, 1], [replacement]) - - # Both lists of equal length - expect = psr.replace([2, 3], [replacement, replacement]) - got = sr.replace([2, 3], [replacement, replacement]) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "pframe, replace_args", - [ - ( - pd.Series([5, 1, 2, 3, 4]), - {"to_replace": 5, "value": 0, "inplace": True}, - ), - ( - pd.Series([5, 1, 2, 3, 4]), - {"to_replace": {5: 0, 3: -5}, "inplace": True}, - ), - (pd.Series([5, 1, 2, 3, 4]), {}), - pytest.param( - pd.Series(["one", "two", "three"], dtype="category"), - {"to_replace": "one", "value": "two", "inplace": True}, - marks=pytest.mark.xfail( - condition=PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/43232" - "https://github.com/pandas-dev/pandas/issues/53358", - ), - ), - ( - pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9]}), - {"to_replace": 5, "value": 0, "inplace": True}, - ), - ( - pd.Series([1, 2, 3, 45]), - { - "to_replace": np.array([]).astype(int), - "value": 77, - "inplace": True, - }, - ), - ( - pd.Series([1, 2, 3, 45]), - { - "to_replace": np.array([]).astype(int), - "value": 77, - "inplace": False, - }, - ), - ( - pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}), - {"to_replace": {"a": 2}, "value": {"a": -33}, "inplace": True}, - ), - ( - pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}), - { - "to_replace": {"a": [2, 5]}, - "value": {"a": [9, 10]}, - "inplace": True, - }, - ), - ( - pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}), - {"to_replace": [], "value": [], "inplace": True}, - ), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warning not given on older versions of pandas", -) -def test_replace_inplace(pframe, replace_args): - gpu_frame = cudf.from_pandas(pframe) - pandas_frame = pframe.copy() - - gpu_copy = gpu_frame.copy() - cpu_copy = pandas_frame.copy() - - assert_eq(gpu_frame, pandas_frame) - assert_eq(gpu_copy, cpu_copy) - with expect_warning_if(len(replace_args) == 0): - gpu_frame.replace(**replace_args) - with expect_warning_if(len(replace_args) == 0): - pandas_frame.replace(**replace_args) - assert_eq(gpu_frame, pandas_frame) - assert_eq(gpu_copy, cpu_copy) - - -def test_replace_df_error(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}) - gdf = cudf.from_pandas(pdf) - - 
assert_exceptions_equal( - lfunc=pdf.replace, - rfunc=gdf.replace, - lfunc_args_and_kwargs=([], {"to_replace": -1, "value": []}), - rfunc_args_and_kwargs=([], {"to_replace": -1, "value": []}), - ) - - -@pytest.mark.parametrize( - ("lower", "upper"), - [ - ([2, 7.4], [4, 7.9]), - ([2, 7.4], None), - ( - None, - [4, 7.9], - ), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_clip(lower, upper, inplace): - pdf = pd.DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} - ) - gdf = cudf.from_pandas(pdf) - - got = gdf.clip(lower=lower, upper=upper, inplace=inplace) - expect = pdf.clip(lower=lower, upper=upper, axis=1) - - if inplace is True: - assert_eq(expect, gdf) - else: - assert_eq(expect, got) - - -@pytest.mark.parametrize( - ("lower", "upper"), - [("b", "d"), ("b", None), (None, "c"), (None, None)], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_category_clip(lower, upper, inplace): - data = ["a", "b", "c", "d", "e"] - pdf = pd.DataFrame({"a": data}) - gdf = cudf.from_pandas(pdf) - gdf["a"] = gdf["a"].astype("category") - - expect = pdf.clip(lower=lower, upper=upper) - got = gdf.clip(lower=lower, upper=upper, inplace=inplace) - - if inplace is True: - assert_eq(expect, gdf.astype("str")) - else: - assert_eq(expect, got.astype("str")) - - -@pytest.mark.parametrize( - ("lower", "upper"), - [([2, 7.4], [4, 7.9, "d"]), ([2, 7.4, "a"], [4, 7.9, "d"])], -) -def test_dataframe_exceptions_for_clip(lower, upper): - gdf = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} - ) - - with pytest.raises(ValueError): - gdf.clip(lower=lower, upper=upper) - - -@pytest.mark.parametrize( - ("data", "lower", "upper"), - [ - ([1, 2, 3, 4, 5], 2, 4), - ([1, 2, 3, 4, 5], 2, None), - ([1, 2, 3, 4, 5], None, 4), - ([1, 2, 3, 4, 5], None, None), - ([1, 2, 3, 4, 5], 4, 2), - ([1.0, 2.0, 3.0, 4.0, 5.0], 4, 2), - (pd.Series([1, 2, 3, 4, 5], dtype="int32"), 4, 2), - (["a", "b", "c", "d", "e"], "b", "d"), - (["a", "b", "c", "d", "e"], "b", None), - (["a", "b", "c", "d", "e"], None, "d"), - (["a", "b", "c", "d", "e"], "d", "b"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_clip(data, lower, upper, inplace): - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - - expect = psr.clip(lower=lower, upper=upper) - got = gsr.clip(lower=lower, upper=upper, inplace=inplace) - - if inplace is True: - assert_eq(expect, gsr) - else: - assert_eq(expect, got) - - -def test_series_exceptions_for_clip(): - with pytest.raises(ValueError): - cudf.Series([1, 2, 3, 4]).clip([1, 2], [2, 3]) - - with pytest.raises(NotImplementedError): - cudf.Series([1, 2, 3, 4]).clip(1, 2, axis=0) - - -@pytest.mark.parametrize( - "data", [[1, 2.0, 3, 4, None, 1, None, 10, None], ["a", "b", "c"]] -) -@pytest.mark.parametrize( - "index", - [ - None, - [1, 2, 3], - ["a", "b", "z"], - ["a", "b", "c", "d", "e", "f", "g", "l", "m"], - ], -) -@pytest.mark.parametrize("value", [[1, 2, 3, 4, None, 1, None, 10, None]]) -def test_series_fillna(data, index, value): - psr = pd.Series( - data, - index=index if index is not None and len(index) == len(data) else None, - ) - gsr = cudf.Series( - data, - index=index if index is not None and len(index) == len(data) else None, - ) - - expect = psr.fillna(pd.Series(value)) - got = gsr.fillna(cudf.Series(value)) - assert_eq(expect, got) - - -def test_series_fillna_error(): - psr = pd.Series([1, 2, None, 3, None]) - gsr = cudf.from_pandas(psr) - - assert_exceptions_equal( - psr.fillna, - gsr.fillna, - 
([pd.DataFrame({"a": [1, 2, 3]})],), - ([cudf.DataFrame({"a": [1, 2, 3]})],), - ) - - -def test_series_replace_errors(): - gsr = cudf.Series([1, 2, None, 3, None]) - psr = gsr.to_pandas() - - with pytest.raises( - TypeError, - match=re.escape( - "to_replace and value should be of same types," - "got to_replace dtype: int64 and " - "value dtype: object" - ), - ): - gsr.replace(1, "a") - - gsr = cudf.Series(["a", "b", "c"]) - with pytest.raises( - TypeError, - match=re.escape( - "to_replace and value should be of same types," - "got to_replace dtype: int64 and " - "value dtype: object" - ), - ): - gsr.replace([1, 2], ["a", "b"]) - - assert_exceptions_equal( - psr.replace, - gsr.replace, - ([{"a": 1}, 1],), - ([{"a": 1}, 1],), - ) - - assert_exceptions_equal( - lfunc=psr.replace, - rfunc=gsr.replace, - lfunc_args_and_kwargs=([[1, 2], [1]],), - rfunc_args_and_kwargs=([[1, 2], [1]],), - ) - - assert_exceptions_equal( - lfunc=psr.replace, - rfunc=gsr.replace, - lfunc_args_and_kwargs=([object(), [1]],), - rfunc_args_and_kwargs=([object(), [1]],), - ) - - assert_exceptions_equal( - lfunc=psr.replace, - rfunc=gsr.replace, - lfunc_args_and_kwargs=([{"a": 1}, object()],), - rfunc_args_and_kwargs=([{"a": 1}, object()],), - ) - - -@pytest.mark.parametrize( - "gsr,old,new,expected", - [ - ( - lambda: cudf.Series(["a", "b", "c", None]), - None, - "a", - lambda: cudf.Series(["a", "b", "c", "a"]), - ), - ( - lambda: cudf.Series(["a", "b", "c", None]), - [None, "a", "a"], - ["c", "b", "d"], - lambda: cudf.Series(["d", "b", "c", "c"]), - ), - ( - lambda: cudf.Series(["a", "b", "c", None]), - [None, "a"], - ["b", None], - lambda: cudf.Series([None, "b", "c", "b"]), - ), - ( - lambda: cudf.Series(["a", "b", "c", None]), - [None, None], - [None, None], - lambda: cudf.Series(["a", "b", "c", None]), - ), - ( - lambda: cudf.Series([1, 2, None, 3]), - None, - 10, - lambda: cudf.Series([1, 2, 10, 3]), - ), - ( - lambda: cudf.Series([1, 2, None, 3]), - [None, 1, 1], - [3, 2, 4], - lambda: cudf.Series([4, 2, 3, 3]), - ), - ( - lambda: cudf.Series([1, 2, None, 3]), - [None, 1], - [2, None], - lambda: cudf.Series([None, 2, 2, 3]), - ), - ( - lambda: cudf.Series(["a", "q", "t", None], dtype="category"), - None, - "z", - lambda: cudf.Series(["a", "q", "t", "z"], dtype="category"), - ), - ( - lambda: cudf.Series(["a", "q", "t", None], dtype="category"), - [None, "a", "q"], - ["z", None, None], - lambda: cudf.Series([None, None, "t", "z"], dtype="category"), - ), - ( - lambda: cudf.Series(["a", None, "t", None], dtype="category"), - [None, "t"], - ["p", None], - lambda: cudf.Series(["a", "p", None, "p"], dtype="category"), - ), - ], -) -def test_replace_nulls(gsr, old, new, expected): - gsr = gsr() - with expect_warning_if(isinstance(gsr.dtype, cudf.CategoricalDtype)): - actual = gsr.replace(old, new) - assert_eq( - expected().sort_values().reset_index(drop=True), - actual.sort_values().reset_index(drop=True), - ) - - -def test_fillna_columns_multiindex(): - columns = pd.MultiIndex.from_tuples([("a", "b"), ("d", "e")]) - pdf = pd.DataFrame( - {"0": [1, 2, None, 3, None], "1": [None, None, None, None, 4]} - ) - pdf.columns = columns - gdf = cudf.from_pandas(pdf) - - expected = pdf.fillna(10) - actual = gdf.fillna(10) - - assert_eq(expected, actual) - - -def test_fillna_nan_and_null(): - ser = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False) - result = ser.fillna(2.2) - expected = cudf.Series([2.2, 2.2, 1.1]) - assert_eq(result, expected) - - -def test_replace_with_index_objects(): - result = 
cudf.Series([1, 2]).replace(cudf.Index([1]), cudf.Index([2])) - expected = pd.Series([1, 2]).replace(pd.Index([1]), pd.Index([2])) - assert_eq(result, expected) - - -# Example test function for datetime series replace -def test_replace_datetime_series(): - # Create a pandas datetime series - pd_series = pd.Series(pd.date_range("20210101", periods=5)) - # Replace a specific datetime value - pd_result = pd_series.replace( - pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-10") - ) - - # Create a cudf datetime series - cudf_series = cudf.Series(pd.date_range("20210101", periods=5)) - # Replace a specific datetime value - cudf_result = cudf_series.replace( - pd.Timestamp("2021-01-02"), pd.Timestamp("2021-01-10") - ) - - assert_eq(pd_result, cudf_result) - - -# Example test function for timedelta series replace -def test_replace_timedelta_series(): - # Create a pandas timedelta series - pd_series = pd.Series(pd.timedelta_range("1 days", periods=5)) - # Replace a specific timedelta value - pd_result = pd_series.replace( - pd.Timedelta("2 days"), pd.Timedelta("10 days") - ) - - # Create a cudf timedelta series - cudf_series = cudf.Series(pd.timedelta_range("1 days", periods=5)) - # Replace a specific timedelta value - cudf_result = cudf_series.replace( - pd.Timedelta("2 days"), pd.Timedelta("10 days") - ) - - assert_eq(pd_result, cudf_result) - - -def test_replace_multiple_rows(datadir): - path = datadir / "parquet" / "replace_multiple_rows.parquet" - pdf = pd.read_parquet(path) - gdf = cudf.read_parquet(path) - - pdf.replace([np.inf, -np.inf], np.nan, inplace=True) - gdf.replace([np.inf, -np.inf], np.nan, inplace=True) - - assert_eq(pdf, gdf, check_dtype=False) From 7acddc976937109bd4c0d77b7b3d6c22ffc7c088 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Aug 2025 13:56:14 -0700 Subject: [PATCH 093/366] Move test_monotonic.py to new cudf classic test directory structure (#19572) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19572 --- .../indexes/categoricalindex/__init__.py | 0 .../categoricalindex/test_attributes.py | 20 + .../tests/indexes/datetimeindex/__init__.py | 0 .../indexes/datetimeindex/test_attributes.py | 45 +++ .../tests/indexes/index/methods/__init__.py | 0 .../index/methods/test_get_slice_bounds.py | 47 +++ .../tests/indexes/index/test_attributes.py | 34 ++ .../indexes/multiindex/test_attributes.py | 54 +++ .../cudf/tests/indexes/rangeindex/__init__.py | 0 .../indexes/rangeindex/methods/__init__.py | 0 .../methods/test_get_slice_bounds.py | 37 ++ .../indexes/rangeindex/test_attributes.py | 18 + .../cudf/cudf/tests/series/test_attributes.py | 78 ++++ python/cudf/cudf/tests/test_monotonic.py | 354 ------------------ 14 files changed, 333 insertions(+), 354 deletions(-) create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/test_attributes.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_get_slice_bounds.py create mode 100644 
python/cudf/cudf/tests/indexes/index/test_attributes.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_slice_bounds.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py delete mode 100644 python/cudf/cudf/tests/test_monotonic.py diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/__init__.py b/python/cudf/cudf/tests/indexes/categoricalindex/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/test_attributes.py b/python/cudf/cudf/tests/indexes/categoricalindex/test_attributes.py new file mode 100644 index 00000000000..cfdf26877b6 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/categoricalindex/test_attributes.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +from cudf.core.index import CategoricalIndex + + +@pytest.mark.parametrize( + "testlist", [["c", "d", "e", "f"], ["z", "y", "x", "r"]] +) +def test_categorical_index_is_unique_monotonic(testlist): + # Assuming unordered categorical data cannot be "monotonic" + raw_cat = pd.Categorical(testlist, ordered=True) + index = CategoricalIndex(raw_cat) + index_pd = pd.CategoricalIndex(raw_cat) + + assert index.is_unique == index_pd.is_unique + assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing + assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/__init__.py b/python/cudf/cudf/tests/indexes/datetimeindex/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py b/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py new file mode 100644 index 00000000000..5fb8c2fb647 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +from cudf.core.index import DatetimeIndex + + +@pytest.mark.parametrize( + "testlist", + [ + [ + "2001-01-01 00:00:00", + "2001-02-03 08:00:00", + "2001-03-08 16:00:00", + "2001-04-11 00:00:00", + ], + [ + "2001-04-11 00:00:00", + "2001-03-08 16:00:00", + "2001-02-03 08:00:00", + "2001-01-01 00:00:00", + ], + [ + "2001-04-11 00:00:00", + "2001-02-03 08:00:00", + "2001-03-08 16:00:00", + "2001-01-01 00:00:00", + ], + [ + "2001-04-11 00:00:00", + "2001-01-01 00:00:00", + "2001-02-03 08:00:00", + "2001-03-08 16:00:00", + "2001-01-01 00:00:00", + ], + ], +) +def test_datetime_index_is_unique_monotonic(testlist): + index = DatetimeIndex(testlist) + index_pd = pd.DatetimeIndex(testlist) + + assert index.is_unique == index_pd.is_unique + assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing + assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing diff --git a/python/cudf/cudf/tests/indexes/index/methods/__init__.py b/python/cudf/cudf/tests/indexes/index/methods/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_get_slice_bounds.py b/python/cudf/cudf/tests/indexes/index/methods/test_get_slice_bounds.py new file mode 100644 index 00000000000..68efbc71f22 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_get_slice_bounds.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +from cudf import Index + + +@pytest.mark.parametrize( + "testlist", + [ + [10, 9, 8, 8, 7], + [2.0, 5.0, 4.0, 3.0, 7.0], + ["b", "cat", "e", "bat", "c"], + ], +) +@pytest.mark.parametrize("side", ["left", "right"]) +def test_get_slice_bound(testlist, side): + index = Index(testlist) + index_pd = pd.Index(testlist) + for label in testlist: + expect = index_pd.get_slice_bound(label, side) + got = index.get_slice_bound(label, side) + assert got == expect + + +@pytest.mark.parametrize("label", [1, 5, 7, 11]) +@pytest.mark.parametrize("side", ["left", "right"]) +def test_get_slice_bound_missing(label, side): + mylist = [2, 4, 6, 8, 10] + index = Index(mylist) + index_pd = pd.Index(mylist) + + expect = index_pd.get_slice_bound(label, side) + got = index.get_slice_bound(label, side) + assert got == expect + + +@pytest.mark.parametrize("label", ["a", "c", "g"]) +@pytest.mark.parametrize("side", ["left", "right"]) +def test_get_slice_bound_missing_str(label, side): + mylist = ["b", "d", "f"] + index = Index(mylist) + index_pd = pd.Index(mylist) + got = index.get_slice_bound(label, side) + expect = index_pd.get_slice_bound(label, side) + assert got == expect diff --git a/python/cudf/cudf/tests/indexes/index/test_attributes.py b/python/cudf/cudf/tests/indexes/index/test_attributes.py new file mode 100644 index 00000000000..2e80dfb272e --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/test_attributes.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +from cudf import Index + + +@pytest.mark.parametrize( + "testlist", + [ + [1, 2, 3, 4], + [1, 2, 3, 4, None], + [1, 2, 3, 3, 4], + [10, 9, 8, 7], + [10, 9, 8, 8, 7], + [1, 2, 3, 4, np.nan], + [10, 9, 8, np.nan, 7], + [10, 9, 8, 8, 7, np.nan], + ["c", "d", "e", "f"], + ["c", "d", "e", "e", "f"], + ["c", "d", "e", "f", None], + ["z", "y", "x", "r"], + ["z", "y", "x", "x", "r"], + ], +) +def test_index_is_unique_monotonic(testlist): + index = Index(testlist) + index_pd = pd.Index(testlist) + + assert index.is_unique == index_pd.is_unique + assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing + assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py b/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py index 54be9e670aa..c938683764b 100644 --- a/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/multiindex/test_attributes.py @@ -8,10 +8,64 @@ import pytest import cudf +from cudf import MultiIndex from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal +def test_multiindex_is_unique_monotonic(): + pidx = pd.MultiIndex( + [ + ["a", "b", "c"], + ["house", "store", "forest"], + ["clouds", "clear", "storm"], + ["fire", "smoke", "clear"], + ], + [ + [0, 0, 0, 0, 1, 1, 2], + [1, 1, 1, 1, 0, 0, 2], + [0, 0, 2, 2, 2, 0, 1], + [0, 0, 0, 1, 2, 0, 1], + ], + ) + pidx.names = ["alpha", "location", "weather", "sign"] + gidx = cudf.from_pandas(pidx) + + assert pidx.is_unique == gidx.is_unique + assert pidx.is_monotonic_increasing == gidx.is_monotonic_increasing + assert pidx.is_monotonic_decreasing == gidx.is_monotonic_decreasing + + +@pytest.mark.parametrize( + "testarr", + [ + ( + [ + ["bar", "bar", "foo", "foo", "qux", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "two"], + ], + ["first", "second"], + ), + ( + [ + ["bar", "bar", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two"], + ], + ["first", "second"], + ), + ], +) +def test_multiindex_tuples_is_unique_monotonic(testarr): + tuples = list(zip(*testarr[0], strict=True)) + + index = MultiIndex.from_tuples(tuples, names=testarr[1]) + index_pd = pd.MultiIndex.from_tuples(tuples, names=testarr[1]) + + assert index.is_unique == index_pd.is_unique + assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing + assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing + + @pytest.fixture( params=[ "from_product", diff --git a/python/cudf/cudf/tests/indexes/rangeindex/__init__.py b/python/cudf/cudf/tests/indexes/rangeindex/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/__init__.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_slice_bounds.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_slice_bounds.py new file mode 100644 index 00000000000..6588e961bb1 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_slice_bounds.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +from cudf.core.index import RangeIndex + + +@pytest.mark.parametrize( + "start, stop", [(0, 10), (0, 1), (3, 4), (0, 0), (3, 3)] +) +@pytest.mark.parametrize("idx", [-1, 0, 5, 10, 11]) +@pytest.mark.parametrize("side", ["left", "right"]) +def test_rangeindex_get_slice_bound_basic(start, stop, idx, side): + pd_index = pd.RangeIndex(start, stop) + cudf_index = RangeIndex(start, stop) + expect = pd_index.get_slice_bound(idx, side) + got = cudf_index.get_slice_bound(idx, side) + assert expect == got + + +@pytest.mark.parametrize( + "start, stop, step", + [(3, 20, 5), (20, 3, -5), (20, 3, 5), (3, 20, -5), (0, 0, 2), (3, 3, 2)], +) +@pytest.mark.parametrize( + "label", + [3, 8, 13, 18, 20, 15, 10, 5, -1, 0, 19, 21, 6, 11, 17], +) +@pytest.mark.parametrize("side", ["left", "right"]) +def test_rangeindex_get_slice_bound_step(start, stop, step, label, side): + pd_index = pd.RangeIndex(start, stop, step) + cudf_index = RangeIndex(start, stop, step) + + expect = pd_index.get_slice_bound(label, side) + got = cudf_index.get_slice_bound(label, side) + assert expect == got diff --git a/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py b/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py new file mode 100644 index 00000000000..a9de5d39622 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +from cudf.core.index import RangeIndex + + +@pytest.mark.parametrize( + "start, stop, step", [(10, 20, 1), (0, -10, -1), (5, 5, 1)] +) +def test_range_index_is_unique_monotonic(start, stop, step): + index = RangeIndex(start=start, stop=stop, step=step) + index_pd = pd.RangeIndex(start=start, stop=stop, step=step) + + assert index.is_unique == index_pd.is_unique + assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing + assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing diff --git a/python/cudf/cudf/tests/series/test_attributes.py b/python/cudf/cudf/tests/series/test_attributes.py index 6cc5999ab4f..aedbf696009 100644 --- a/python/cudf/cudf/tests/series/test_attributes.py +++ b/python/cudf/cudf/tests/series/test_attributes.py @@ -9,6 +9,84 @@ from cudf.testing import assert_eq +@pytest.mark.parametrize( + "testlist", + [ + [1, 2, 3, 4], + [1, 2, 3, 3, 4], + [10, 9, 8, 7], + [10, 9, 8, 8, 7], + ["c", "d", "e", "f"], + ["c", "d", "e", "e", "f"], + ["z", "y", "x", "r"], + ["z", "y", "x", "x", "r"], + ], +) +def test_series_is_unique_monotonic(testlist): + series = cudf.Series(testlist) + series_pd = pd.Series(testlist) + + assert series.is_unique == series_pd.is_unique + assert series.is_monotonic_increasing == series_pd.is_monotonic_increasing + assert series.is_monotonic_decreasing == series_pd.is_monotonic_decreasing + + +@pytest.mark.parametrize( + "data", + [ + [pd.Timestamp("2018-01-01"), pd.Timestamp("2019-01-31"), None], + [1, 2, 3, None], + [None, 1, 2, 3], + ["a", "b", "c", None], + [None, "a", "b", "c"], + ], +) +def test_is_monotonic_always_false_for_null(data): + ser = cudf.Series(data) + assert ser.is_monotonic_increasing is False + assert ser.is_monotonic_decreasing is False + + +@pytest.mark.parametrize("box", [cudf.Series, cudf.Index]) +@pytest.mark.parametrize( + "value,na_like", + [ + [1, None], + [np.datetime64("2020-01-01", "ns"), np.datetime64("nat", "ns")], + ["s", None], + [1.0, np.nan], + ], + ids=repr, +) +def test_is_unique(box, value, na_like): + obj = box([value], 
nan_as_null=False) + assert obj.is_unique + + obj = box([value, value], nan_as_null=False) + assert not obj.is_unique + + obj = box([None, value], nan_as_null=False) + assert obj.is_unique + + obj = box([None, None, value], nan_as_null=False) + assert not obj.is_unique + + if na_like is not None: + obj = box([na_like, value], nan_as_null=False) + assert obj.is_unique + + obj = box([na_like, na_like], nan_as_null=False) + assert not obj.is_unique + + try: + if not np.isnat(na_like): + # pyarrow coerces nat to null + obj = box([None, na_like, value], nan_as_null=False) + assert obj.is_unique + except TypeError: + pass + + @pytest.fixture( params=[ pd.Series([0, 1, 2, np.nan, 4, None, 6]), diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py deleted file mode 100644 index 842b40a6d37..00000000000 --- a/python/cudf/cudf/tests/test_monotonic.py +++ /dev/null @@ -1,354 +0,0 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. - -""" -Tests related to is_unique, is_monotonic_increasing & -is_monotonic_decreasing attributes -""" - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Index, MultiIndex, Series -from cudf.core.index import CategoricalIndex, DatetimeIndex, RangeIndex - - -@pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) -def test_range_index(testrange): - index = RangeIndex( - start=testrange[0], stop=testrange[1], step=testrange[2] - ) - index_pd = pd.RangeIndex( - start=testrange[0], stop=testrange[1], step=testrange[2] - ) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [1, 2, 3, 4], - [1, 2, 3, 4, None], - [1, 2, 3, 3, 4], - [10, 9, 8, 7], - [10, 9, 8, 8, 7], - ["c", "d", "e", "f"], - ["c", "d", "e", "e", "f"], - ["c", "d", "e", "f", None], - ["z", "y", "x", "r"], - ["z", "y", "x", "x", "r"], - ], -) -def test_generic_index(testlist): - index = Index(testlist) - index_pd = pd.Index(testlist) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [1, 2, 3, 4, np.nan], - [10, 9, 8, np.nan, 7], - [10, 9, 8, 8, 7, np.nan], - ], -) -def test_float_index(testlist): - index_pd = pd.Index(testlist) - index = cudf.from_pandas(index_pd, nan_as_null=False) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - ["c", "d", "e", "f"], - ["c", "d", "e", "e", "f"], - ["z", "y", "x", "r"], - ["z", "y", "x", "x", "r"], - ], -) -def test_string_index(testlist): - index = cudf.Index(testlist) - index_pd = pd.Index(testlist) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", [["c", "d", "e", "f"], ["z", "y", "x", "r"]] -) -def test_categorical_index(testlist): - # Assuming unordered categorical data cannot be "monotonic" - raw_cat = pd.Categorical(testlist, ordered=True) - index = CategoricalIndex(raw_cat) - index_pd = 
pd.CategoricalIndex(raw_cat) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [ - "2001-01-01 00:00:00", - "2001-02-03 08:00:00", - "2001-03-08 16:00:00", - "2001-04-11 00:00:00", - ], - [ - "2001-04-11 00:00:00", - "2001-03-08 16:00:00", - "2001-02-03 08:00:00", - "2001-01-01 00:00:00", - ], - [ - "2001-04-11 00:00:00", - "2001-02-03 08:00:00", - "2001-03-08 16:00:00", - "2001-01-01 00:00:00", - ], - [ - "2001-04-11 00:00:00", - "2001-01-01 00:00:00", - "2001-02-03 08:00:00", - "2001-03-08 16:00:00", - "2001-01-01 00:00:00", - ], - ], -) -def test_datetime_index(testlist): - index = DatetimeIndex(testlist) - index_pd = pd.DatetimeIndex(testlist) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [1, 2, 3, 4], - [1, 2, 3, 3, 4], - [10, 9, 8, 7], - [10, 9, 8, 8, 7], - ["c", "d", "e", "f"], - ["c", "d", "e", "e", "f"], - ["z", "y", "x", "r"], - ["z", "y", "x", "x", "r"], - ], -) -def test_series(testlist): - series = Series(testlist) - series_pd = pd.Series(testlist) - - assert series.is_unique == series_pd.is_unique - assert series.is_monotonic_increasing == series_pd.is_monotonic_increasing - assert series.is_monotonic_decreasing == series_pd.is_monotonic_decreasing - - -def test_multiindex(): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame(rng.random(size=(7, 5))) - pdf.index = pd.MultiIndex( - [ - ["a", "b", "c"], - ["house", "store", "forest"], - ["clouds", "clear", "storm"], - ["fire", "smoke", "clear"], - ], - [ - [0, 0, 0, 0, 1, 1, 2], - [1, 1, 1, 1, 0, 0, 2], - [0, 0, 2, 2, 2, 0, 1], - [0, 0, 0, 1, 2, 0, 1], - ], - ) - pdf.index.names = ["alpha", "location", "weather", "sign"] - gdf = cudf.from_pandas(pdf) - - assert pdf.index.is_unique == gdf.index.is_unique - assert ( - pdf.index.is_monotonic_increasing == gdf.index.is_monotonic_increasing - ) - assert ( - pdf.index.is_monotonic_decreasing == gdf.index.is_monotonic_decreasing - ) - - -@pytest.mark.parametrize( - "testarr", - [ - ( - [ - ["bar", "bar", "foo", "foo", "qux", "qux", "qux"], - ["one", "two", "one", "two", "one", "two", "two"], - ], - ["first", "second"], - ), - ( - [ - ["bar", "bar", "foo", "foo", "qux", "qux"], - ["one", "two", "one", "two", "one", "two"], - ], - ["first", "second"], - ), - ], -) -def test_multiindex_tuples(testarr): - tuples = list(zip(*testarr[0], strict=True)) - - index = MultiIndex.from_tuples(tuples, names=testarr[1]) - index_pd = pd.MultiIndex.from_tuples(tuples, names=testarr[1]) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [10, 9, 8, 8, 7], - [2.0, 5.0, 4.0, 3.0, 7.0], - ["b", "d", "e", "a", "c"], - ["frog", "cat", "bat", "dog"], - ], -) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_get_slice_bound(testlist, side): - index = Index(testlist) - index_pd = pd.Index(testlist) - for label in testlist: - expect = index_pd.get_slice_bound(label, side) - got = index.get_slice_bound(label, side) - assert got == expect - - -@pytest.mark.parametrize("bounds", [(0, 10), (0, 1), (3, 
4), (0, 0), (3, 3)]) -@pytest.mark.parametrize( - "indices", - [[-1, 0, 5, 10, 11], [-1, 0, 1, 2], [2, 3, 4, 5], [-1, 0, 1], [2, 3, 4]], -) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_rangeindex_get_slice_bound_basic(bounds, indices, side): - start, stop = bounds - pd_index = pd.RangeIndex(start, stop) - cudf_index = RangeIndex(start, stop) - for idx in indices: - expect = pd_index.get_slice_bound(idx, side) - got = cudf_index.get_slice_bound(idx, side) - assert expect == got - - -@pytest.mark.parametrize( - "bounds", - [(3, 20, 5), (20, 3, -5), (20, 3, 5), (3, 20, -5), (0, 0, 2), (3, 3, 2)], -) -@pytest.mark.parametrize( - "label", - [3, 8, 13, 18, 20, 15, 10, 5, -1, 0, 19, 21, 6, 11, 17], -) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_rangeindex_get_slice_bound_step(bounds, label, side): - start, stop, step = bounds - pd_index = pd.RangeIndex(start, stop, step) - cudf_index = RangeIndex(start, stop, step) - - expect = pd_index.get_slice_bound(label, side) - got = cudf_index.get_slice_bound(label, side) - assert expect == got - - -@pytest.mark.parametrize("label", [1, 3, 5, 7, 9, 11]) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_get_slice_bound_missing(label, side): - mylist = [2, 4, 6, 8, 10] - index = Index(mylist) - index_pd = pd.Index(mylist) - - expect = index_pd.get_slice_bound(label, side) - got = index.get_slice_bound(label, side) - assert got == expect - - -@pytest.mark.parametrize("label", ["a", "c", "e", "g"]) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_get_slice_bound_missing_str(label, side): - mylist = ["b", "d", "f"] - index = Index(mylist) - index_pd = pd.Index(mylist) - got = index.get_slice_bound(label, side) - expect = index_pd.get_slice_bound(label, side) - assert got == expect - - -@pytest.mark.parametrize( - "data", - [ - [pd.Timestamp("2018-01-01"), pd.Timestamp("2019-01-31"), None], - [1, 2, 3, None], - [None, 1, 2, 3], - ["a", "b", "c", None], - [None, "a", "b", "c"], - ], -) -def test_is_monotonic_always_falls_for_null(data): - ser = Series(data) - assert ser.is_monotonic_increasing is False - assert ser.is_monotonic_decreasing is False - - -@pytest.mark.parametrize("box", [Series, Index]) -@pytest.mark.parametrize( - "value,na_like", - [ - [1, None], - [np.datetime64("2020-01-01", "ns"), np.datetime64("nat", "ns")], - ["s", None], - [1.0, np.nan], - ], - ids=repr, -) -def test_is_unique(box, value, na_like): - obj = box([value], nan_as_null=False) - assert obj.is_unique - - obj = box([value, value], nan_as_null=False) - assert not obj.is_unique - - obj = box([None, value], nan_as_null=False) - assert obj.is_unique - - obj = box([None, None, value], nan_as_null=False) - assert not obj.is_unique - - if na_like is not None: - obj = box([na_like, value], nan_as_null=False) - assert obj.is_unique - - obj = box([na_like, na_like], nan_as_null=False) - assert not obj.is_unique - - try: - if not np.isnat(na_like): - # pyarrow coerces nat to null - obj = box([None, na_like, value], nan_as_null=False) - assert obj.is_unique - except TypeError: - pass From fc762b899c2ccfdb676169e94996f3275642c53f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Aug 2025 13:57:57 -0700 Subject: [PATCH 094/366] Move test_rolling/ewm.py to new cudf classic directory structure (#19611) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - 
GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19611 --- python/cudf/cudf/tests/test_rolling.py | 538 ------------------ .../cudf/cudf/tests/{ => window}/test_ewm.py | 8 +- python/cudf/cudf/tests/window/test_rolling.py | 538 +++++++++++++++++- 3 files changed, 540 insertions(+), 544 deletions(-) delete mode 100644 python/cudf/cudf/tests/test_rolling.py rename python/cudf/cudf/tests/{ => window}/test_ewm.py (85%) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py deleted file mode 100644 index 30958f29a8f..00000000000 --- a/python/cudf/cudf/tests/test_rolling.py +++ /dev/null @@ -1,538 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. - -import math -import pickle - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq -from cudf.testing.dataset_generator import rand_dataframe - - -@pytest.mark.parametrize( - "data,index", - [ - ([], []), - ([1, 1, 1, 1], None), - ([1, 2, 3, 4], pd.date_range("2001-01-01", "2001-01-04")), - ([1, 2, 4, 9, 9, 4], ["a", "b", "c", "d", "e", "f"]), - ], -) -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "std", "var"] -) -@pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) -@pytest.mark.parametrize("center", [True, False]) -def test_rolling_series_basic(data, index, agg, nulls, center): - rng = np.random.default_rng(1) - - if len(data) > 0: - if nulls == "one": - p = rng.integers(0, len(data)) - data[p] = np.nan - elif nulls == "some": - p1, p2 = rng.integers(0, len(data), (2,)) - data[p1] = np.nan - data[p2] = np.nan - elif nulls == "all": - data = [np.nan] * len(data) - - psr = pd.Series(data, index=index) - gsr = cudf.from_pandas(psr) - for window_size in range(1, len(data) + 1): - for min_periods in range(1, window_size + 1): - expect = getattr( - psr.rolling(window_size, min_periods, center), agg - )().fillna(-1) - got = getattr( - gsr.rolling(window_size, min_periods, center), agg - )().fillna(-1) - assert_eq(expect, got, check_dtype=False, check_freq=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [], "b": []}, - {"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}, - {"a": [1, 2, 4, 9, 9, 4], "b": [1, 2, 4, 9, 9, 4]}, - { - "a": np.array([1, 2, 4, 9, 9, 4]), - "b": np.array([1.5, 2.2, 2.2, 8.0, 9.1, 4.2]), - }, - ], -) -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "std", "var"] -) -@pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) -@pytest.mark.parametrize("center", [True, False]) -def test_rolling_dataframe_basic(data, agg, nulls, center): - rng = np.random.default_rng(0) - pdf = pd.DataFrame(data) - - if len(pdf) > 0: - if nulls == "all": - pdf = pd.DataFrame(np.nan, columns=pdf.columns, index=pdf.index) - else: - for col_idx in range(len(pdf.columns)): - if nulls == "one": - p = rng.integers(0, len(data)) - pdf.iloc[p, col_idx] = np.nan - elif nulls == "some": - p1, p2 = rng.integers(0, len(data), (2,)) - pdf.iloc[p1, col_idx] = np.nan - pdf.iloc[p2, col_idx] = np.nan - - gdf = cudf.from_pandas(pdf) - for window_size in range(1, len(data) + 1): - for min_periods in range(1, window_size + 1): - expect = getattr( - pdf.rolling(window_size, min_periods, center), agg - )().fillna(-1) - got = getattr( - gdf.rolling(window_size, min_periods, center), agg - )().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "agg", - [ - "sum", - "min", - "max", - "mean", - "count", - "std", - "var", - ], -) 
-def test_rolling_with_offset(agg): - psr = pd.Series( - [1, 2, 4, 4, np.nan, 9], - index=[ - pd.Timestamp("20190101 09:00:00"), - pd.Timestamp("20190101 09:00:01"), - pd.Timestamp("20190101 09:00:02"), - pd.Timestamp("20190101 09:00:04"), - pd.Timestamp("20190101 09:00:07"), - pd.Timestamp("20190101 09:00:08"), - ], - ) - gsr = cudf.from_pandas(psr) - assert_eq( - getattr(psr.rolling("2s"), agg)().fillna(-1), - getattr(gsr.rolling("2s"), agg)().fillna(-1), - check_dtype=False, - ) - - -@pytest.mark.parametrize("agg", ["std", "var"]) -@pytest.mark.parametrize("ddof", [0, 1]) -@pytest.mark.parametrize("center", [True, False]) -@pytest.mark.parametrize("window_size", [2, 10, 100]) -def test_rolling_var_std_large(agg, ddof, center, window_size): - iupper_bound = math.sqrt(np.iinfo(np.int64).max / window_size) - ilower_bound = -math.sqrt(abs(np.iinfo(np.int64).min) / window_size) - - fupper_bound = math.sqrt(np.finfo(np.float64).max / window_size) - flower_bound = -math.sqrt(abs(np.finfo(np.float64).min) / window_size) - - n_rows = 1_000 - data = rand_dataframe( - dtypes_meta=[ - { - "dtype": "int64", - "null_frequency": 0.4, - "cardinality": n_rows, - "min_bound": ilower_bound, - "max_bound": iupper_bound, - }, - { - "dtype": "float64", - "null_frequency": 0.4, - "cardinality": n_rows, - "min_bound": flower_bound, - "max_bound": fupper_bound, - }, - { - "dtype": "decimal64", - "null_frequency": 0.4, - "cardinality": n_rows, - "min_bound": ilower_bound, - "max_bound": iupper_bound, - }, - ], - rows=n_rows, - use_threads=False, - seed=100, - ) - pdf = data.to_pandas() - gdf = cudf.from_pandas(pdf) - - expect = getattr(pdf.rolling(window_size, 1, center), agg)(ddof=ddof) - got = getattr(gdf.rolling(window_size, 1, center), agg)(ddof=ddof) - - import platform - - if platform.machine() == "aarch64": - # Due to pandas-37051, pandas rolling var/std on uniform window is - # not reliable. Skipping these rows when comparing. - for col in expect: - mask = (got[col].fillna(-1) != 0).to_pandas() - expect[col] = expect[col][mask] - got[col] = got[col][mask] - assert_eq(expect[col], got[col], check_freq=False) - else: - assert_eq(expect, got, check_freq=False) - - -def test_rolling_var_uniform_window(): - """ - Pandas adopts an online variance calculation algorithm. This gives a - floating point artifact. - - In cudf, each window is computed independently from the previous window, - this gives better numeric precision. - """ - - s = pd.Series([1e8, 5, 5, 5]) - expected = s.rolling(3).var() - got = cudf.from_pandas(s).rolling(3).var() - - assert_eq(expected, got) - - -def test_rolling_count_with_offset(): - """ - This test covers the xfail case from test_rolling_with_offset["count"]. - It is expected that count should return a non-Nan value, even if - the counted value is a Nan, unless the min-periods condition - is not met. - This behaviour is consistent with counts for rolling-windows, - in the non-offset window case. 
- """ - psr = pd.Series( - [1, 2, 4, 4, np.nan, 9], - index=[ - pd.Timestamp("20190101 09:00:00"), - pd.Timestamp("20190101 09:00:01"), - pd.Timestamp("20190101 09:00:02"), - pd.Timestamp("20190101 09:00:04"), - pd.Timestamp("20190101 09:00:07"), - pd.Timestamp("20190101 09:00:08"), - ], - ) - gsr = cudf.from_pandas(psr) - assert_eq( - getattr(gsr.rolling("2s"), "count")().fillna(-1), - pd.Series( - [1, 2, 2, 1, 0, 1], - index=[ - pd.Timestamp("20190101 09:00:00"), - pd.Timestamp("20190101 09:00:01"), - pd.Timestamp("20190101 09:00:02"), - pd.Timestamp("20190101 09:00:04"), - pd.Timestamp("20190101 09:00:07"), - pd.Timestamp("20190101 09:00:08"), - ], - ), - check_dtype=False, - ) - - -def test_rolling_getattr(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.rolling(2).a.sum().fillna(-1), - gdf.rolling(2).a.sum().fillna(-1), - check_dtype=False, - ) - - -def test_rolling_getitem(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.rolling(2)["a"].sum().fillna(-1), - gdf.rolling(2)["a"].sum().fillna(-1), - check_dtype=False, - ) - assert_eq( - pdf.rolling(2)["a", "b"].sum().fillna(-1), - gdf.rolling(2)["a", "b"].sum().fillna(-1), - check_dtype=False, - ) - assert_eq( - pdf.rolling(2)[["a", "b"]].sum().fillna(-1), - gdf.rolling(2)["a", "b"].sum().fillna(-1), - check_dtype=False, - ) - - -def test_rolling_getitem_window(): - index = pd.DatetimeIndex( - pd.date_range("2000-01-01", "2000-01-02", freq="1h") - ) - pdf = pd.DataFrame({"x": np.arange(len(index))}, index=index) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.rolling("2h").x.mean(), - gdf.rolling("2h").x.mean(), - check_freq=False, - ) - - -@pytest.mark.parametrize( - "data,index", [([1.2, 4.5, 5.9, 2.4, 9.3, 7.1], None), ([], [])] -) -@pytest.mark.parametrize("center", [True, False]) -def test_rollling_series_numba_udf_basic(data, index, center): - psr = pd.Series(data, index=index) - gsr = cudf.from_pandas(psr) - - def some_func(A): - b = 0 - for a in A: - b = max(b, math.sqrt(a)) - return b - - for window_size in range(1, len(data) + 1): - for min_periods in range(1, window_size + 1): - assert_eq( - psr.rolling(window_size, min_periods, center) - .apply(some_func) - .fillna(-1), - gsr.rolling(window_size, min_periods, center) - .apply(some_func) - .fillna(-1), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [], "b": []}, - {"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}, - {"a": [1, 2, 4, 9, 9, 4], "b": [1, 2, 4, 9, 9, 4]}, - { - "a": np.array([1, 2, 4, 9, 9, 4]), - "b": np.array([1.5, 2.2, 2.2, 8.0, 9.1, 4.2]), - }, - ], -) -@pytest.mark.parametrize("center", [True, False]) -def test_rolling_dataframe_numba_udf_basic(data, center): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - def some_func(A): - b = 0 - for a in A: - b = b + a**2 - return b / len(A) - - for window_size in range(1, len(data) + 1): - for min_periods in range(1, window_size + 1): - assert_eq( - pdf.rolling(window_size, min_periods, center) - .apply(some_func) - .fillna(-1), - gdf.rolling(window_size, min_periods, center) - .apply(some_func) - .fillna(-1), - check_dtype=False, - ) - - -def test_rolling_numba_udf_with_offset(): - psr = pd.Series( - [1, 2, 4, 4, 8, 9], - index=[ - pd.Timestamp("20190101 09:00:00"), - pd.Timestamp("20190101 09:00:01"), - pd.Timestamp("20190101 09:00:02"), - pd.Timestamp("20190101 09:00:04"), - pd.Timestamp("20190101 09:00:07"), - pd.Timestamp("20190101 09:00:08"), - ], - 
) - gsr = cudf.from_pandas(psr) - - def some_func(A): - b = 0 - for a in A: - b = b + a - return b / len(A) - - assert_eq( - psr.rolling("2s").apply(some_func).fillna(-1), - gsr.rolling("2s").apply(some_func).fillna(-1), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "var", "std"] -) -def test_rolling_groupby_simple(agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], - "b": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1], - } - ) - gdf = cudf.from_pandas(pdf) - - for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( - -1 - ) - got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 2], "b": [1, 1, 2, 2, 3], "c": [1, 2, 3, 4, 5]} - ) - gdf = cudf.from_pandas(pdf) - - for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( - -1 - ) - got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "var", "std"] -) -def test_rolling_groupby_multi(agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], - "b": [0, 0, 1, 1, 0, 1, 2, 1, 1, 0], - "c": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1], - } - ) - gdf = cudf.from_pandas(pdf) - - for window_size in range(1, len(pdf) + 1): - expect = getattr( - pdf.groupby(["a", "b"], sort=True).rolling(window_size), agg - )().fillna(-1) - got = getattr( - gdf.groupby(["a", "b"], sort=True).rolling(window_size), agg - )().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "var", "std"] -) -@pytest.mark.parametrize( - "window_size", ["1d", "2d", "3d", "4d", "5d", "6d", "7d"] -) -def test_rolling_groupby_offset(agg, window_size): - pdf = pd.DataFrame( - { - "date": pd.date_range(start="2016-01-01", periods=7, freq="D"), - "group": [1, 2, 2, 1, 1, 2, 1], - "val": [5, 6, 7, 8, 1, 2, 3], - } - ).set_index("date") - gdf = cudf.from_pandas(pdf) - expect = getattr(pdf.groupby("group").rolling(window_size), agg)().fillna( - -1 - ) - got = getattr(gdf.groupby("group").rolling(window_size), agg)().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - -def test_rolling_custom_index_support(): - from pandas.api.indexers import BaseIndexer - - class CustomIndexer(BaseIndexer): - def get_window_bounds( - self, num_values, min_periods, center, closed, step=None - ): - start = np.empty(num_values, dtype=np.int64) - end = np.empty(num_values, dtype=np.int64) - - for i in range(num_values): - if self.use_expanding[i]: - start[i] = 0 - end[i] = i + 1 - else: - start[i] = i - end[i] = i + self.window_size - - return start, end - - use_expanding = [True, False, True, False, True] - indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) - - df = pd.DataFrame({"values": range(5)}) - gdf = cudf.from_pandas(df) - - expected = df.rolling(window=indexer).sum() - actual = gdf.rolling(window=indexer).sum() - - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "indexer", - [ - pd.api.indexers.FixedForwardWindowIndexer(window_size=2), - pd.api.indexers.VariableOffsetWindowIndexer( - index=pd.date_range("2020", periods=5), offset=pd.offsets.BDay(1) - ), - ], -) -def test_rolling_indexer_support(indexer): - df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) - 
gdf = cudf.from_pandas(df) - - expected = df.rolling(window=indexer, min_periods=2).sum() - actual = gdf.rolling(window=indexer, min_periods=2).sum() - - assert_eq(expected, actual) - - -def test_rolling_series(): - df = cudf.DataFrame({"a": range(0, 100), "b": [10, 20, 30, 40, 50] * 20}) - pdf = df.to_pandas() - - expected = pdf.groupby("b")["a"].rolling(5).mean() - actual = df.groupby("b")["a"].rolling(5).mean() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("klass", ["DataFrame", "Series"]) -def test_pandas_compat_int_nan_min_periods(klass): - data = [None, 1, 2, None, 4, 6, 11] - with cudf.option_context("mode.pandas_compatible", True): - result = getattr(cudf, klass)(data).rolling(2, min_periods=1).sum() - expected = getattr(pd, klass)(data).rolling(2, min_periods=1).sum() - assert_eq(result, expected) - - result = getattr(cudf, klass)(data).rolling(2, min_periods=1).sum() - expected = getattr(cudf, klass)([None, 1, 3, 2, 4, 10, 17]) - assert_eq(result, expected) - - -def test_groupby_rolling_pickleable(): - df = cudf.DataFrame({"a": [1, 1, 2], "b": [1, 2, 3]}) - gb_rolling = pickle.loads(pickle.dumps(df.groupby("a").rolling(2))) - assert_eq(gb_rolling.obj, cudf.DataFrame({"b": [1, 2, 3]})) diff --git a/python/cudf/cudf/tests/test_ewm.py b/python/cudf/cudf/tests/window/test_ewm.py similarity index 85% rename from python/cudf/cudf/tests/test_ewm.py rename to python/cudf/cudf/tests/window/test_ewm.py index 6cb3c19d5a8..6d7734c5c00 100644 --- a/python/cudf/cudf/tests/test_ewm.py +++ b/python/cudf/cudf/tests/window/test_ewm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. import pytest import cudf @@ -35,12 +35,10 @@ def test_ewma(data, params, adjust): sets of keyword arguemnts that effect the raw coefficients of the formula """ - params["adjust"] = adjust - gsr = cudf.Series(data, dtype="float64") psr = gsr.to_pandas() - expect = psr.ewm(**params).mean() - got = gsr.ewm(**params).mean() + expect = psr.ewm(**params, adjust=adjust).mean() + got = gsr.ewm(**params, adjust=adjust).mean() assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/window/test_rolling.py b/python/cudf/cudf/tests/window/test_rolling.py index 06777c8e6af..4af4932b9ca 100644 --- a/python/cudf/cudf/tests/window/test_rolling.py +++ b/python/cudf/cudf/tests/window/test_rolling.py @@ -1 +1,537 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
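+# The tests below share one pattern: construct a pandas object, mirror it
+# on the GPU with cudf.from_pandas, run the same rolling operation on both,
+# and compare with assert_eq. A minimal sketch of that round trip
+# (assuming a working CUDA environment):
+#
+#     psr = pd.Series([1.0, 2.0, 3.0])
+#     gsr = cudf.from_pandas(psr)
+#     assert_eq(psr.rolling(2).sum(), gsr.rolling(2).sum())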
+import math +import pickle +import platform + +import numpy as np +import pandas as pd +import pytest +from pandas.api.indexers import BaseIndexer + +import cudf +from cudf.testing import assert_eq +from cudf.testing.dataset_generator import rand_dataframe + + +@pytest.fixture(params=[True, False]) +def center(request): + return request.param + + +@pytest.fixture +def supported_rolling_reductions(reduction_methods): + if reduction_methods in [ + "product", + "quantile", + "all", + "any", + "median", + "kurtosis", + "skew", + ]: + pytest.skip(f"{reduction_methods} not implemented") + return reduction_methods + + +@pytest.mark.parametrize( + "data,index", + [ + ([], []), + ([1, 1, 1, 1], None), + ([1, 2, 3, 4], pd.date_range("2001-01-01", "2001-01-04")), + ([1, 2, 4, 9, 9, 4], ["a", "b", "c", "d", "e", "f"]), + ], +) +@pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) +def test_rolling_series_basic( + data, index, supported_rolling_reductions, nulls, center, request +): + rng = np.random.default_rng(1) + + if len(data) > 0: + if nulls == "one": + p = rng.integers(0, len(data)) + data[p] = np.nan + elif nulls == "some": + p1, p2 = rng.integers(0, len(data), (2,)) + data[p1] = np.nan + data[p2] = np.nan + elif nulls == "all": + data = [np.nan] * len(data) + + psr = pd.Series(data, index=index) + gsr = cudf.from_pandas(psr) + for window_size in range(1, len(data) + 1): + for min_periods in range(1, window_size + 1): + expect = getattr( + psr.rolling(window_size, min_periods, center), + supported_rolling_reductions, + )().fillna(-1) + got = getattr( + gsr.rolling(window_size, min_periods, center), + supported_rolling_reductions, + )().fillna(-1) + assert_eq(expect, got, check_dtype=False, check_freq=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [], "b": []}, + {"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}, + {"a": [1, 2, 4, 9, 9, 4], "b": [1, 2, 4, 9, 9, 4]}, + { + "a": np.array([1, 2, 4, 9, 9, 4]), + "b": np.array([1.5, 2.2, 2.2, 8.0, 9.1, 4.2]), + }, + ], +) +@pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) +def test_rolling_dataframe_basic( + data, supported_rolling_reductions, nulls, center, request +): + rng = np.random.default_rng(0) + pdf = pd.DataFrame(data) + + if len(pdf) > 0: + if nulls == "all": + pdf = pd.DataFrame(np.nan, columns=pdf.columns, index=pdf.index) + else: + for col_idx in range(len(pdf.columns)): + if nulls == "one": + p = rng.integers(0, len(data)) + pdf.iloc[p, col_idx] = np.nan + elif nulls == "some": + p1, p2 = rng.integers(0, len(data), (2,)) + pdf.iloc[p1, col_idx] = np.nan + pdf.iloc[p2, col_idx] = np.nan + + gdf = cudf.from_pandas(pdf) + for window_size in range(1, len(data) + 1): + for min_periods in range(1, window_size + 1): + expect = getattr( + pdf.rolling(window_size, min_periods, center), + supported_rolling_reductions, + )().fillna(-1) + got = getattr( + gdf.rolling(window_size, min_periods, center), + supported_rolling_reductions, + )().fillna(-1) + assert_eq(expect, got, check_dtype=False) + + +def test_rolling_with_offset(supported_rolling_reductions): + psr = pd.Series( + [1, 2, 4, 4, np.nan, 9], + index=[ + pd.Timestamp("20190101 09:00:00"), + pd.Timestamp("20190101 09:00:01"), + pd.Timestamp("20190101 09:00:02"), + pd.Timestamp("20190101 09:00:04"), + pd.Timestamp("20190101 09:00:07"), + pd.Timestamp("20190101 09:00:08"), + ], + ) + gsr = cudf.from_pandas(psr) + assert_eq( + getattr(psr.rolling("2s"), supported_rolling_reductions)().fillna(-1), + getattr(gsr.rolling("2s"), 
supported_rolling_reductions)().fillna(-1),
+        check_dtype=False,
+    )
+
+
+@pytest.mark.parametrize("agg", ["std", "var"])
+@pytest.mark.parametrize("ddof", [0, 1])
+@pytest.mark.parametrize("window_size", [2, 100])
+def test_rolling_var_std_large(agg, ddof, center, window_size):
+    iupper_bound = math.sqrt(np.iinfo(np.int64).max / window_size)
+    ilower_bound = -math.sqrt(abs(np.iinfo(np.int64).min) / window_size)
+
+    fupper_bound = math.sqrt(np.finfo(np.float64).max / window_size)
+    flower_bound = -math.sqrt(abs(np.finfo(np.float64).min) / window_size)
+
+    n_rows = 1_000
+    data = rand_dataframe(
+        dtypes_meta=[
+            {
+                "dtype": "int64",
+                "null_frequency": 0.4,
+                "cardinality": n_rows,
+                "min_bound": ilower_bound,
+                "max_bound": iupper_bound,
+            },
+            {
+                "dtype": "float64",
+                "null_frequency": 0.4,
+                "cardinality": n_rows,
+                "min_bound": flower_bound,
+                "max_bound": fupper_bound,
+            },
+            {
+                "dtype": "decimal64",
+                "null_frequency": 0.4,
+                "cardinality": n_rows,
+                "min_bound": ilower_bound,
+                "max_bound": iupper_bound,
+            },
+        ],
+        rows=n_rows,
+        use_threads=False,
+        seed=100,
+    )
+    pdf = data.to_pandas()
+    gdf = cudf.from_pandas(pdf)
+
+    expect = getattr(pdf.rolling(window_size, 1, center), agg)(ddof=ddof)
+    got = getattr(gdf.rolling(window_size, 1, center), agg)(ddof=ddof)
+
+    if platform.machine() == "aarch64":
+        # Due to pandas-37051, pandas rolling var/std on a uniform window
+        # is not reliable. Skip those rows when comparing.
+        for col in expect:
+            mask = (got[col].fillna(-1) != 0).to_pandas()
+            expect[col] = expect[col][mask]
+            got[col] = got[col][mask]
+            assert_eq(expect[col], got[col], check_freq=False)
+    else:
+        assert_eq(expect, got, check_freq=False)
+
+
+def test_rolling_var_uniform_window():
+    """
+    Pandas adopts an online variance calculation algorithm, which can
+    leave a floating-point artifact.
+
+    In cudf, each window is computed independently of the previous
+    window, which gives better numeric precision.
+    """
+
+    s = pd.Series([1e8, 5, 5, 5])
+    expected = s.rolling(3).var()
+    got = cudf.from_pandas(s).rolling(3).var()
+
+    assert_eq(expected, got)
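+# A quick illustration of the artifact tested above (values approximate;
+# the exact residue depends on the pandas version): for
+# pd.Series([1e8, 5, 5, 5]).rolling(3).var(), pandas' online update
+# carries rounding error from the [1e8, 5, 5] window into the all-equal
+# [5, 5, 5] window, so it may report a tiny non-zero variance there;
+# cudf computes every window from scratch and returns exactly 0.0.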
+ """ + psr = pd.Series( + [1, 2, 4, 4, np.nan, 9], + index=[ + pd.Timestamp("20190101 09:00:00"), + pd.Timestamp("20190101 09:00:01"), + pd.Timestamp("20190101 09:00:02"), + pd.Timestamp("20190101 09:00:04"), + pd.Timestamp("20190101 09:00:07"), + pd.Timestamp("20190101 09:00:08"), + ], + ) + gsr = cudf.from_pandas(psr) + assert_eq( + getattr(gsr.rolling("2s"), "count")().fillna(-1), + pd.Series( + [1, 2, 2, 1, 0, 1], + index=[ + pd.Timestamp("20190101 09:00:00"), + pd.Timestamp("20190101 09:00:01"), + pd.Timestamp("20190101 09:00:02"), + pd.Timestamp("20190101 09:00:04"), + pd.Timestamp("20190101 09:00:07"), + pd.Timestamp("20190101 09:00:08"), + ], + ), + check_dtype=False, + ) + + +def test_rolling_getattr(): + pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.rolling(2).a.sum().fillna(-1), + gdf.rolling(2).a.sum().fillna(-1), + check_dtype=False, + ) + + +def test_rolling_getitem(): + pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.rolling(2)["a"].sum().fillna(-1), + gdf.rolling(2)["a"].sum().fillna(-1), + check_dtype=False, + ) + assert_eq( + pdf.rolling(2)["a", "b"].sum().fillna(-1), + gdf.rolling(2)["a", "b"].sum().fillna(-1), + check_dtype=False, + ) + assert_eq( + pdf.rolling(2)[["a", "b"]].sum().fillna(-1), + gdf.rolling(2)["a", "b"].sum().fillna(-1), + check_dtype=False, + ) + + +def test_rolling_getitem_window(): + index = pd.DatetimeIndex( + pd.date_range("2000-01-01", "2000-01-02", freq="1h") + ) + pdf = pd.DataFrame({"x": np.arange(len(index))}, index=index) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.rolling("2h").x.mean(), + gdf.rolling("2h").x.mean(), + check_freq=False, + ) + + +@pytest.mark.parametrize( + "data,index", [([1.2, 4.5, 5.9, 2.4, 9.3, 7.1], None), ([], [])] +) +def test_rollling_series_numba_udf_basic(data, index, center): + psr = pd.Series(data, index=index) + gsr = cudf.from_pandas(psr) + + def some_func(A): + b = 0 + for a in A: + b = max(b, math.sqrt(a)) + return b + + for window_size in range(1, len(data) + 1): + for min_periods in range(1, window_size + 1): + assert_eq( + psr.rolling(window_size, min_periods, center) + .apply(some_func) + .fillna(-1), + gsr.rolling(window_size, min_periods, center) + .apply(some_func) + .fillna(-1), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [], "b": []}, + {"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}, + {"a": [1, 2, 4, 9, 9, 4], "b": [1, 2, 4, 9, 9, 4]}, + { + "a": np.array([1, 2, 4, 9, 9, 4]), + "b": np.array([1.5, 2.2, 2.2, 8.0, 9.1, 4.2]), + }, + ], +) +def test_rolling_dataframe_numba_udf_basic(data, center): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + def some_func(A): + b = 0 + for a in A: + b = b + a**2 + return b / len(A) + + for window_size in range(1, len(data) + 1): + for min_periods in range(1, window_size + 1): + assert_eq( + pdf.rolling(window_size, min_periods, center) + .apply(some_func) + .fillna(-1), + gdf.rolling(window_size, min_periods, center) + .apply(some_func) + .fillna(-1), + check_dtype=False, + ) + + +def test_rolling_numba_udf_with_offset(): + psr = pd.Series( + [1, 2, 4, 4, 8, 9], + index=[ + pd.Timestamp("20190101 09:00:00"), + pd.Timestamp("20190101 09:00:01"), + pd.Timestamp("20190101 09:00:02"), + pd.Timestamp("20190101 09:00:04"), + pd.Timestamp("20190101 09:00:07"), + pd.Timestamp("20190101 09:00:08"), + ], + ) + gsr = cudf.from_pandas(psr) + + def some_func(A): + b = 0 + for a in A: + b = b + a + return b / 
+
+
+@pytest.mark.parametrize(
+    "data,index", [([1.2, 4.5, 5.9, 2.4, 9.3, 7.1], None), ([], [])]
+)
+def test_rolling_series_numba_udf_basic(data, index, center):
+    psr = pd.Series(data, index=index)
+    gsr = cudf.from_pandas(psr)
+
+    def some_func(A):
+        b = 0
+        for a in A:
+            b = max(b, math.sqrt(a))
+        return b
+
+    for window_size in range(1, len(data) + 1):
+        for min_periods in range(1, window_size + 1):
+            assert_eq(
+                psr.rolling(window_size, min_periods, center)
+                .apply(some_func)
+                .fillna(-1),
+                gsr.rolling(window_size, min_periods, center)
+                .apply(some_func)
+                .fillna(-1),
+                check_dtype=False,
+            )
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"a": [], "b": []},
+        {"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]},
+        {"a": [1, 2, 4, 9, 9, 4], "b": [1, 2, 4, 9, 9, 4]},
+        {
+            "a": np.array([1, 2, 4, 9, 9, 4]),
+            "b": np.array([1.5, 2.2, 2.2, 8.0, 9.1, 4.2]),
+        },
+    ],
+)
+def test_rolling_dataframe_numba_udf_basic(data, center):
+    pdf = pd.DataFrame(data)
+    gdf = cudf.from_pandas(pdf)
+
+    def some_func(A):
+        b = 0
+        for a in A:
+            b = b + a**2
+        return b / len(A)
+
+    for window_size in range(1, len(data) + 1):
+        for min_periods in range(1, window_size + 1):
+            assert_eq(
+                pdf.rolling(window_size, min_periods, center)
+                .apply(some_func)
+                .fillna(-1),
+                gdf.rolling(window_size, min_periods, center)
+                .apply(some_func)
+                .fillna(-1),
+                check_dtype=False,
+            )
+
+
+def test_rolling_numba_udf_with_offset():
+    psr = pd.Series(
+        [1, 2, 4, 4, 8, 9],
+        index=[
+            pd.Timestamp("20190101 09:00:00"),
+            pd.Timestamp("20190101 09:00:01"),
+            pd.Timestamp("20190101 09:00:02"),
+            pd.Timestamp("20190101 09:00:04"),
+            pd.Timestamp("20190101 09:00:07"),
+            pd.Timestamp("20190101 09:00:08"),
+        ],
+    )
+    gsr = cudf.from_pandas(psr)
+
+    def some_func(A):
+        b = 0
+        for a in A:
+            b = b + a
+        return b / len(A)
+
+    assert_eq(
+        psr.rolling("2s").apply(some_func).fillna(-1),
+        gsr.rolling("2s").apply(some_func).fillna(-1),
+        check_dtype=False,
+    )
+
+
+def test_rolling_groupby_simple(supported_rolling_reductions):
+    pdf = pd.DataFrame(
+        {
+            "a": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
+            "b": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    for window_size in range(1, len(pdf) + 1):
+        expect = getattr(
+            pdf.groupby("a").rolling(window_size), supported_rolling_reductions
+        )().fillna(-1)
+        got = getattr(
+            gdf.groupby("a").rolling(window_size), supported_rolling_reductions
+        )().fillna(-1)
+        assert_eq(expect, got, check_dtype=False)
+
+    pdf = pd.DataFrame(
+        {"a": [1, 1, 1, 2, 2], "b": [1, 1, 2, 2, 3], "c": [1, 2, 3, 4, 5]}
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    for window_size in range(1, len(pdf) + 1):
+        expect = getattr(
+            pdf.groupby("a").rolling(window_size), supported_rolling_reductions
+        )().fillna(-1)
+        got = getattr(
+            gdf.groupby("a").rolling(window_size), supported_rolling_reductions
+        )().fillna(-1)
+        assert_eq(expect, got, check_dtype=False)
+
+
+def test_rolling_groupby_multi(supported_rolling_reductions):
+    pdf = pd.DataFrame(
+        {
+            "a": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
+            "b": [0, 0, 1, 1, 0, 1, 2, 1, 1, 0],
+            "c": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    for window_size in range(1, len(pdf) + 1):
+        expect = getattr(
+            pdf.groupby(["a", "b"], sort=True).rolling(window_size),
+            supported_rolling_reductions,
+        )().fillna(-1)
+        got = getattr(
+            gdf.groupby(["a", "b"], sort=True).rolling(window_size),
+            supported_rolling_reductions,
+        )().fillna(-1)
+        assert_eq(expect, got, check_dtype=False)
+
+
+@pytest.mark.parametrize("window_size", ["1d", "3d", "6d", "7d"])
+def test_rolling_groupby_offset(supported_rolling_reductions, window_size):
+    pdf = pd.DataFrame(
+        {
+            "date": pd.date_range(start="2016-01-01", periods=7, freq="D"),
+            "group": [1, 2, 2, 1, 1, 2, 1],
+            "val": [5, 6, 7, 8, 1, 2, 3],
+        }
+    ).set_index("date")
+    gdf = cudf.from_pandas(pdf)
+    expect = getattr(
+        pdf.groupby("group").rolling(window_size), supported_rolling_reductions
+    )().fillna(-1)
+    got = getattr(
+        gdf.groupby("group").rolling(window_size), supported_rolling_reductions
+    )().fillna(-1)
+    assert_eq(expect, got, check_dtype=False)
+
+
+def test_rolling_custom_index_support():
+    class CustomIndexer(BaseIndexer):
+        def get_window_bounds(
+            self, num_values, min_periods, center, closed, step=None
+        ):
+            start = np.empty(num_values, dtype=np.int64)
+            end = np.empty(num_values, dtype=np.int64)
+
+            for i in range(num_values):
+                if self.use_expanding[i]:
+                    start[i] = 0
+                    end[i] = i + 1
+                else:
+                    start[i] = i
+                    end[i] = i + self.window_size
+
+            return start, end
+
+    use_expanding = [True, False, True, False, True]
+    indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
+
+    df = pd.DataFrame({"values": range(5)})
+    gdf = cudf.from_pandas(df)
+
+    expected = df.rolling(window=indexer).sum()
+    actual = gdf.rolling(window=indexer).sum()
+
+    assert_eq(expected, actual, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "indexer",
+    [
+        pd.api.indexers.FixedForwardWindowIndexer(window_size=2),
+        pd.api.indexers.VariableOffsetWindowIndexer(
+            index=pd.date_range("2020", periods=5), offset=pd.offsets.BDay(1)
+        ),
+    ],
+)
+def test_rolling_indexer_support(indexer):
+    df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
+    gdf = cudf.from_pandas(df)
+
+    expected = df.rolling(window=indexer, min_periods=2).sum()
+    actual = gdf.rolling(window=indexer,
min_periods=2).sum() + + assert_eq(expected, actual) + + +def test_rolling_series(): + df = cudf.DataFrame({"a": range(0, 100), "b": [10, 20, 30, 40, 50] * 20}) + pdf = df.to_pandas() + + expected = pdf.groupby("b")["a"].rolling(5).mean() + actual = df.groupby("b")["a"].rolling(5).mean() + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("klass", ["DataFrame", "Series"]) +def test_pandas_compat_int_nan_min_periods(klass): + data = [None, 1, 2, None, 4, 6, 11] + with cudf.option_context("mode.pandas_compatible", True): + result = getattr(cudf, klass)(data).rolling(2, min_periods=1).sum() + expected = getattr(pd, klass)(data).rolling(2, min_periods=1).sum() + assert_eq(result, expected) + + result = getattr(cudf, klass)(data).rolling(2, min_periods=1).sum() + expected = getattr(cudf, klass)([None, 1, 3, 2, 4, 10, 17]) + assert_eq(result, expected) + + +def test_groupby_rolling_pickleable(): + df = cudf.DataFrame({"a": [1, 1, 2], "b": [1, 2, 3]}) + gb_rolling = pickle.loads(pickle.dumps(df.groupby("a").rolling(2))) + assert_eq(gb_rolling.obj, cudf.DataFrame({"b": [1, 2, 3]})) From f9d6eccb2c65bffa7fc3941d602de572a56e6516 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:16:26 -0700 Subject: [PATCH 095/366] Use more pytest fixtures and avoid GPU parameterization in test_binops/column/column_accessor/contains.py and more (#19473) Towards https://github.com/rapidsai/cudf/issues/9999 * Use more pytest fixtures * Avoids pytest.mark.parametrize with GPU objects * Eliminate/reduce parameterizations of input size Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19473 --- python/cudf/cudf/tests/conftest.py | 76 ++ python/cudf/cudf/tests/test_binops.py | 971 +++++++----------- python/cudf/cudf/tests/test_buffer.py | 49 +- python/cudf/cudf/tests/test_categorical.py | 152 ++- python/cudf/cudf/tests/test_column.py | 54 +- .../cudf/cudf/tests/test_column_accessor.py | 53 +- python/cudf/cudf/tests/test_contains.py | 97 +- python/cudf/cudf/tests/test_copying.py | 70 +- 8 files changed, 666 insertions(+), 856 deletions(-) diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index b3b48d19538..7a8a6c3881e 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,6 +1,7 @@ # Copyright (c) 2019-2025, NVIDIA CORPORATION. 
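+# Shared operator fixtures (arithmetic_op, comparison_op, binary_op and
+# their *_method variants) are defined further down in this module so test
+# files can request them as arguments instead of repeating
+# @pytest.mark.parametrize lists of operators. A hypothetical usage sketch:
+#
+#     def test_op_matches_pandas(arithmetic_op):
+#         gsr = cudf.Series([1, 2, 3])
+#         assert_eq(arithmetic_op(gsr, 2), arithmetic_op(gsr.to_pandas(), 2))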
import itertools +import operator import os import pathlib @@ -192,6 +193,69 @@ def set_decomp_env_vars(monkeypatch, request): yield +arithmetic_ops = [ + operator.add, + operator.sub, + operator.mul, + operator.floordiv, + operator.truediv, + operator.mod, + operator.pow, +] +comparison_ops = [ + operator.eq, + operator.ne, + operator.lt, + operator.le, + operator.gt, + operator.ge, +] + + +@pytest.fixture(params=arithmetic_ops) +def arithmetic_op(request): + return request.param + + +@pytest.fixture( + params=itertools.chain.from_iterable( + (op.__name__, f"r{op.__name__}") for op in arithmetic_ops + ) +) +def arithmetic_op_method(request): + """Arithmetic methods defined on Series/DataFrame""" + return request.param + + +@pytest.fixture(params=comparison_ops) +def comparison_op(request): + return request.param + + +@pytest.fixture +def comparison_op_method(comparison_op): + """Comparison methods defined on Series/DataFrame""" + return comparison_op.__name__ + + +@pytest.fixture(params=arithmetic_ops + comparison_ops) +def binary_op(request): + return request.param + + +@pytest.fixture( + params=itertools.chain( + itertools.chain.from_iterable( + (op.__name__, f"r{op.__name__}") for op in arithmetic_ops + ), + (op.__name__ for op in comparison_ops), + ) +) +def binary_op_method(request): + """Binary methods defined on Series/DataFrame""" + return request.param + + @pytest.fixture( params=[ "min", @@ -250,6 +314,12 @@ def integer_types_as_str(request): return request.param +@pytest.fixture +def integer_types_as_str2(integer_types_as_str): + """Used for testing cartesian product of integer_types_as_str""" + return integer_types_as_str + + @pytest.fixture(params=float_types) def float_types_as_str(request): """ @@ -270,6 +340,12 @@ def numeric_types_as_str(request): return request.param +@pytest.fixture +def numeric_types_as_str2(numeric_types_as_str): + """Used for testing cartesian product of numeric_types_as_str""" + return numeric_types_as_str + + @pytest.fixture( params=signed_integer_types + unsigned_integer_types diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 34e568215ae..3a5326fb350 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -2,10 +2,11 @@ import datetime import decimal +import itertools import operator import re import warnings -from itertools import combinations_with_replacement, product +from concurrent.futures import ThreadPoolExecutor import cupy as cp import numpy as np @@ -31,112 +32,12 @@ ) STRING_TYPES = {"str"} - -_binops = [ - operator.add, - operator.sub, - operator.mul, - operator.floordiv, - operator.truediv, - operator.mod, - operator.pow, -] - -_binops_compare = [ - operator.eq, - operator.ne, - operator.lt, - operator.le, - operator.gt, - operator.ge, -] - -_bitwise_binops = [operator.and_, operator.or_, operator.xor] - -_int_types = [ - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - "uint32", -] - -_cmpops = [ - operator.lt, - operator.gt, - operator.le, - operator.ge, - operator.eq, - operator.ne, -] - -_reflected_ops = [ - lambda x: 1 + x, - lambda x: 2 * x, - lambda x: 2 - x, - lambda x: 2 // x, - lambda x: 2 / x, - lambda x: 3 + x, - lambda x: 3 * x, - lambda x: 3 - x, - lambda x: 3 // x, - lambda x: 3 / x, - lambda x: 3 % x, - lambda x: -1 + x, - lambda x: -2 * x, - lambda x: -2 - x, - lambda x: -2 // x, - lambda x: -2 / x, - lambda x: -3 + x, - lambda x: -3 * x, - lambda x: -3 - x, - lambda x: -3 // x, - lambda x: -3 / x, - lambda x: -3 
% x, - lambda x: 0 + x, - lambda x: 0 * x, - lambda x: 0 - x, - lambda x: 0 // x, - lambda x: 0 / x, -] - -_operators_arithmetic = [ - "add", - "radd", - "sub", - "rsub", - "mul", - "rmul", - "mod", - "rmod", - "pow", - "rpow", - "div", - "divide", - "floordiv", - "rfloordiv", - "truediv", - "rtruediv", -] - -_operators_comparison = ["eq", "ne", "lt", "le", "gt", "ge"] - - pytest_xfail = pytest.mark.xfail pytestmark = pytest.mark.spilling # If spilling is enabled globally, we skip many test permutations # to reduce running time. if get_global_manager() is not None: - _binops = _binops[:1] - _binops_compare = _binops_compare[:1] - _int_types = _int_types[-1:] - _cmpops = _cmpops[:1] - _reflected_ops = _reflected_ops[:1] - _operators_arithmetic = _operators_arithmetic[:1] - _operators_comparison = _operators_comparison[:1] DATETIME_TYPES = {"datetime64[ms]"} NUMERIC_TYPES = {"float32"} FLOAT_TYPES = {"float64"} @@ -147,11 +48,10 @@ @pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize("binop", _binops) -def test_series_binop(request, binop, obj_class): +def test_series_binop(request, arithmetic_op, obj_class): request.applymarker( pytest.mark.xfail( - binop is operator.floordiv, + arithmetic_op is operator.floordiv, reason="https://github.com/rapidsai/cudf/issues/17073", ) ) @@ -169,8 +69,8 @@ def test_series_binop(request, binop, obj_class): sr1 = Index(sr1) sr2 = Index(sr2) - expect = binop(psr1, psr2) - result = binop(sr1, sr2) + expect = arithmetic_op(psr1, psr2) + result = arithmetic_op(sr1, sr2) if obj_class == "Index": result = Series(result) @@ -178,28 +78,25 @@ def test_series_binop(request, binop, obj_class): assert_eq(result, expect) -@pytest.mark.parametrize("binop", _binops) -def test_series_binop_concurrent(binop): +def test_series_binop_concurrent(arithmetic_op): def func(index): rng = np.random.default_rng(seed=0) arr = rng.random(100) * 10 sr = Series(arr) - result = binop(sr.astype("int32"), sr) - expect = binop(arr.astype("int32"), arr) + result = arithmetic_op(sr.astype("int32"), sr) + expect = arithmetic_op(arr.astype("int32"), arr) np.testing.assert_almost_equal(result.to_numpy(), expect, decimal=5) - from concurrent.futures import ThreadPoolExecutor - indices = range(10) with ThreadPoolExecutor(4) as e: # four processes list(e.map(func, indices)) @pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize("nelem,binop", list(product([1, 2, 100], _binops))) -def test_series_binop_scalar(nelem, binop, obj_class): +def test_series_binop_scalar(arithmetic_op, obj_class): + nelem = 10 rng = np.random.default_rng(seed=0) arr = rng.random(nelem) rhs = rng.choice(arr).item() @@ -208,25 +105,24 @@ def test_series_binop_scalar(nelem, binop, obj_class): if obj_class == "Index": sr = Index(sr) - result = binop(sr, rhs) + result = arithmetic_op(sr, rhs) if obj_class == "Index": result = Series(result) - np.testing.assert_almost_equal(result.to_numpy(), binop(arr, rhs)) + np.testing.assert_almost_equal(result.to_numpy(), arithmetic_op(arr, rhs)) @pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize("binop", _bitwise_binops) -@pytest.mark.parametrize( - "lhs_dtype,rhs_dtype", list(product(_int_types, _int_types)) -) -def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype): +@pytest.mark.parametrize("binop", [operator.and_, operator.or_, operator.xor]) +def test_series_bitwise_binop( + binop, obj_class, integer_types_as_str, integer_types_as_str2 +): rng = np.random.default_rng(seed=0) 
- arr1 = (rng.random(100) * 100).astype(lhs_dtype) + arr1 = (rng.random(100) * 100).astype(integer_types_as_str) sr1 = Series(arr1) - arr2 = (rng.random(100) * 100).astype(rhs_dtype) + arr2 = (rng.random(100) * 100).astype(integer_types_as_str2) sr2 = Series(arr2) if obj_class == "Index": @@ -242,14 +138,12 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype): @pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize("cmpop", _cmpops) -@pytest.mark.parametrize( - "dtype", ["int8", "int32", "int64", "float32", "float64", "datetime64[ms]"] -) -def test_series_compare(cmpop, obj_class, dtype): +def test_series_compare( + comparison_op, obj_class, numeric_and_temporal_types_as_str +): rng = np.random.default_rng(seed=0) - arr1 = rng.integers(0, 100, 100).astype(dtype) - arr2 = rng.integers(0, 100, 100).astype(dtype) + arr1 = rng.integers(0, 100, 100).astype(numeric_and_temporal_types_as_str) + arr2 = rng.integers(0, 100, 100).astype(numeric_and_temporal_types_as_str) sr1 = Series(arr1) sr2 = Series(arr2) @@ -257,73 +151,58 @@ def test_series_compare(cmpop, obj_class, dtype): sr1 = Index(sr1) sr2 = Index(sr2) - result1 = cmpop(sr1, sr1) - result2 = cmpop(sr2, sr2) - result3 = cmpop(sr1, sr2) + result1 = comparison_op(sr1, sr1) + result2 = comparison_op(sr2, sr2) + result3 = comparison_op(sr1, sr2) if obj_class == "Index": result1 = Series(result1) result2 = Series(result2) result3 = Series(result3) - np.testing.assert_equal(result1.to_numpy(), cmpop(arr1, arr1)) - np.testing.assert_equal(result2.to_numpy(), cmpop(arr2, arr2)) - np.testing.assert_equal(result3.to_numpy(), cmpop(arr1, arr2)) + np.testing.assert_equal(result1.to_numpy(), comparison_op(arr1, arr1)) + np.testing.assert_equal(result2.to_numpy(), comparison_op(arr2, arr2)) + np.testing.assert_equal(result3.to_numpy(), comparison_op(arr1, arr2)) @pytest.mark.parametrize( "dtype,val", [("int8", 200), ("int32", 2**32), ("uint8", -128), ("uint64", -1)], ) -@pytest.mark.parametrize( - "op", - [ - operator.eq, - operator.ne, - operator.lt, - operator.le, - operator.gt, - operator.ge, - ], -) @pytest.mark.parametrize("reverse", [False, True]) -def test_series_compare_integer(dtype, val, op, reverse): +def test_series_compare_integer(dtype, val, comparison_op, reverse): # Tests that these actually work, even though they are out of bound. force_cast_val = np.array(val).astype(dtype) sr = Series( [np.iinfo(dtype).min, np.iinfo(dtype).max, force_cast_val, None], dtype=dtype, ) - - if reverse: - _op = op - - def op(x, y): - return _op(y, x) - # We expect the same result as comparing to a value within range (e.g. 
0) # except that a NULL value evaluates to False - if op(0, val): - expected = Series([True, True, True, None]) + exp = False + if reverse: + if comparison_op(val, 0): + exp = True + res = comparison_op(val, sr) else: - expected = Series([False, False, False, None]) + if comparison_op(0, val): + exp = True + res = comparison_op(sr, val) - res = op(sr, val) + expected = Series([exp, exp, exp, None]) assert_eq(res, expected) -def _series_compare_nulls_typegen(): - return [ - *combinations_with_replacement(DATETIME_TYPES, 2), - *combinations_with_replacement(TIMEDELTA_TYPES, 2), - *combinations_with_replacement(NUMERIC_TYPES, 2), - *combinations_with_replacement(STRING_TYPES, 2), - ] - - -@pytest.mark.parametrize("cmpop", _cmpops) -@pytest.mark.parametrize("dtypes", _series_compare_nulls_typegen()) -def test_series_compare_nulls(cmpop, dtypes): +@pytest.mark.parametrize( + "dtypes", + [ + *itertools.combinations_with_replacement(DATETIME_TYPES, 2), + *itertools.combinations_with_replacement(TIMEDELTA_TYPES, 2), + *itertools.combinations_with_replacement(NUMERIC_TYPES, 2), + *itertools.combinations_with_replacement(STRING_TYPES, 2), + ], +) +def test_series_compare_nulls(comparison_op, dtypes): ltype, rtype = dtypes ldata = [1, 2, None, None, 5] @@ -337,9 +216,9 @@ def test_series_compare_nulls(cmpop, dtypes): expect_mask = np.logical_and(lmask, rmask) expect = cudf.Series([None] * 5, dtype="bool") - expect[expect_mask] = cmpop(lser[expect_mask], rser[expect_mask]) + expect[expect_mask] = comparison_op(lser[expect_mask], rser[expect_mask]) - got = cmpop(lser, rser) + got = comparison_op(lser, rser) assert_eq(expect, got) @@ -348,11 +227,6 @@ def str_series_cmp_data(): return pd.Series(["a", "b", None, "d", "e", None], dtype="string") -@pytest.fixture(ids=[op.__name__ for op in _cmpops], params=_cmpops) -def str_series_compare_str_cmpop(request): - return request.param - - @pytest.fixture(ids=["eq", "ne"], params=[operator.eq, operator.ne]) def str_series_compare_num_cmpop(request): return request.param @@ -363,24 +237,16 @@ def cmp_scalar(request): return request.param -def test_str_series_compare_str( - str_series_cmp_data, str_series_compare_str_cmpop -): - expect = str_series_compare_str_cmpop(str_series_cmp_data, "a") - got = str_series_compare_str_cmpop( - Series.from_pandas(str_series_cmp_data), "a" - ) +def test_str_series_compare_str(str_series_cmp_data, comparison_op): + expect = comparison_op(str_series_cmp_data, "a") + got = comparison_op(Series.from_pandas(str_series_cmp_data), "a") assert_eq(expect, got.to_pandas(nullable=True)) -def test_str_series_compare_str_reflected( - str_series_cmp_data, str_series_compare_str_cmpop -): - expect = str_series_compare_str_cmpop("a", str_series_cmp_data) - got = str_series_compare_str_cmpop( - "a", Series.from_pandas(str_series_cmp_data) - ) +def test_str_series_compare_str_reflected(str_series_cmp_data, comparison_op): + expect = comparison_op("a", str_series_cmp_data) + got = comparison_op("a", Series.from_pandas(str_series_cmp_data)) assert_eq(expect, got.to_pandas(nullable=True)) @@ -408,10 +274,8 @@ def test_str_series_compare_num_reflected( @pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize("nelem", [1, 2, 100]) -@pytest.mark.parametrize("cmpop", _cmpops) @pytest.mark.parametrize("dtype", [*utils.NUMERIC_TYPES, "datetime64[ms]"]) -def test_series_compare_scalar(nelem, cmpop, obj_class, dtype): +def test_series_compare_scalar(comparison_op, obj_class, dtype): rng = np.random.default_rng(seed=0) arr1 = 
rng.integers(0, 100, 100).astype(dtype) sr1 = Series(arr1) @@ -420,23 +284,24 @@ def test_series_compare_scalar(nelem, cmpop, obj_class, dtype): if obj_class == "Index": sr1 = Index(sr1) - result1 = cmpop(sr1, rhs) - result2 = cmpop(rhs, sr1) + result1 = comparison_op(sr1, rhs) + result2 = comparison_op(rhs, sr1) if obj_class == "Index": result1 = Series(result1) result2 = Series(result2) - np.testing.assert_equal(result1.to_numpy(), cmpop(arr1, rhs)) - np.testing.assert_equal(result2.to_numpy(), cmpop(rhs, arr1)) + np.testing.assert_equal(result1.to_numpy(), comparison_op(arr1, rhs)) + np.testing.assert_equal(result2.to_numpy(), comparison_op(rhs, arr1)) _nulls = ["none", "some"] -@pytest.mark.parametrize("nelem", [1, 7, 8, 9, 32, 64, 128]) -@pytest.mark.parametrize("lhs_nulls,rhs_nulls", list(product(_nulls, _nulls))) -def test_validity_add(nelem, lhs_nulls, rhs_nulls): +@pytest.mark.parametrize("lhs_nulls", _nulls) +@pytest.mark.parametrize("rhs_nulls", _nulls) +def test_validity_add(lhs_nulls, rhs_nulls): + nelem = 10 rng = np.random.default_rng(seed=0) # LHS lhs_data = rng.random(nelem) @@ -485,21 +350,14 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls): @pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize( - "binop,lhs_dtype,rhs_dtype", - list( - product( - [operator.add, operator.mul], - utils.NUMERIC_TYPES, - utils.NUMERIC_TYPES, - ) - ), -) -def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class): +@pytest.mark.parametrize("binop", [operator.add, operator.mul]) +def test_series_binop_mixed_dtype( + binop, numeric_types_as_str, numeric_types_as_str2, obj_class +): nelem = 10 rng = np.random.default_rng(seed=0) - lhs = (rng.random(nelem) * nelem).astype(lhs_dtype) - rhs = (rng.random(nelem) * nelem).astype(rhs_dtype) + lhs = (rng.random(nelem) * nelem).astype(numeric_types_as_str) + rhs = (rng.random(nelem) * nelem).astype(numeric_types_as_str2) sr1 = Series(lhs) sr2 = Series(rhs) @@ -517,15 +375,13 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class): @pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize( - "cmpop,lhs_dtype,rhs_dtype", - list(product(_cmpops, utils.NUMERIC_TYPES, utils.NUMERIC_TYPES)), -) -def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class): +def test_series_cmpop_mixed_dtype( + comparison_op, numeric_types_as_str, numeric_types_as_str2, obj_class +): nelem = 5 rng = np.random.default_rng(seed=0) - lhs = (rng.random(nelem) * nelem).astype(lhs_dtype) - rhs = (rng.random(nelem) * nelem).astype(rhs_dtype) + lhs = (rng.random(nelem) * nelem).astype(numeric_types_as_str) + rhs = (rng.random(nelem) * nelem).astype(numeric_types_as_str2) sr1 = Series(lhs) sr2 = Series(rhs) @@ -534,28 +390,30 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class): sr1 = Index(sr1) sr2 = Index(sr2) - result = cmpop(Series(sr1), Series(sr2)) + result = comparison_op(Series(sr1), Series(sr2)) if obj_class == "Index": result = Series(result) - np.testing.assert_array_equal(result.to_numpy(), cmpop(lhs, rhs)) + np.testing.assert_array_equal(result.to_numpy(), comparison_op(lhs, rhs)) -@pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize( - "func, dtype", list(product(_reflected_ops, utils.NUMERIC_TYPES)) +@pytest.mark.filterwarnings( + "ignore:invalid value encountered in power:RuntimeWarning" +) +@pytest.mark.filterwarnings( + "ignore:divide by zero encountered in power:RuntimeWarning" ) -def 
test_series_reflected_ops_scalar(func, dtype, obj_class): +@pytest.mark.parametrize("obj_class", [cudf.Series, cudf.Index]) +@pytest.mark.parametrize("scalar", [-1, 0, 1]) +def test_series_reflected_ops_scalar( + arithmetic_op, scalar, numeric_types_as_str, obj_class +): # create random series - random_series = utils.gen_rand(dtype, 100, low=10, seed=12) + func = lambda x: arithmetic_op(scalar, x) # noqa: E731 + random_series = utils.gen_rand(numeric_types_as_str, 100, low=10, seed=12) - # gpu series - gs = Series(random_series) - - # class typing - if obj_class == "Index": - gs = Index(gs) + gs = obj_class(random_series) try: gs_result = func(gs) @@ -577,16 +435,14 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class): np.testing.assert_allclose(ps_result, gs_result.to_numpy()) -@pytest.mark.parametrize("binop", _binops) -def test_different_shapes_and_columns(binop): - # TODO: support `pow()` on NaN values. Particularly, the cases: - # `pow(1, NaN) == 1` and `pow(NaN, 0) == 1` - if binop is operator.pow: - return +def test_different_shapes_and_columns(request, arithmetic_op): + if arithmetic_op is operator.pow: + msg = "TODO: Support `pow(1, NaN) == 1` and `pow(NaN, 0) == 1`" + request.applymarker(pytest.mark.xfail(reason=msg)) # Empty frame on the right side - pd_frame = binop(pd.DataFrame({"x": [1, 2]}), pd.DataFrame({})) - cd_frame = binop(cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({})) + pd_frame = arithmetic_op(pd.DataFrame({"x": [1, 2]}), pd.DataFrame({})) + cd_frame = arithmetic_op(cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({})) assert_eq(cd_frame, pd_frame) # Empty frame on the left side @@ -610,17 +466,11 @@ def test_different_shapes_and_columns(binop): assert_eq(cd_frame, pd_frame) -@pytest.mark.parametrize("binop", _binops) -def test_different_shapes_and_same_columns(binop): - # TODO: support `pow()` on NaN values. Particularly, the cases: - # `pow(1, NaN) == 1` and `pow(NaN, 0) == 1` - if binop is operator.pow: - return - - pd_frame = binop( +def test_different_shapes_and_same_columns(arithmetic_op): + pd_frame = arithmetic_op( pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [1, 2, 3]}) ) - cd_frame = binop( + cd_frame = arithmetic_op( cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({"x": [1, 2, 3]}) ) # cast x as float64 so it matches pandas dtype @@ -628,12 +478,12 @@ def test_different_shapes_and_same_columns(binop): assert_eq(cd_frame, pd_frame) -@pytest.mark.parametrize("binop", _binops) -def test_different_shapes_and_columns_with_unaligned_indices(binop): - # TODO: support `pow()` on NaN values. 
Particularly, the cases: - # `pow(1, NaN) == 1` and `pow(NaN, 0) == 1` - if binop is operator.pow: - return +def test_different_shapes_and_columns_with_unaligned_indices( + request, arithmetic_op +): + if arithmetic_op is operator.pow: + msg = "TODO: Support `pow(1, NaN) == 1` and `pow(NaN, 0) == 1`" + request.applymarker(pytest.mark.xfail(reason=msg)) # Test with a RangeIndex pdf1 = pd.DataFrame({"x": [4, 3, 2, 1], "y": [7, 3, 8, 6]}) @@ -650,8 +500,8 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): gdf2 = cudf.DataFrame.from_pandas(pdf2) gdf3 = cudf.DataFrame.from_pandas(pdf3) - pd_frame = binop(binop(pdf1, pdf2), pdf3) - cd_frame = binop(binop(gdf1, gdf2), gdf3) + pd_frame = arithmetic_op(arithmetic_op(pdf1, pdf2), pdf3) + cd_frame = arithmetic_op(arithmetic_op(gdf1, gdf2), gdf3) # cast x and y as float64 so it matches pandas dtype cd_frame["x"] = cd_frame["x"].astype(np.float64) cd_frame["y"] = cd_frame["y"].astype(np.float64) @@ -665,8 +515,8 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): pdf2 = pd.DataFrame({"x": [2]}, index=["a"]) gdf1 = cudf.DataFrame.from_pandas(pdf1) gdf2 = cudf.DataFrame.from_pandas(pdf2) - pd_frame = binop(pdf1, pdf2) - cd_frame = binop(gdf1, gdf2) + pd_frame = arithmetic_op(pdf1, pdf2) + cd_frame = arithmetic_op(gdf1, gdf2) # Sort both frames consistently for comparison pd_sorted = pd_frame.sort_index().sort_values(list(pd_frame.columns)) @@ -675,78 +525,76 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop): @pytest.mark.parametrize( - "df2", + "pdf2", [ - cudf.DataFrame({"a": [3, 2, 1]}, index=[3, 2, 1]), - cudf.DataFrame([3, 2]), + pd.DataFrame({"a": [3, 2, 1]}, index=[3, 2, 1]), + pd.DataFrame([3, 2]), ], ) -@pytest.mark.parametrize("binop", [operator.eq, operator.ne]) -def test_df_different_index_shape(df2, binop): +def test_df_different_index_shape(pdf2, comparison_op): df1 = cudf.DataFrame([1, 2, 3], index=[1, 2, 3]) pdf1 = df1.to_pandas() - pdf2 = df2.to_pandas() + df2 = cudf.DataFrame.from_pandas(pdf2) utils.assert_exceptions_equal( - lfunc=binop, - rfunc=binop, + lfunc=comparison_op, + rfunc=comparison_op, lfunc_args_and_kwargs=([pdf1, pdf2],), rfunc_args_and_kwargs=([df1, df2],), ) -@pytest.mark.parametrize("op", [operator.eq, operator.ne]) -def test_boolean_scalar_binop(op): +def test_boolean_scalar_binop(comparison_op): rng = np.random.default_rng(seed=0) psr = pd.Series(rng.choice([True, False], 10)) gsr = cudf.from_pandas(psr) - assert_eq(op(psr, True), op(gsr, True)) - assert_eq(op(psr, False), op(gsr, False)) + assert_eq(comparison_op(psr, True), comparison_op(gsr, True)) + assert_eq(comparison_op(psr, False), comparison_op(gsr, False)) -@pytest.mark.parametrize("func", _operators_arithmetic) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("fill_value", [None, 27]) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -def test_operator_func_between_series(dtype, func, has_nulls, fill_value): +def test_operator_func_between_series( + float_types_as_str, arithmetic_op_method, has_nulls, fill_value +): count = 1000 gdf_series_a = utils.gen_rand_series( - dtype, count, has_nulls=has_nulls, stride=10000 + float_types_as_str, count, has_nulls=has_nulls, stride=10000 ) gdf_series_b = utils.gen_rand_series( - dtype, count, has_nulls=has_nulls, stride=100 + float_types_as_str, count, has_nulls=has_nulls, stride=100 ) pdf_series_a = gdf_series_a.to_pandas() pdf_series_b = gdf_series_b.to_pandas() - gdf_result = getattr(gdf_series_a, func)( + gdf_result 
= getattr(gdf_series_a, arithmetic_op_method)( gdf_series_b, fill_value=fill_value ) - pdf_result = getattr(pdf_series_a, func)( + pdf_result = getattr(pdf_series_a, arithmetic_op_method)( pdf_series_b, fill_value=fill_value ) assert_eq(pdf_result, gdf_result) -@pytest.mark.parametrize("func", _operators_arithmetic) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("fill_value", [None, 27]) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -def test_operator_func_series_and_scalar(dtype, func, has_nulls, fill_value): +def test_operator_func_series_and_scalar( + float_types_as_str, arithmetic_op_method, has_nulls, fill_value +): count = 1000 scalar = 59 gdf_series = utils.gen_rand_series( - dtype, count, has_nulls=has_nulls, stride=10000 + float_types_as_str, count, has_nulls=has_nulls, stride=10000 ) pdf_series = gdf_series.to_pandas() - gdf_series_result = getattr(gdf_series, func)( + gdf_series_result = getattr(gdf_series, arithmetic_op_method)( scalar, fill_value=fill_value, ) - pdf_series_result = getattr(pdf_series, func)( + pdf_series_result = getattr(pdf_series, arithmetic_op_method)( scalar, fill_value=fill_value, ) @@ -754,27 +602,26 @@ def test_operator_func_series_and_scalar(dtype, func, has_nulls, fill_value): assert_eq(pdf_series_result, gdf_series_result) -_permu_values = [0, 1, None, np.nan] - - -@pytest.mark.parametrize("fill_value", _permu_values) -@pytest.mark.parametrize("scalar_a", _permu_values) -@pytest.mark.parametrize("scalar_b", _permu_values) -@pytest.mark.parametrize("func", _operators_comparison) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) +@pytest.mark.parametrize("fill_value", [0, 1, None, np.nan]) +@pytest.mark.parametrize("scalar_a", [0, 1, None, np.nan]) +@pytest.mark.parametrize("scalar_b", [0, 1, None, np.nan]) def test_operator_func_between_series_logical( - dtype, func, scalar_a, scalar_b, fill_value + float_types_as_str, comparison_op_method, scalar_a, scalar_b, fill_value ): - gdf_series_a = Series([scalar_a], nan_as_null=False).astype(dtype) - gdf_series_b = Series([scalar_b], nan_as_null=False).astype(dtype) + gdf_series_a = Series([scalar_a], nan_as_null=False).astype( + float_types_as_str + ) + gdf_series_b = Series([scalar_b], nan_as_null=False).astype( + float_types_as_str + ) pdf_series_a = gdf_series_a.to_pandas(nullable=True) pdf_series_b = gdf_series_b.to_pandas(nullable=True) - gdf_series_result = getattr(gdf_series_a, func)( + gdf_series_result = getattr(gdf_series_a, comparison_op_method)( gdf_series_b, fill_value=fill_value ) - pdf_series_result = getattr(pdf_series_a, func)( + pdf_series_result = getattr(pdf_series_a, comparison_op_method)( pdf_series_b, fill_value=fill_value ) expect = pdf_series_result @@ -795,33 +642,43 @@ def test_operator_func_between_series_logical( assert_eq(expect, got) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -@pytest.mark.parametrize("func", _operators_comparison) @pytest.mark.parametrize("has_nulls", [True, False]) @pytest.mark.parametrize("scalar", [-59.0, np.nan, 0, 59.0]) @pytest.mark.parametrize("fill_value", [None, 1.0]) def test_operator_func_series_and_scalar_logical( - request, dtype, func, has_nulls, scalar, fill_value + request, + float_types_as_str, + comparison_op_method, + has_nulls, + scalar, + fill_value, ): request.applymarker( pytest.mark.xfail( PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION and fill_value == 1.0 and scalar is np.nan - and (has_nulls or (not has_nulls and func not in {"eq", "ne"})), + and ( + has_nulls 
+ or (not has_nulls and comparison_op_method not in {"eq", "ne"}) + ), reason="https://github.com/pandas-dev/pandas/issues/57447", ) ) if has_nulls: - gdf_series = cudf.Series([-1.0, 0, cudf.NA, 1.1], dtype=dtype) + gdf_series = cudf.Series( + [-1.0, 0, cudf.NA, 1.1], dtype=float_types_as_str + ) else: - gdf_series = cudf.Series([-1.0, 0, 10.5, 1.1], dtype=dtype) + gdf_series = cudf.Series( + [-1.0, 0, 10.5, 1.1], dtype=float_types_as_str + ) pdf_series = gdf_series.to_pandas(nullable=True) - gdf_series_result = getattr(gdf_series, func)( + gdf_series_result = getattr(gdf_series, comparison_op_method)( scalar, fill_value=fill_value, ) - pdf_series_result = getattr(pdf_series, func)( + pdf_series_result = getattr(pdf_series, comparison_op_method)( scalar, fill_value=fill_value ) @@ -831,11 +688,12 @@ def test_operator_func_series_and_scalar_logical( assert_eq(expect, got) -@pytest.mark.parametrize("func", _operators_arithmetic) @pytest.mark.parametrize("nulls", _nulls) @pytest.mark.parametrize("fill_value", [None, 27]) @pytest.mark.parametrize("other", ["df", "scalar"]) -def test_operator_func_dataframe(func, nulls, fill_value, other): +def test_operator_func_dataframe( + arithmetic_op_method, nulls, fill_value, other +): num_rows = 100 num_cols = 3 @@ -862,16 +720,17 @@ def gen_df(): gdf1 = cudf.DataFrame.from_pandas(pdf1) gdf2 = cudf.DataFrame.from_pandas(pdf2) if other == "df" else 59.0 - got = getattr(gdf1, func)(gdf2, fill_value=fill_value) - expect = getattr(pdf1, func)(pdf2, fill_value=fill_value)[list(got._data)] + got = getattr(gdf1, arithmetic_op_method)(gdf2, fill_value=fill_value) + expect = getattr(pdf1, arithmetic_op_method)(pdf2, fill_value=fill_value)[ + list(got._data) + ] assert_eq(expect, got) -@pytest.mark.parametrize("func", _operators_comparison) @pytest.mark.parametrize("nulls", _nulls) @pytest.mark.parametrize("other", ["df", "scalar"]) -def test_logical_operator_func_dataframe(func, nulls, other): +def test_logical_operator_func_dataframe(comparison_op_method, nulls, other): num_rows = 100 num_cols = 3 @@ -902,37 +761,26 @@ def gen_df(): else 59.0 ) - got = getattr(gdf1, func)(gdf2) - expect = getattr(pdf1, func)(pdf2)[list(got._data)] + got = getattr(gdf1, comparison_op_method)(gdf2) + expect = getattr(pdf1, comparison_op_method)(pdf2)[list(got._data)] assert_eq(expect, got) -@pytest.mark.parametrize( - "func", - [op for op in _operators_arithmetic if op not in {"rmod", "rfloordiv"}] - + _operators_comparison - + [ - pytest.param( - "rmod", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/12162" - ), - ), - pytest.param( - "rfloordiv", - marks=pytest.mark.xfail( +@pytest.mark.parametrize("rhs", [0, 1, 10]) +def test_binop_bool_uint(request, binary_op_method, rhs): + if binary_op_method in {"rmod", "rfloordiv"}: + request.applymarker( + pytest.mark.xfail( reason="https://github.com/rapidsai/cudf/issues/12162" ), - ), - ], -) -@pytest.mark.parametrize("rhs", [0, 1, 2, 128]) -def test_binop_bool_uint(func, rhs): + ) psr = pd.Series([True, False, False]) gsr = cudf.from_pandas(psr) assert_eq( - getattr(psr, func)(rhs), getattr(gsr, func)(rhs), check_dtype=False + getattr(psr, binary_op_method)(rhs), + getattr(gsr, binary_op_method)(rhs), + check_dtype=False, ) @@ -981,30 +829,19 @@ def test_floordiv_zero_bool(scalar_divisor): cr // cudf_div -@pytest.mark.parametrize( - "dtype", - ( - pytest.param( - np.bool_, - marks=pytest_xfail( - reason=( - "Pandas handling of division by zero-bool is too strange" - ) - ), - ), - np.int8, - np.uint8, - 
np.int64, - np.uint64, - np.float32, - np.float64, - ), -) -def test_rmod_zero_nan(dtype): - sr = pd.Series([1, 1, 0], dtype=dtype) +def test_rmod_zero_nan(numeric_and_bool_types_as_str, request): + request.applymarker( + pytest.mark.xfail( + numeric_and_bool_types_as_str == "bool", + reason="pandas returns int8, cuDF returns int64", + ) + ) + sr = pd.Series([1, 1, 0], dtype=numeric_and_bool_types_as_str) cr = cudf.from_pandas(sr) assert_eq(1 % sr, 1 % cr) - expected_dtype = np.float64 if cr.dtype.kind != "f" else dtype + expected_dtype = ( + np.float64 if cr.dtype.kind != "f" else numeric_and_bool_types_as_str + ) assert_eq(1 % cr, cudf.Series([0, 0, None], dtype=expected_dtype)) @@ -1048,7 +885,7 @@ def is_timezone_aware_dtype(dtype: str) -> bool: return bool(re.match(r"^datetime64\[ns, .+\]$", dtype)) -@pytest.mark.parametrize("n_periods", [0, 1, -1, 12, -12]) +@pytest.mark.parametrize("n_periods", [0, 1, -12]) @pytest.mark.parametrize( "frequency", [ @@ -1145,16 +982,6 @@ def test_datetime_dateoffset_binaryop( assert_eq(expect, got) -@pytest.mark.parametrize( - "date_col", - [ - [ - "2000-01-01 00:00:00.012345678", - "2000-01-31 00:00:00.012345678", - "2000-02-29 00:00:00.012345678", - ] - ], -) @pytest.mark.parametrize( "kwargs", [ @@ -1177,8 +1004,15 @@ def test_datetime_dateoffset_binaryop( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="Fails in older versions of pandas", ) -def test_datetime_dateoffset_binaryop_multiple(request, date_col, kwargs, op): - gsr = cudf.Series(date_col, dtype="datetime64[ns]") +def test_datetime_dateoffset_binaryop_multiple(request, kwargs, op): + gsr = cudf.Series( + [ + "2000-01-01 00:00:00.012345678", + "2000-01-31 00:00:00.012345678", + "2000-02-29 00:00:00.012345678", + ], + dtype="datetime64[ns]", + ) psr = gsr.to_pandas() poffset = pd.DateOffset(**kwargs) @@ -1190,7 +1024,7 @@ def test_datetime_dateoffset_binaryop_multiple(request, date_col, kwargs, op): assert_eq(expect, got) -@pytest.mark.parametrize("n_periods", [0, 1, -1, 12, -12]) +@pytest.mark.parametrize("n_periods", [0, 1, -12]) @pytest.mark.parametrize( "frequency", [ @@ -1290,43 +1124,21 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype): assert_eq(expected, got) -@pytest.mark.parametrize( - "dtype", - [ - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float32", - "float64", - "datetime64[ns]", - "datetime64[us]", - "datetime64[ms]", - "datetime64[s]", - "timedelta64[ns]", - "timedelta64[us]", - "timedelta64[ms]", - "timedelta64[s]", - ], -) -@pytest.mark.parametrize("op", _operators_comparison) -def test_binops_with_NA_consistent(dtype, op): +def test_binops_with_NA_consistent( + numeric_and_temporal_types_as_str, comparison_op_method +): data = [1, 2, 3] - sr = cudf.Series(data, dtype=dtype) + sr = cudf.Series(data, dtype=numeric_and_temporal_types_as_str) - result = getattr(sr, op)(cudf.NA) - if dtype in NUMERIC_TYPES: - if op == "ne": + result = getattr(sr, comparison_op_method)(cudf.NA) + if sr.dtype.kind in "mM": + assert result.null_count == len(data) + else: + if comparison_op_method == "ne": expect_all = True else: expect_all = False assert (result == expect_all).all() - elif dtype in DATETIME_TYPES & TIMEDELTA_TYPES: - assert result._column.null_count == len(data) @pytest.mark.parametrize( @@ -1767,7 +1579,7 @@ def test_binops_reflect_decimal( assert_eq(expect, got) -@pytest.mark.parametrize("powers", [0, 1, 2, 3]) +@pytest.mark.parametrize("powers", [0, 1, 2]) def test_binops_decimal_pow(powers): s = cudf.Series( [ @@ 
-1791,157 +1603,165 @@ def test_binops_raise_error(): @pytest.mark.parametrize( - "args", + "op, ldata, ldtype, rdata, expected1, expected2", [ ( operator.eq, ["100", "41", None], cudf.Decimal64Dtype(scale=0, precision=5), [100, 42, 12], - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), + [True, False, None], + [True, False, None], ), ( operator.eq, ["100.000", "42.001", None], cudf.Decimal64Dtype(scale=3, precision=6), [100, 42, 12], - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), + [True, False, None], + [True, False, None], ), ( operator.eq, ["100", "40", None], cudf.Decimal64Dtype(scale=-1, precision=3), [100, 42, 12], - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), + [True, False, None], + [True, False, None], ), ( operator.ne, ["100", "42", "24", None], cudf.Decimal64Dtype(scale=0, precision=3), [100, 40, 24, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), + [False, True, False, None], + [False, True, False, None], ), ( operator.ne, ["10.1", "88", "11", None], cudf.Decimal64Dtype(scale=1, precision=3), [10, 42, 11, 12], - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), + [True, True, False, None], + [True, True, False, None], ), ( operator.ne, ["100.000", "42", "23.999", None], cudf.Decimal64Dtype(scale=3, precision=6), [100, 42, 24, 12], - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), + [False, False, True, None], + [False, False, True, None], ), ( operator.lt, ["100", "40", "28", None], cudf.Decimal64Dtype(scale=0, precision=3), [100, 42, 24, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), + [False, True, False, None], + [False, False, True, None], ), ( operator.lt, ["100.000", "42.002", "23.999", None], cudf.Decimal64Dtype(scale=3, precision=6), [100, 42, 24, 12], - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), + [False, False, True, None], + [False, True, False, None], ), ( operator.lt, ["100", "40", "10", None], cudf.Decimal64Dtype(scale=-1, precision=3), [100, 42, 8, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), + [False, True, False, None], + [False, False, True, None], ), ( operator.gt, ["100", "42", "20", None], cudf.Decimal64Dtype(scale=0, precision=3), [100, 40, 24, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), + [False, True, False, None], + [False, False, True, None], ), ( operator.gt, ["100.000", "42.002", "23.999", None], cudf.Decimal64Dtype(scale=3, precision=6), [100, 42, 24, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), + [False, True, False, None], + [False, False, True, None], ), ( operator.gt, ["100", "40", "10", None], cudf.Decimal64Dtype(scale=-1, precision=3), [100, 42, 8, 12], - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), + [False, False, True, None], + [False, True, False, None], ), ( operator.le, ["100", "40", "28", None], cudf.Decimal64Dtype(scale=0, precision=3), [100, 42, 24, 12], - cudf.Series([True, True, False, None], dtype=bool), - 
cudf.Series([True, False, True, None], dtype=bool), + [True, True, False, None], + [True, False, True, None], ), ( operator.le, ["100.000", "42.002", "23.999", None], cudf.Decimal64Dtype(scale=3, precision=6), [100, 42, 24, 12], - cudf.Series([True, False, True, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), + [True, False, True, None], + [True, True, False, None], ), ( operator.le, ["100", "40", "10", None], cudf.Decimal64Dtype(scale=-1, precision=3), [100, 42, 8, 12], - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), + [True, True, False, None], + [True, False, True, None], ), ( operator.ge, ["100", "42", "20", None], cudf.Decimal64Dtype(scale=0, precision=3), [100, 40, 24, 12], - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), + [True, True, False, None], + [True, False, True, None], ), ( operator.ge, ["100.000", "42.002", "23.999", None], cudf.Decimal64Dtype(scale=3, precision=6), [100, 42, 24, 12], - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), + [True, True, False, None], + [True, False, True, None], ), ( operator.ge, ["100", "40", "10", None], cudf.Decimal64Dtype(scale=-1, precision=3), [100, 42, 8, 12], - cudf.Series([True, False, True, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), + [True, False, True, None], + [True, True, False, None], ), ], ) -@pytest.mark.parametrize("integer_dtype", utils.INTEGER_TYPES) @pytest.mark.parametrize("reflected", [True, False]) -def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): +def test_binops_decimal_comp_mixed_integer( + op, + ldata, + ldtype, + rdata, + expected1, + expected2, + integer_types_as_str, + reflected, +): """ Tested compare operations: eq, lt, gt, le, ge @@ -1951,12 +1771,12 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): of the following compare results: {True, False, None}. 
""" if not reflected: - op, ldata, ldtype, rdata, expected, _ = args + expected = cudf.Series(expected1, dtype=bool) else: - op, ldata, ldtype, rdata, _, expected = args + expected = cudf.Series(expected2, dtype=bool) lhs = utils._decimal_series(ldata, ldtype) - rhs = cudf.Series(rdata, dtype=integer_dtype) + rhs = cudf.Series(rdata, dtype=integer_types_as_str) if reflected: rhs, lhs = lhs, rhs @@ -1967,7 +1787,7 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): @pytest.mark.parametrize( - "args", + "op, lhs, l_dtype, rhs, expect, expect_dtype, reflect", [ ( operator.add, @@ -2196,17 +2016,17 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): ), ], ) -def test_binops_decimal_scalar(args): - op, lhs, l_dtype, rhs, expect, expect_dtype, reflect = args - - def decimal_series(input, dtype): - return cudf.Series( - [x if x is None else decimal.Decimal(x) for x in input], - dtype=dtype, - ) - - lhs = decimal_series(lhs, l_dtype) - expect = decimal_series(expect, expect_dtype) +def test_binops_decimal_scalar( + op, lhs, l_dtype, rhs, expect, expect_dtype, reflect +): + lhs = cudf.Series( + [x if x is None else decimal.Decimal(x) for x in lhs], + dtype=l_dtype, + ) + expect = cudf.Series( + [x if x is None else decimal.Decimal(x) for x in expect], + dtype=expect_dtype, + ) if reflect: lhs, rhs = rhs, lhs @@ -2217,108 +2037,110 @@ def decimal_series(input, dtype): @pytest.mark.parametrize( - "args", + "op, ldata, ldtype, rdata, expected1, expected2", [ ( operator.eq, ["100.00", "41", None], cudf.Decimal64Dtype(scale=0, precision=5), 100, - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), + [True, False, None], + [True, False, None], ), ( operator.eq, ["100.123", "41", None], cudf.Decimal64Dtype(scale=3, precision=6), decimal.Decimal("100.123"), - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), + [True, False, None], + [True, False, None], ), ( operator.ne, ["100.00", "41", None], cudf.Decimal64Dtype(scale=2, precision=5), 100, - cudf.Series([False, True, None], dtype=bool), - cudf.Series([False, True, None], dtype=bool), + [False, True, None], + [False, True, None], ), ( operator.ne, ["100.123", "120.21", None], cudf.Decimal64Dtype(scale=3, precision=6), decimal.Decimal("100.123"), - cudf.Series([False, True, None], dtype=bool), - cudf.Series([False, True, None], dtype=bool), + [False, True, None], + [False, True, None], ), ( operator.gt, ["100.00", "41", "120.21", None], cudf.Decimal64Dtype(scale=2, precision=5), 100, - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), + [False, False, True, None], + [False, True, False, None], ), ( operator.gt, ["100.123", "41", "120.21", None], cudf.Decimal64Dtype(scale=3, precision=6), decimal.Decimal("100.123"), - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), + [False, False, True, None], + [False, True, False, None], ), ( operator.ge, ["100.00", "41", "120.21", None], cudf.Decimal64Dtype(scale=2, precision=5), 100, - cudf.Series([True, False, True, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), + [True, False, True, None], + [True, True, False, None], ), ( operator.ge, ["100.123", "41", "120.21", None], cudf.Decimal64Dtype(scale=3, precision=6), decimal.Decimal("100.123"), - cudf.Series([True, False, True, None], dtype=bool), - cudf.Series([True, True, False, None], 
dtype=bool),
+            [True, False, True, None],
+            [True, True, False, None],
         ),
         (
             operator.lt,
             ["100.00", "41", "120.21", None],
             cudf.Decimal64Dtype(scale=2, precision=5),
             100,
-            cudf.Series([False, True, False, None], dtype=bool),
-            cudf.Series([False, False, True, None], dtype=bool),
+            [False, True, False, None],
+            [False, False, True, None],
         ),
         (
             operator.lt,
             ["100.123", "41", "120.21", None],
             cudf.Decimal64Dtype(scale=3, precision=6),
             decimal.Decimal("100.123"),
-            cudf.Series([False, True, False, None], dtype=bool),
-            cudf.Series([False, False, True, None], dtype=bool),
+            [False, True, False, None],
+            [False, False, True, None],
         ),
         (
             operator.le,
             ["100.00", "41", "120.21", None],
             cudf.Decimal64Dtype(scale=2, precision=5),
             100,
-            cudf.Series([True, True, False, None], dtype=bool),
-            cudf.Series([True, False, True, None], dtype=bool),
+            [True, True, False, None],
+            [True, False, True, None],
         ),
         (
             operator.le,
             ["100.123", "41", "120.21", None],
             cudf.Decimal64Dtype(scale=3, precision=6),
             decimal.Decimal("100.123"),
-            cudf.Series([True, True, False, None], dtype=bool),
-            cudf.Series([True, False, True, None], dtype=bool),
+            [True, True, False, None],
+            [True, False, True, None],
         ),
     ],
 )
 @pytest.mark.parametrize("reflected", [True, False])
-def test_binops_decimal_scalar_compare(args, reflected):
+def test_binops_decimal_scalar_compare(
+    op, ldata, ldtype, rdata, expected1, expected2, reflected
+):
     """
     Tested compare operations:
         eq, lt, gt, le, ge
@@ -2327,9 +2149,9 @@ def test_binops_decimal_scalar_compare(args, reflected):
     following compare results: {True, False, None}.
     """
     if not reflected:
-        op, ldata, ldtype, rdata, expected, _ = args
+        expected = cudf.Series(expected1, dtype=bool)
     else:
-        op, ldata, ldtype, rdata, _, expected = args
+        expected = cudf.Series(expected2, dtype=bool)
 
     lhs = utils._decimal_series(ldata, ldtype)
     rhs = rdata
@@ -2342,53 +2164,35 @@ def test_binops_decimal_scalar_compare(args, reflected):
     assert_eq(expected, actual)
 
 
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "uint8",
-        "uint16",
-        "uint32",
-        "uint64",
-        "int8",
-        "int16",
-        "int32",
-        "int64",
-        "float32",
-        "float64",
-        "str",
-        "datetime64[ns]",
-        "datetime64[us]",
-        "datetime64[ms]",
-        "datetime64[s]",
-        "timedelta64[ns]",
-        "timedelta64[us]",
-        "timedelta64[ms]",
-        "timedelta64[s]",
-    ],
-)
 @pytest.mark.parametrize("null_scalar", [None, cudf.NA, np.datetime64("NaT")])
-@pytest.mark.parametrize("cmpop", _cmpops)
-def test_column_null_scalar_comparison(dtype, null_scalar, cmpop):
+def test_column_null_scalar_comparison(
+    request, all_supported_types_as_str, null_scalar, comparison_op
+):
     # This test is meant to validate that comparing
     # a series of any dtype with a null scalar produces
     # a new series where all the elements are <NA>.
+    request.applymarker(
+        pytest.mark.xfail(
+            all_supported_types_as_str == "category",
+            raises=ValueError,
+            reason="Value ...
not found in column", + ) + ) + dtype = cudf.dtype(all_supported_types_as_str) if isinstance(null_scalar, np.datetime64): - if cudf.dtype(dtype).kind not in "mM": - pytest.skip() + if dtype.kind not in "mM": + pytest.skip(f"{null_scalar} not applicable for {dtype}") null_scalar = null_scalar.astype(dtype) - dtype = cudf.dtype(dtype) - data = [1, 2, 3, 4, 5] sr = cudf.Series(data, dtype=dtype) - result = cmpop(sr, null_scalar) + result = comparison_op(sr, null_scalar) assert result.isnull().all() -@pytest.mark.parametrize("fn", ["eq", "ne", "lt", "gt", "le", "ge"]) -def test_equality_ops_index_mismatch(fn): +def test_equality_ops_index_mismatch(comparison_op_method): a = cudf.Series( [1, 2, 3, None, None, 4], index=["a", "b", "c", "d", "e", "f"] ) @@ -2399,13 +2203,26 @@ def test_equality_ops_index_mismatch(fn): pa = a.to_pandas(nullable=True) pb = b.to_pandas(nullable=True) - expected = getattr(pa, fn)(pb) - actual = getattr(a, fn)(b).to_pandas(nullable=True) + expected = getattr(pa, comparison_op_method)(pb) + actual = getattr(a, comparison_op_method)(b).to_pandas(nullable=True) assert_eq(expected, actual) -def generate_test_null_equals_columnops_data(): +@pytest.mark.parametrize( + "dtype", + sorted( + itertools.chain( + NUMERIC_TYPES, + DATETIME_TYPES, + TIMEDELTA_TYPES, + STRING_TYPES, + ["category"], + ) + ), +) +@pytest.mark.parametrize("null_case", ["neither", "left", "right", "both"]) +def test_null_equals_columnops(dtype, null_case): # Generate tuples of: # (left_data, right_data, compare_bool # where compare_bool is the correct answer to @@ -2425,36 +2242,16 @@ def set_null_cases(column_l, column_r, case): raise ValueError("Unknown null case") return column_l, column_r - null_cases = ["neither", "left", "right", "both"] data = [1, 2, 3] - results = [] - # TODO: Numeric types can be cross compared as null equal - for dtype in ( - list(NUMERIC_TYPES) - + list(DATETIME_TYPES) - + list(TIMEDELTA_TYPES) - + list(STRING_TYPES) - + ["category"] - ): - for case in null_cases: - left = cudf.Series(data, dtype=dtype) - right = cudf.Series(data, dtype=dtype) - if case in {"left", "right"}: - answer = False - else: - answer = True - left, right = set_null_cases(left, right, case) - results.append((left._column, right._column, answer, case)) - - return results - - -@pytest.mark.parametrize( - "lcol,rcol,ans,case", generate_test_null_equals_columnops_data() -) -def test_null_equals_columnops(lcol, rcol, ans, case): - assert lcol.equals(rcol) == ans + left = cudf.Series(data, dtype=dtype) + right = cudf.Series(data, dtype=dtype) + if null_case in {"left", "right"}: + answer = False + else: + answer = True + left, right = set_null_cases(left, right, null_case) + assert left._column.equals(right._column) is answer def test_add_series_to_dataframe(): @@ -2470,27 +2267,25 @@ def test_add_series_to_dataframe(): @pytest.mark.parametrize("obj_class", [cudf.Series, cudf.Index]) -@pytest.mark.parametrize("binop", _binops) -def test_binops_cupy_array(obj_class, binop): +def test_binops_cupy_array(obj_class, arithmetic_op): # Skip 0 to not deal with NaNs from division. 
data = range(1, 100) lhs = obj_class(data) rhs = cp.array(data) - assert (binop(lhs, rhs) == binop(lhs, lhs)).all() + assert (arithmetic_op(lhs, rhs) == arithmetic_op(lhs, lhs)).all() -@pytest.mark.parametrize("binop", _binops + _binops_compare) -@pytest.mark.parametrize("data", [None, [-9, 7], [5, -2], [12, 18]]) +@pytest.mark.parametrize("data", [None, [-9, 7], [12, 18]]) @pytest.mark.parametrize("scalar", [1, 3, 12, np.nan]) -def test_empty_column(binop, data, scalar): +def test_empty_column(binary_op, data, scalar): gdf = cudf.DataFrame(columns=["a", "b"]) if data is not None: gdf["a"] = data pdf = gdf.to_pandas() - got = binop(gdf, scalar) - expected = binop(pdf, scalar) + got = binary_op(gdf, scalar) + expected = binary_op(pdf, scalar) assert_eq(expected, got) @@ -2498,16 +2293,16 @@ def test_empty_column(binop, data, scalar): @pytest.mark.parametrize( "df", [ - cudf.DataFrame( + lambda: cudf.DataFrame( [[1, 2, 3, 4], [5, 6, 7, 8], [10, 11, 12, 13], [14, 15, 16, 17]] ), pytest.param( - cudf.DataFrame([[1, None, None, 4], [5, 6, 7, None]]), + lambda: cudf.DataFrame([[1, None, None, 4], [5, 6, 7, None]]), marks=pytest_xfail( reason="Cannot access Frame.values if frame contains nulls" ), ), - cudf.DataFrame( + lambda: cudf.DataFrame( [ [1.2, 2.3, 3.4, 4.5], [5.6, 6.7, 7.8, 8.9], @@ -2515,24 +2310,26 @@ def test_empty_column(binop, data, scalar): [9.1, 2.4, 4.5, 65.34], ] ), - cudf.Series([14, 15, 16, 17]), - cudf.Series([14.15, 15.16, 16.17, 17.18]), + lambda: cudf.Series([14, 15, 16, 17]), + lambda: cudf.Series([14.15, 15.16, 16.17, 17.18]), ], ) @pytest.mark.parametrize( "other", [ - cudf.DataFrame([[9, 10], [11, 12], [13, 14], [15, 16]]), - cudf.DataFrame( + lambda: cudf.DataFrame([[9, 10], [11, 12], [13, 14], [15, 16]]), + lambda: cudf.DataFrame( [[9.4, 10.5], [11.6, 12.7], [13.8, 14.9], [15.1, 16.2]] ), - cudf.Series([5, 6, 7, 8]), - cudf.Series([5.6, 6.7, 7.8, 8.9]), - np.array([5, 6, 7, 8]), - [25.5, 26.6, 27.7, 28.8], + lambda: cudf.Series([5, 6, 7, 8]), + lambda: cudf.Series([5.6, 6.7, 7.8, 8.9]), + lambda: np.array([5, 6, 7, 8]), + lambda: [25.5, 26.6, 27.7, 28.8], ], ) def test_binops_dot(df, other): + df = df() + other = other() pdf = df.to_pandas() host_other = other.to_pandas() if hasattr(other, "to_pandas") else other @@ -2592,13 +2389,12 @@ def test_binop_integer_power_int_series(): assert_eq(expected, got) -@pytest.mark.parametrize("op", _binops) -def test_binop_index_series(op): +def test_binop_index_series(arithmetic_op): gi = cudf.Index([10, 11, 12]) gs = cudf.Series([1, 2, 3]) - actual = op(gi, gs) - expected = op(gi.to_pandas(), gs.to_pandas()) + actual = arithmetic_op(gi, gs) + expected = arithmetic_op(gi.to_pandas(), gs.to_pandas()) assert_eq(expected, actual) @@ -2649,7 +2445,6 @@ def test_binop_lhs_numpy_datetimelike_scalar(scalar): assert_eq(result, expected) -@pytest.mark.parametrize("comp_op", _cmpops) @pytest.mark.parametrize("ordered", [True, False]) @pytest.mark.parametrize( "data_left, data_right", @@ -2658,7 +2453,9 @@ def test_binop_lhs_numpy_datetimelike_scalar(scalar): [[1, 2], [1, 3]], ], ) -def test_cat_non_cat_compare_ops(comp_op, data_left, data_right, ordered): +def test_cat_non_cat_compare_ops( + comparison_op, data_left, data_right, ordered +): pd_non_cat = pd.Series(data_left) pd_cat = pd.Series( data_right, @@ -2669,15 +2466,20 @@ def test_cat_non_cat_compare_ops(comp_op, data_left, data_right, ordered): cudf_cat = cudf.Series.from_pandas(pd_cat) if ( - not ordered and comp_op not in {operator.eq, operator.ne} - ) or comp_op in {operator.gt, 
operator.lt, operator.le, operator.ge}: + not ordered and comparison_op not in {operator.eq, operator.ne} + ) or comparison_op in { + operator.gt, + operator.lt, + operator.le, + operator.ge, + }: with pytest.raises(TypeError): - comp_op(pd_non_cat, pd_cat) + comparison_op(pd_non_cat, pd_cat) with pytest.raises(TypeError): - comp_op(cudf_non_cat, cudf_cat) + comparison_op(cudf_non_cat, cudf_cat) else: - expected = comp_op(pd_non_cat, pd_cat) - result = comp_op(cudf_non_cat, cudf_cat) + expected = comparison_op(pd_non_cat, pd_cat) + result = comparison_op(cudf_non_cat, cudf_cat) assert_eq(result, expected) @@ -2704,10 +2506,9 @@ def test_eq_ne_non_comparable_types( assert_eq(result, expected) -@pytest.mark.parametrize("op", _binops_compare) -def test_binops_compare_stdlib_date_scalar(op): +def test_binops_compare_stdlib_date_scalar(comparison_op): dt = datetime.date(2020, 1, 1) data = [dt] - result = op(cudf.Series(data), dt) - expected = op(pd.Series(data), dt) + result = comparison_op(cudf.Series(data), dt) + expected = comparison_op(pd.Series(data), dt) assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index 03637e05eae..e36523f10b4 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import cupy as cp import pytest @@ -7,22 +7,30 @@ pytestmark = pytest.mark.spilling -arr_len = 10 + +@pytest.fixture +def arr_len(): + return 10 @pytest.mark.parametrize( - "data", + "data, expect_success", [ - (cp.zeros(arr_len), True), - (cp.zeros((1, arr_len)), True), - (cp.zeros((1, arr_len, 1)), True), - (cp.zeros((arr_len, arr_len)), True), - (cp.zeros((arr_len, arr_len)).reshape(arr_len * arr_len), True), - (cp.zeros((arr_len, arr_len))[:, 0], False), + (lambda arr_len: cp.zeros(arr_len), True), + (lambda arr_len: cp.zeros((1, arr_len)), True), + (lambda arr_len: cp.zeros((1, arr_len, 1)), True), + (lambda arr_len: cp.zeros((arr_len, arr_len)), True), + ( + lambda arr_len: cp.zeros((arr_len, arr_len)).reshape( + arr_len * arr_len + ), + True, + ), + (lambda arr_len: cp.zeros((arr_len, arr_len))[:, 0], False), ], ) -def test_buffer_from_cuda_iface_contiguous(data): - data, expect_success = data +def test_buffer_from_cuda_iface_contiguous(data, expect_success, arr_len): + data = data(arr_len) if expect_success: as_buffer(data.view("|u1")) else: @@ -33,14 +41,15 @@ def test_buffer_from_cuda_iface_contiguous(data): @pytest.mark.parametrize( "data", [ - cp.arange(arr_len), - cp.arange(arr_len).reshape(1, arr_len), - cp.arange(arr_len).reshape(1, arr_len, 1), - cp.arange(arr_len**2).reshape(arr_len, arr_len), + lambda arr_len: cp.arange(arr_len), + lambda arr_len: cp.arange(arr_len).reshape(1, arr_len), + lambda arr_len: cp.arange(arr_len).reshape(1, arr_len, 1), + lambda arr_len: cp.arange(arr_len**2).reshape(arr_len, arr_len), ], ) @pytest.mark.parametrize("dtype", ["uint8", "int8", "float32", "int32"]) -def test_buffer_from_cuda_iface_dtype(data, dtype): +def test_buffer_from_cuda_iface_dtype(data, dtype, arr_len): + data = data(arr_len) data = data.astype(dtype) buf = as_buffer(data) got = cp.array(buf).reshape(-1).view("uint8") @@ -48,7 +57,7 @@ def test_buffer_from_cuda_iface_dtype(data, dtype): assert (expect == got).all() -def test_buffer_creation_from_any(): +def test_buffer_creation_from_any(arr_len): ary = cp.arange(arr_len) b = as_buffer(ary, exposed=True) assert 
isinstance(b, Buffer) @@ -89,12 +98,12 @@ def test_buffer_repr(size, expect): slice(0, 0), slice(0, 1), slice(-2, -1), - slice(0, arr_len), + slice(0, 10), slice(2, 3), slice(2, -1), ], ) -def test_buffer_slice(idx): +def test_buffer_slice(idx, arr_len): ary = cp.arange(arr_len, dtype="uint8") buf = as_buffer(ary) expect = ary[idx] @@ -112,7 +121,7 @@ def test_buffer_slice(idx): (slice(3, 2, -1), ValueError, "slice must be C-contiguous"), ], ) -def test_buffer_slice_fail(idx, err_type, err_msg): +def test_buffer_slice_fail(idx, err_type, err_msg, arr_len): ary = cp.arange(arr_len, dtype="uint8") buf = as_buffer(ary) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 75cd40aa436..c5e2f05fcd9 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -241,9 +241,8 @@ def test_cat_series_binop_error(): ) -@pytest.mark.parametrize("num_elements", [10, 100, 1000]) -def test_categorical_unique(num_elements): - # create categorical series +def test_categorical_unique(): + num_elements = 20 rng = np.random.default_rng(seed=12) pd_cat = pd.Categorical( pd.Series( @@ -268,9 +267,8 @@ def test_categorical_unique(num_elements): np.testing.assert_array_equal(pdf_unique_sorted, gdf_unique_sorted) -@pytest.mark.parametrize("nelem", [20, 50, 100]) -def test_categorical_unique_count(nelem): - # create categorical series +def test_categorical_unique_count(): + nelem = 20 rng = np.random.default_rng(seed=0) pd_cat = pd.Categorical( pd.Series( @@ -334,8 +332,8 @@ def test_categorical_set_categories_preserves_order(): def test_categorical_as_ordered(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(False)) - cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(False)) + pd_sr = pd.Series(pd_str_cat.set_ordered(False)) + cd_sr = cudf.Series(pd_str_cat.set_ordered(False)) assert cd_sr.cat.ordered is False assert cd_sr.cat.ordered == pd_sr.cat.ordered @@ -349,8 +347,8 @@ def test_categorical_as_ordered(pd_str_cat): def test_categorical_as_unordered(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(True)) - cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(True)) + pd_sr = pd.Series(pd_str_cat.set_ordered(True)) + cd_sr = cudf.Series(pd_str_cat.set_ordered(True)) assert cd_sr.cat.ordered is True assert cd_sr.cat.ordered == pd_sr.cat.ordered @@ -366,19 +364,15 @@ def test_categorical_as_unordered(pd_str_cat): @pytest.mark.parametrize("from_ordered", [True, False]) @pytest.mark.parametrize("to_ordered", [True, False]) def test_categorical_reorder_categories(pd_str_cat, from_ordered, to_ordered): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered)) - cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(from_ordered)) + pd_sr = pd.Series(pd_str_cat.set_ordered(from_ordered)) + cd_sr = cudf.Series(pd_str_cat.set_ordered(from_ordered)) assert_eq(pd_sr, cd_sr) assert str(pd_sr) == str(cd_sr) - kwargs = dict( - ordered=to_ordered, - ) - - pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) - cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) + pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), ordered=to_ordered) + cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), ordered=to_ordered) assert_eq(pd_sr_1, cd_sr_1) @@ -386,8 +380,8 @@ def test_categorical_reorder_categories(pd_str_cat, from_ordered, to_ordered): def test_categorical_add_categories(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy()) - cd_sr = cudf.Series(pd_str_cat.copy()) + pd_sr = 
pd.Series(pd_str_cat) + cd_sr = cudf.Series(pd_str_cat) assert_eq(pd_sr, cd_sr) @@ -403,8 +397,8 @@ def test_categorical_add_categories(pd_str_cat): def test_categorical_remove_categories(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy()) - cd_sr = cudf.Series(pd_str_cat.copy()) + pd_sr = pd.Series(pd_str_cat) + cd_sr = cudf.Series(pd_str_cat) assert_eq(pd_sr, cd_sr) @@ -456,21 +450,22 @@ def test_categorical_dataframe_slice_copy(): ], ) @pytest.mark.parametrize( - "cat_type", + "categories", [ - pd.CategoricalDtype(categories=["aa", "bb", "cc"]), - pd.CategoricalDtype(categories=[2, 4, 10, 100]), - pd.CategoricalDtype(categories=["aa", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "b", "c"]), - pd.CategoricalDtype(categories=["1", "2", "3", "4"]), - pd.CategoricalDtype(categories=["1.0", "2.5", "3.001", "9"]), - pd.CategoricalDtype(categories=[]), + ["aa", "bb", "cc"], + [2, 4, 10, 100], + ["aa", "bb", "c"], + ["a", "bb", "c"], + ["a", "b", "c"], + ["1", "2", "3", "4"], + ["1.0", "2.5", "3.001", "9"], + [], ], ) -def test_categorical_typecast(data, cat_type): - pd_data = data.copy() +def test_categorical_typecast(data, categories): + pd_data = data gd_data = cudf.from_pandas(data) + cat_type = pd.CategoricalDtype(categories) assert_eq(pd_data.astype(cat_type), gd_data.astype(cat_type)) @@ -503,7 +498,7 @@ def test_categorical_typecast(data, cat_type): ], ) def test_categorical_set_categories_categoricals(data, new_categories): - pd_data = data.copy().astype("category") + pd_data = data.astype("category") gd_data = cudf.from_pandas(pd_data) expected = pd_data.cat.set_categories(new_categories=new_categories) @@ -539,18 +534,19 @@ def test_categorical_set_categories_categoricals(data, new_categories): ], ) @pytest.mark.parametrize( - "dtype", + "categories", [ - pd.CategoricalDtype(categories=["aa", "bb", "cc"]), - pd.CategoricalDtype(categories=[2, 4, 10, 100]), - pd.CategoricalDtype(categories=["aa", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "b", "c"]), - pd.CategoricalDtype(categories=["22", "b", "c"]), - pd.CategoricalDtype(categories=[]), + ["aa", "bb", "cc"], + [2, 4, 10, 100], + ["aa", "bb", "c"], + ["a", "bb", "c"], + ["a", "b", "c"], + ["22", "b", "c"], + [], ], ) -def test_categorical_creation(data, dtype): +def test_categorical_creation(data, categories): + dtype = pd.CategoricalDtype(categories) expected = pd.Series(data, dtype=dtype) got = cudf.Series(data, dtype=dtype) assert_eq(expected, got) @@ -584,33 +580,22 @@ def test_categorical_dtype(categories, ordered): @pytest.mark.parametrize( - ("data", "expected"), + ("values", "expected"), [ - (cudf.Series([1]), np.uint8), - (cudf.Series([1, None]), np.uint8), - (cudf.Series(np.arange(np.iinfo(np.int8).max)), np.uint8), - ( - cudf.Series(np.append(np.arange(np.iinfo(np.int8).max), [None])), - np.uint8, - ), - (cudf.Series(np.arange(np.iinfo(np.int16).max)), np.uint16), - ( - cudf.Series(np.append(np.arange(np.iinfo(np.int16).max), [None])), - np.uint16, - ), - (cudf.Series(np.arange(np.iinfo(np.uint8).max)), np.uint8), - ( - cudf.Series(np.append(np.arange(np.iinfo(np.uint8).max), [None])), - np.uint8, - ), - (cudf.Series(np.arange(np.iinfo(np.uint16).max)), np.uint16), - ( - cudf.Series(np.append(np.arange(np.iinfo(np.uint16).max), [None])), - np.uint16, - ), + ([1], np.uint8), + ([1, None], np.uint8), + (np.arange(np.iinfo(np.int8).max), np.uint8), + (np.append(np.arange(np.iinfo(np.int8).max), [None]), 
np.uint8), + (np.arange(np.iinfo(np.int16).max), np.uint16), + (np.append(np.arange(np.iinfo(np.int16).max), [None]), np.uint16), + (np.arange(np.iinfo(np.uint8).max), np.uint8), + (np.append(np.arange(np.iinfo(np.uint8).max), [None]), np.uint8), + (np.arange(np.iinfo(np.uint16).max), np.uint16), + (np.append(np.arange(np.iinfo(np.uint16).max), [None]), np.uint16), ], ) -def test_astype_dtype(data, expected): +def test_astype_dtype(values, expected): + data = cudf.Series(values) got = data.astype("category").cat.codes.dtype np.testing.assert_equal(got, expected) @@ -696,18 +681,19 @@ def test_add_categories_mixed_error(): ], ) @pytest.mark.parametrize( - "cat_dtype", + "categories", [ - pd.CategoricalDtype(categories=["aa", "bb", "cc"]), - pd.CategoricalDtype(categories=[2, 4, 10, 100]), - pd.CategoricalDtype(categories=["aa", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "b", "c"]), - pd.CategoricalDtype(categories=["22", "b", "c"]), - pd.CategoricalDtype(categories=["a"]), + ["aa", "bb", "cc"], + [2, 4, 10, 100], + ["aa", "bb", "c"], + ["a", "bb", "c"], + ["a", "b", "c"], + ["22", "b", "c"], + ["a"], ], ) -def test_categorical_assignment(data, cat_dtype): +def test_categorical_assignment(data, categories): + cat_dtype = pd.CategoricalDtype(categories) pd_df = pd.DataFrame() pd_df["a"] = np.ones(len(data)) cd_df = cudf.from_pandas(pd_df) @@ -777,16 +763,16 @@ def test_series_construction_with_nulls(input_obj, dtype): @pytest.mark.parametrize( "data", [ - {"a": cudf.Series(["a", "b", "c", "a", "c", "b"]).astype("category")}, + {"a": pd.Series(["a", "b", "c", "a", "c", "b"]).astype("category")}, { - "a": cudf.Series(["a", "a", "b", "b"]).astype("category"), - "b": cudf.Series(["b", "b", "c", "c"]).astype("category"), - "c": cudf.Series(["c", "c", "a", "a"]).astype("category"), + "a": pd.Series(["a", "a", "b", "b"]).astype("category"), + "b": pd.Series(["b", "b", "c", "c"]).astype("category"), + "c": pd.Series(["c", "c", "a", "a"]).astype("category"), }, { - "a": cudf.Series(["a", None, "b", "b"]).astype("category"), - "b": cudf.Series(["b", "b", None, "c"]).astype("category"), - "c": cudf.Series(["c", "c", "a", None]).astype("category"), + "a": pd.Series(["a", None, "b", "b"]).astype("category"), + "b": pd.Series(["b", "b", None, "c"]).astype("category"), + "c": pd.Series(["c", "c", "a", None]).astype("category"), }, ], ) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 41bf96f6939..15988673bcd 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -311,16 +311,17 @@ def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): @pytest.mark.parametrize( - "data,from_dtype,to_dtype", + "to_dtype", [ - (np.arange(9), "int8", "int64"), - (np.arange(3), "int8", "int16"), - (np.arange(6), "int8", "float32"), - (np.arange(1), "int8", "datetime64[ns]"), + "int64", + "int16", + "float32", + "datetime64[ns]", ], ) -def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): - from_dtype = np.dtype(from_dtype) +def test_column_view_invalid_numeric_to_numeric(to_dtype): + data = np.arange(5) + from_dtype = np.dtype("int8") to_dtype = np.dtype(to_dtype) cpu_data = np.asarray(data, dtype=from_dtype) gpu_data = as_column(data, dtype=from_dtype) @@ -407,34 +408,19 @@ def test_column_view_string_slice(slc): assert_eq(expect, got) +@pytest.mark.parametrize("box", [cp.asarray, np.asarray]) @pytest.mark.parametrize( - 
"data,expected", + "data", [ - ( - np.array([1, 2, 3, 4, 5], dtype="uint8"), - cudf.core.column.as_column( - [1, 2, 3, 4, 5], dtype=np.dtype(np.uint8) - ), - ), - ( - cp.array([1, 2, 3, 4, 5], dtype="uint8"), - cudf.core.column.as_column( - [1, 2, 3, 4, 5], dtype=np.dtype(np.uint8) - ), - ), - ( - cp.array([], dtype="uint8"), - cudf.core.column.column_empty(0, dtype=np.dtype(np.uint8)), - ), - ( - cp.array([255], dtype="uint8"), - cudf.core.column.as_column([255], dtype=np.dtype(np.uint8)), - ), + np.array([1, 2, 3, 4, 5], dtype="uint8"), + np.array([], dtype="uint8"), + np.array([255], dtype="uint8"), ], ) -def test_as_column_buffer(data, expected): +def test_as_column_buffer(box, data): + expected = cudf.core.column.as_column(data) actual_column = cudf.core.column.as_column( - cudf.core.buffer.as_buffer(data), dtype=data.dtype + cudf.core.buffer.as_buffer(box(data)), dtype=data.dtype ) assert_eq( cudf.Series._from_column(actual_column), @@ -560,12 +546,8 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): ("Float64", "float64"), ], ) -@pytest.mark.parametrize( - "data", - [[1, 2, 0]], -) -def test_astype_with_aliases(alias, expect_dtype, data): - pd_data = pd.Series(data) +def test_astype_with_aliases(alias, expect_dtype): + pd_data = pd.Series([1, 2, 0]) gd_data = cudf.Series.from_pandas(pd_data) assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index 023dbbd8daf..ae0e71b5f58 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -9,25 +9,6 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.testing import assert_eq -simple_test_data = [ - {}, - {"a": as_column([])}, - {"a": as_column([1])}, - {"a": as_column(["a"])}, - {"a": as_column([1, 2, 3]), "b": as_column(["a", "b", "c"])}, -] - -mi_test_data = [ - {("a", "b"): as_column([1, 2, 4]), ("a", "c"): as_column([2, 3, 4])}, - {("a", "b"): as_column([1, 2, 3]), ("a", ""): as_column([2, 3, 4])}, - {("a", "b"): as_column([1, 2, 4]), ("c", "d"): as_column([2, 3, 4])}, - { - ("a", "b"): as_column([1, 2, 3]), - ("a", "c"): as_column([2, 3, 4]), - ("b", ""): as_column([4, 5, 6]), - }, -] - def check_ca_equal(lhs, rhs): assert lhs.level_names == rhs.level_names @@ -39,19 +20,17 @@ def check_ca_equal(lhs, rhs): assert_eq(lhs[l_key], rhs[r_key]) -@pytest.fixture(params=simple_test_data) +@pytest.fixture( + params=[ + {}, + {"a": []}, + {"a": [1]}, + {"a": ["a"]}, + {"a": [1, 2, 3], "b": ["a", "b", "c"]}, + ] +) def simple_data(request): - return request.param - - -@pytest.fixture(params=mi_test_data) -def mi_data(request): - return request.param - - -@pytest.fixture(params=simple_test_data + mi_test_data) -def all_data(request): - return request.param + return {key: as_column(data) for key, data in request.param.items()} def test_to_pandas_simple(simple_data): @@ -72,7 +51,17 @@ def test_to_pandas_simple(simple_data): ) -def test_to_pandas_multiindex(mi_data): +@pytest.mark.parametrize( + "keys", + [ + [("a", "b"), ("a", "c")], + [("a", "b"), ("c", "d")], + [("a", "b"), ("a", ""), ("b", "")], + [("a", "b"), ("a", "c"), ("b", "")], + ], +) +def test_to_pandas_multiindex(keys): + mi_data = {key: as_column([1, 2, 4]) for key in keys} ca = ColumnAccessor(mi_data, multiindex=True) assert_eq( ca.to_pandas_index, diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 
fe86df99d35..2c6bc0e8a00 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import datetime @@ -9,74 +9,45 @@ import cudf from cudf import Series from cudf.core.index import Index, RangeIndex -from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES -def cudf_date_series(start, stop, freq): - return Series(pd.date_range(start, stop, freq=freq, name="times")) - - -def cudf_num_series(start, stop, step=1): - return Series(range(start, stop, step)) - - -def get_categorical_series(): - return Series( - pd.Categorical( - ["ab", "ac", "cd", "ab", "cd"], categories=["ab", "ac", "cd"] - ) - ) - - -def get_string_series(): - return Series(["ab", "ac", "ba", "cc", "ad"]) - - -# If the type being searched is different from type of series, exceptions -# are thrown well within the python code, and needs to be handled. -# Some of the test cases check this scenario. Example : String Vs Numerical -testdata_all = [ - ( - cudf_date_series("20010101", "20020215", freq="400h"), - datetime.datetime.strptime("2001-01-01", "%Y-%m-%d"), - True, - ), - ( - cudf_date_series("20010101", "20020215", freq="400h"), - datetime.datetime.strptime("2000-01-01", "%Y-%m-%d"), - False, - ), - (cudf_date_series("20010101", "20020215", freq="400h"), 20000101, False), - (get_categorical_series(), "cd", True), - (get_categorical_series(), "dc", False), - (get_categorical_series(), "c", False), - (get_categorical_series(), "c", False), - (get_categorical_series(), 1, False), - (get_string_series(), "ac", True), - (get_string_series(), "ca", False), - (get_string_series(), "c", False), - (get_string_series(), 97, False), - (cudf_num_series(0, 100, 5), 60, True), - (cudf_num_series(0, 100, 5), 71, False), - (cudf_num_series(0, 100, 5), "a", False), -] - - -@pytest.mark.parametrize("values, item, expected", testdata_all) -def test_series_contains(values, item, expected): - assert_eq(expected, item in Series(index=values)) - - -@pytest.mark.parametrize("values, item, expected", testdata_all) -def test_index_contains(values, item, expected): - index = Index(values) - assert_eq(expected, item in index) +@pytest.mark.parametrize( + "values, item, expected", + [ + [[1, 2, 3], 2, True], + [[1, 2, 3], 4, False], + [[1, 2, 3], "a", False], + [["a", "b", "c"], "a", True], + [["a", "b", "c"], "ab", False], + [["a", "b", "c"], 6, False], + [pd.Categorical(["a", "b", "c"]), "a", True], + [pd.Categorical(["a", "b", "c"]), "ab", False], + [pd.Categorical(["a", "b", "c"]), 6, False], + [pd.date_range("20010101", periods=5, freq="D"), 20000101, False], + [ + pd.date_range("20010101", periods=5, freq="D"), + datetime.datetime(2000, 1, 1), + False, + ], + [ + pd.date_range("20010101", periods=5, freq="D"), + datetime.datetime(2001, 1, 1), + True, + ], + ], +) +@pytest.mark.parametrize( + "box", [Index, lambda x: Series(index=x)], ids=["index", "series"] +) +def test_contains(values, item, expected, box): + assert (item in box(values)) is expected def test_rangeindex_contains(): - assert_eq(True, 9 in RangeIndex(start=0, stop=10, name="Index")) - assert_eq(False, 10 in RangeIndex(start=0, stop=10, name="Index")) + ridx = RangeIndex(start=0, stop=10, name="Index") + assert 9 in ridx + assert 10 not in ridx @pytest.mark.parametrize("dtype", NUMERIC_TYPES) diff --git a/python/cudf/cudf/tests/test_copying.py 
b/python/cudf/cudf/tests/test_copying.py index 26f92f75807..dc19c52715a 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -365,52 +365,48 @@ def test_series_zero_copy_cow_off(): @pytest.mark.parametrize("copy_on_write", [True, False]) def test_series_str_copy(copy_on_write): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", copy_on_write) - s = cudf.Series(["a", "b", "c", "d", "e"]) - s1 = s.copy(deep=True) - s2 = s.copy(deep=True) + with cudf.option_context("copy_on_write", copy_on_write): + s = cudf.Series(["a", "b", "c", "d", "e"]) + s1 = s.copy(deep=True) + s2 = s.copy(deep=True) - assert_eq(s, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) - s[0:3] = "abc" + s[0:3] = "abc" - assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) - assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) - s2[1:4] = "xyz" + s2[1:4] = "xyz" - assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) - assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s2, cudf.Series(["a", "xyz", "xyz", "xyz", "e"])) - cudf.set_option("copy_on_write", original_cow_setting) + assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "xyz", "xyz", "xyz", "e"])) @pytest.mark.parametrize("copy_on_write", [True, False]) def test_series_cat_copy(copy_on_write): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", copy_on_write) - s = cudf.Series([10, 20, 30, 40, 50], dtype="category") - s1 = s.copy(deep=True) - s2 = s1.copy(deep=True) - s3 = s1.copy(deep=True) - - s[0] = 50 - assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) - assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype="category")) - assert_eq(s2, cudf.Series([10, 20, 30, 40, 50], dtype="category")) - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50], dtype="category")) - - s2[3] = 10 - s3[2:5] = 20 - assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) - assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) - assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) - assert_eq(s3, cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) - cudf.set_option("copy_on_write", original_cow_setting) + with cudf.option_context("copy_on_write", copy_on_write): + s = cudf.Series([10, 20, 30, 40, 50], dtype="category") + s1 = s.copy(deep=True) + s2 = s1.copy(deep=True) + s3 = s1.copy(deep=True) + + s[0] = 50 + assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + assert_eq(s2, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + + s2[3] = 10 + s3[2:5] = 20 + assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) + assert_eq(s3, 
cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) def test_dataframe_cow_slice_setitem(): From 7f05fbcd58eac9447d8bd53d46b83d13f58fe483 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:28:10 -0700 Subject: [PATCH 096/366] Move test_search/test_scan/test_seriesmap.py to new cudf classic test directory structure (#19492) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19492 --- .../dataframe/methods/test_searchsorted.py | 42 +++ .../series/methods/test_cumulative_methods.py | 124 ++++++++ .../methods/test_map.py} | 0 .../methods/test_searchsorted.py} | 40 +-- python/cudf/cudf/tests/test_scan.py | 269 ------------------ 5 files changed, 167 insertions(+), 308 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_searchsorted.py create mode 100644 python/cudf/cudf/tests/series/methods/test_cumulative_methods.py rename python/cudf/cudf/tests/{test_seriesmap.py => series/methods/test_map.py} (100%) rename python/cudf/cudf/tests/{test_search.py => series/methods/test_searchsorted.py} (78%) delete mode 100644 python/cudf/cudf/tests/test_scan.py diff --git a/python/cudf/cudf/tests/dataframe/methods/test_searchsorted.py b/python/cudf/cudf/tests/dataframe/methods/test_searchsorted.py new file mode 100644 index 00000000000..f765c576993 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_searchsorted.py @@ -0,0 +1,42 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import pytest + +import cudf + + +@pytest.mark.parametrize("side", ["left", "right"]) +@pytest.mark.parametrize("multiindex", [True, False]) +def test_searchsorted_dataframe(side, multiindex): + values = cudf.DataFrame( + { + "a": [1, 0, 5, 1], + "b": [-0.998, 0.031, -0.888, -0.998], + "c": ["C", "A", "G", "B"], + } + ) + base = cudf.DataFrame( + { + "a": [1, 1, 1, 5], + "b": [-0.999, -0.998, -0.997, -0.888], + "c": ["A", "C", "E", "G"], + } + ) + + if multiindex: + base = base.set_index(["a", "b", "c"]).index + values = values.set_index(["a", "b", "c"]).index + + result = base.searchsorted(values, side=side).tolist() + + if side == "left": + assert result == [1, 0, 3, 1] + else: + assert result == [2, 0, 4, 1] + + +def test_search_sorted_dataframe_unequal_number_of_columns(): + values = cudf.DataFrame({"a": [1, 0, 5, 1]}) + base = cudf.DataFrame({"a": [1, 0, 5, 1], "b": ["x", "z", "w", "a"]}) + + with pytest.raises(ValueError, match="Mismatch number of columns"): + base.searchsorted(values) diff --git a/python/cudf/cudf/tests/series/methods/test_cumulative_methods.py b/python/cudf/cudf/tests/series/methods/test_cumulative_methods.py new file mode 100644 index 00000000000..8c2b14ec2d9 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_cumulative_methods.py @@ -0,0 +1,124 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
+from cudf.testing import assert_eq
+from cudf.testing._utils import gen_rand
+
+
+@pytest.fixture(params=[0, 5])
+def nelem(request):
+    return request.param
+
+
+@pytest.fixture(params=["cumsum", "cummin", "cummax", "cumprod"])
+def cumulative_methods(request):
+    return request.param
+
+
+def test_cumulative_methods(numeric_types_as_str, nelem, cumulative_methods):
+    dtype = np.dtype(numeric_types_as_str)
+    if dtype == np.int8:
+        # to keep data in range
+        data = gen_rand(dtype, nelem, low=-2, high=2)
+    else:
+        data = gen_rand(dtype, nelem)
+
+    decimal = 4 if dtype == np.float32 else 6
+
+    gs = cudf.Series(data)
+    ps = pd.Series(data)
+    np.testing.assert_array_almost_equal(
+        getattr(gs, cumulative_methods)().to_numpy(),
+        getattr(ps, cumulative_methods)(),
+        decimal=decimal,
+    )
+
+
+def test_cumulative_methods_masked(numeric_types_as_str, cumulative_methods):
+    data = [1, 2, None, 4, 5]
+    gs = cudf.Series(data).astype(numeric_types_as_str)
+    # float64 since pandas uses NaN as missing value
+    ps = pd.Series(data).astype("float64")
+    assert_eq(
+        getattr(gs, cumulative_methods)(),
+        getattr(ps, cumulative_methods)(),
+        check_dtype=False,
+    )
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        Decimal64Dtype(8, 4),
+        Decimal64Dtype(10, 5),
+        Decimal64Dtype(12, 7),
+        Decimal32Dtype(8, 5),
+        Decimal128Dtype(13, 6),
+    ],
+)
+def test_cumsum_decimal(dtype):
+    data = ["243.32", "48.245", "-7234.298", np.nan, "-467.2"]
+    gser = cudf.Series(data).astype(dtype)
+    pser = pd.Series(data, dtype="float64")
+
+    got = gser.cumsum()
+    expected = cudf.Series.from_pandas(pser.cumsum()).astype(dtype)
+
+    assert_eq(got, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        Decimal64Dtype(8, 4),
+        Decimal64Dtype(11, 6),
+        Decimal64Dtype(14, 7),
+        Decimal32Dtype(8, 4),
+        Decimal128Dtype(11, 6),
+    ],
+)
+def test_cummin_decimal(dtype):
+    data = ["8394.294", np.nan, "-9940.444", np.nan, "-23.928"]
+    gser = cudf.Series(data).astype(dtype)
+    pser = pd.Series(data, dtype="float64")
+
+    got = gser.cummin()
+    expected = cudf.Series.from_pandas(pser.cummin()).astype(dtype)
+
+    assert_eq(got, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        Decimal64Dtype(8, 4),
+        Decimal64Dtype(11, 6),
+        Decimal64Dtype(14, 7),
+        Decimal32Dtype(8, 4),
+        Decimal128Dtype(11, 6),
+    ],
+)
+def test_cummax_decimal(dtype):
+    data = [np.nan, "54.203", "8.222", "644.32", "-562.272"]
+    gser = cudf.Series(data).astype(dtype)
+    pser = pd.Series(data, dtype="float64")
+
+    got = gser.cummax()
+    expected = cudf.Series.from_pandas(pser.cummax()).astype(dtype)
+
+    assert_eq(got, expected)
+
+
+@pytest.mark.parametrize("method", ["cumsum", "cumprod"])
+def test_scan_boolean(method):
+    s = cudf.Series([True, False, True, False])
+
+    got = getattr(s, method)()
+    expect = getattr(s.to_pandas(), method)()
+
+    assert_eq(expect, got)
diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/series/methods/test_map.py
similarity index 100%
rename from python/cudf/cudf/tests/test_seriesmap.py
rename to python/cudf/cudf/tests/series/methods/test_map.py
diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/series/methods/test_searchsorted.py
similarity index 78%
rename from python/cudf/cudf/tests/test_search.py
rename to python/cudf/cudf/tests/series/methods/test_searchsorted.py
index 6fcbcde5be7..23fb916a202 100644
--- a/python/cudf/cudf/tests/test_search.py
+++ 
b/python/cudf/cudf/tests/series/methods/test_searchsorted.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import cupy import numpy as np import pandas as pd @@ -50,44 +50,6 @@ def test_searchsorted(side, obj_class, vals_class): assert_eq(expect, cupy.asnumpy(got)) -@pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("multiindex", [True, False]) -def test_searchsorted_dataframe(side, multiindex): - values = cudf.DataFrame( - { - "a": [1, 0, 5, 1], - "b": [-0.998, 0.031, -0.888, -0.998], - "c": ["C", "A", "G", "B"], - } - ) - base = cudf.DataFrame( - { - "a": [1, 1, 1, 5], - "b": [-0.999, -0.998, -0.997, -0.888], - "c": ["A", "C", "E", "G"], - } - ) - - if multiindex: - base = base.set_index(["a", "b", "c"]).index - values = values.set_index(["a", "b", "c"]).index - - result = base.searchsorted(values, side=side).tolist() - - if side == "left": - assert result == [1, 0, 3, 1] - else: - assert result == [2, 0, 4, 1] - - -def test_search_sorted_dataframe_unequal_number_of_columns(): - values = cudf.DataFrame({"a": [1, 0, 5, 1]}) - base = cudf.DataFrame({"a": [1, 0, 5, 1], "b": ["x", "z", "w", "a"]}) - - with pytest.raises(ValueError, match="Mismatch number of columns"): - base.searchsorted(values) - - @pytest.mark.parametrize("side", ["left", "right"]) def test_searchsorted_categorical(side): cat1 = pd.Categorical( diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py deleted file mode 100644 index d4b21480070..00000000000 --- a/python/cudf/cudf/tests/test_scan.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing import assert_eq -from cudf.testing._utils import INTEGER_TYPES, NUMERIC_TYPES, gen_rand - - -@pytest.fixture(params=NUMERIC_TYPES) -def dtype(request): - return request.param - - -@pytest.fixture(params=[0, 1, 5]) -def nelem(request): - return request.param - - -def test_cumsum(dtype, nelem): - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, nelem, low=-2, high=2) - else: - data = gen_rand(dtype, nelem) - - decimal = 4 if dtype == np.float32 else 6 - - # series - gs = cudf.Series(data) - ps = pd.Series(data) - np.testing.assert_array_almost_equal( - gs.cumsum().to_numpy(), ps.cumsum(), decimal=decimal - ) - - # dataframe series (named series) - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series(data) - pdf = pd.DataFrame() - pdf["a"] = pd.Series(data) - np.testing.assert_array_almost_equal( - gdf.a.cumsum().to_numpy(), pdf.a.cumsum(), decimal=decimal - ) - - -def test_cumsum_masked(): - data = [1, 2, None, 4, 5] - float_types = ["float32", "float64"] - - for type_ in float_types: - gs = cudf.Series(data).astype(type_) - ps = pd.Series(data).astype(type_) - assert_eq(gs.cumsum(), ps.cumsum()) - - for type_ in INTEGER_TYPES: - gs = cudf.Series(data).astype(type_) - got = gs.cumsum() - expected = pd.Series([1, 3, np.nan, 7, 12], dtype="float64") - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(8, 4), - Decimal64Dtype(10, 5), - Decimal64Dtype(12, 7), - Decimal32Dtype(8, 5), - Decimal128Dtype(13, 6), - ], -) -def test_cumsum_decimal(dtype): - data = ["243.32", "48.245", "-7234.298", np.nan, "-467.2"] - gser = cudf.Series(data).astype(dtype) - pser = pd.Series(data, dtype="float64") - - got = gser.cumsum() - 
expected = cudf.Series.from_pandas(pser.cumsum()).astype(dtype) - - assert_eq(got, expected) - - -def test_cummin(dtype, nelem): - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, nelem, low=-2, high=2) - else: - data = gen_rand(dtype, nelem) - - decimal = 4 if dtype == np.float32 else 6 - - # series - gs = cudf.Series(data) - ps = pd.Series(data) - np.testing.assert_array_almost_equal( - gs.cummin().to_numpy(), ps.cummin(), decimal=decimal - ) - - # dataframe series (named series) - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series(data) - pdf = pd.DataFrame() - pdf["a"] = pd.Series(data) - np.testing.assert_array_almost_equal( - gdf.a.cummin().to_numpy(), pdf.a.cummin(), decimal=decimal - ) - - -def test_cummin_masked(): - data = [1, 2, None, 4, 5] - float_types = ["float32", "float64"] - - for type_ in float_types: - gs = cudf.Series(data).astype(type_) - ps = pd.Series(data).astype(type_) - assert_eq(gs.cummin(), ps.cummin()) - - for type_ in INTEGER_TYPES: - gs = cudf.Series(data).astype(type_) - expected = pd.Series([1, 1, np.nan, 1, 1]).astype("float64") - assert_eq(gs.cummin(), expected) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(8, 4), - Decimal64Dtype(11, 6), - Decimal64Dtype(14, 7), - Decimal32Dtype(8, 4), - Decimal128Dtype(11, 6), - ], -) -def test_cummin_decimal(dtype): - data = ["8394.294", np.nan, "-9940.444", np.nan, "-23.928"] - gser = cudf.Series(data).astype(dtype) - pser = pd.Series(data, dtype="float64") - - got = gser.cummin() - expected = cudf.Series.from_pandas(pser.cummin()).astype(dtype) - - assert_eq(got, expected) - - -def test_cummax(dtype, nelem): - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, nelem, low=-2, high=2) - else: - data = gen_rand(dtype, nelem) - - decimal = 4 if dtype == np.float32 else 6 - - # series - gs = cudf.Series(data) - ps = pd.Series(data) - np.testing.assert_array_almost_equal( - gs.cummax().to_numpy(), ps.cummax(), decimal=decimal - ) - - # dataframe series (named series) - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series(data) - pdf = pd.DataFrame() - pdf["a"] = pd.Series(data) - np.testing.assert_array_almost_equal( - gdf.a.cummax().to_numpy(), pdf.a.cummax(), decimal=decimal - ) - - -def test_cummax_masked(): - data = [1, 2, None, 4, 5] - float_types = ["float32", "float64"] - - for type_ in float_types: - gs = cudf.Series(data).astype(type_) - ps = pd.Series(data).astype(type_) - assert_eq(gs.cummax(), ps.cummax()) - - for type_ in INTEGER_TYPES: - gs = cudf.Series(data).astype(type_) - expected = pd.Series([1, 2, np.nan, 4, 5]).astype("float64") - assert_eq(gs.cummax(), expected) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(8, 4), - Decimal64Dtype(11, 6), - Decimal64Dtype(14, 7), - Decimal32Dtype(8, 4), - Decimal128Dtype(11, 6), - ], -) -def test_cummax_decimal(dtype): - data = [np.nan, "54.203", "8.222", "644.32", "-562.272"] - gser = cudf.Series(data).astype(dtype) - pser = pd.Series(data, dtype="float64") - - got = gser.cummax() - expected = cudf.Series.from_pandas(pser.cummax()).astype(dtype) - - assert_eq(got, expected) - - -def test_cumprod(dtype, nelem): - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, nelem, low=-2, high=2) - else: - data = gen_rand(dtype, nelem) - - decimal = 4 if dtype == np.float32 else 6 - - # series - gs = cudf.Series(data) - ps = pd.Series(data) - np.testing.assert_array_almost_equal( - gs.cumprod().to_numpy(), ps.cumprod(), decimal=decimal - ) - - # dataframe series (named series) - gdf = 
cudf.DataFrame() - gdf["a"] = cudf.Series(data) - pdf = pd.DataFrame() - pdf["a"] = pd.Series(data) - np.testing.assert_array_almost_equal( - gdf.a.cumprod().to_numpy(), pdf.a.cumprod(), decimal=decimal - ) - - -def test_cumprod_masked(): - data = [1, 2, None, 4, 5] - float_types = ["float32", "float64"] - - for type_ in float_types: - gs = cudf.Series(data).astype(type_) - ps = pd.Series(data).astype(type_) - assert_eq(gs.cumprod(), ps.cumprod()) - - for type_ in INTEGER_TYPES: - gs = cudf.Series(data).astype(type_) - got = gs.cumprod() - expected = pd.Series([1, 2, np.nan, 8, 40], dtype="float64") - assert_eq(got, expected) - - -def test_scan_boolean_cumsum(): - s = cudf.Series([0, -1, -300, 23, 4, -3, 0, 0, 100]) - - # cumsum test - got = (s > 0).cumsum() - expect = (s > 0).to_pandas().cumsum() - - assert_eq(expect, got) - - -def test_scan_boolean_cumprod(): - s = cudf.Series([0, -1, -300, 23, 4, -3, 0, 0, 100]) - - # cumprod test - got = (s > 0).cumprod() - expect = (s > 0).to_pandas().cumprod() - - assert_eq(expect, got) From 9c6ee0aa5bbd448f208cbf88ef5c669e73f95516 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Aug 2025 15:52:19 -0700 Subject: [PATCH 097/366] Add streams to stream_compaction (#19651) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19651 --- .../pylibcudf/libcudf/stream_compaction.pxd | 22 +++-- .../pylibcudf/pylibcudf/stream_compaction.pxd | 25 ++++-- .../pylibcudf/pylibcudf/stream_compaction.pyi | 30 +++++-- .../pylibcudf/pylibcudf/stream_compaction.pyx | 83 ++++++++++++++----- 4 files changed, 121 insertions(+), 39 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd index 78b9bcb299b..ba7c7f6edd1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -14,6 +14,7 @@ from pylibcudf.libcudf.types cimport ( null_policy, size_type, ) +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: @@ -26,18 +27,21 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: cdef unique_ptr[table] drop_nulls( table_view source_table, vector[size_type] keys, - size_type keep_threshold + size_type keep_threshold, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] drop_nans( table_view source_table, vector[size_type] keys, - size_type keep_threshold + size_type keep_threshold, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] apply_boolean_mask( table_view source_table, - column_view boolean_mask + column_view boolean_mask, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] unique( @@ -45,6 +49,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: vector[size_type] keys, duplicate_keep_option keep, null_equality nulls_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] distinct( @@ -53,6 +58,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equals, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] distinct_indices( @@ -60,6 +66,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] stable_distinct( @@ -68,12 +75,14 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef size_type unique_count( column_view column, null_policy null_handling, - nan_policy nan_handling) except +libcudf_exception_handler + nan_policy nan_handling, + cuda_stream_view stream) except +libcudf_exception_handler cdef size_type unique_count( table_view source_table, @@ -82,7 +91,8 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: cdef size_type distinct_count( column_view column, null_policy null_handling, - nan_policy nan_handling) except +libcudf_exception_handler + nan_policy nan_handling, + cuda_stream_view stream) except +libcudf_exception_handler cdef size_type distinct_count( table_view source_table, diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd index a20a23e2e58..1eacdf32f2c 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.libcudf.stream_compaction cimport duplicate_keep_option from pylibcudf.libcudf.types cimport ( @@ -8,22 +8,30 @@ from pylibcudf.libcudf.types cimport ( null_policy, size_type, ) +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table -cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold) +cpdef Table drop_nulls( + Table source_table, list keys, size_type keep_threshold, Stream stream = * +) -cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold) +cpdef Table drop_nans( + Table source_table, list keys, size_type keep_threshold, Stream stream = * +) -cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask) +cpdef Table apply_boolean_mask( + Table source_table, Column boolean_mask, Stream stream = * +) cpdef Table unique( Table input, list keys, duplicate_keep_option keep, null_equality nulls_equal, + Stream stream = *, ) cpdef Table distinct( @@ -32,6 +40,7 @@ cpdef Table distinct( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + Stream stream = *, ) cpdef Column distinct_indices( @@ -39,6 +48,7 @@ cpdef Column distinct_indices( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + Stream stream = *, ) cpdef Table stable_distinct( @@ -47,16 +57,19 @@ cpdef Table stable_distinct( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + Stream stream = *, ) cpdef size_type unique_count( Column column, null_policy null_handling, - nan_policy nan_handling + nan_policy nan_handling, + Stream stream = * ) cpdef size_type distinct_count( Column column, null_policy null_handling, - nan_policy nan_handling + nan_policy nan_handling, + Stream stream = * ) diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyi b/python/pylibcudf/pylibcudf/stream_compaction.pyi index 99cade48309..5b7acab3f50 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyi +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyi @@ -2,6 +2,8 @@ from enum import IntEnum +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.table import Table from pylibcudf.types import NanEquality, NanPolicy, NullEquality, NullPolicy @@ -13,17 +15,26 @@ class DuplicateKeepOption(IntEnum): KEEP_NONE = ... def drop_nulls( - source_table: Table, keys: list[int], keep_threshold: int + source_table: Table, + keys: list[int], + keep_threshold: int, + stream: Stream | None = None, ) -> Table: ... def drop_nans( - source_table: Table, keys: list[int], keep_threshold: int + source_table: Table, + keys: list[int], + keep_threshold: int, + stream: Stream | None = None, +) -> Table: ... +def apply_boolean_mask( + source_table: Table, boolean_mask: Column, stream: Stream | None = None ) -> Table: ... -def apply_boolean_mask(source_table: Table, boolean_mask: Column) -> Table: ... def unique( input: Table, keys: list[int], keep: DuplicateKeepOption, nulls_equal: NullEquality, + stream: Stream | None = None, ) -> Table: ... def distinct( input: Table, @@ -31,12 +42,14 @@ def distinct( keep: DuplicateKeepOption, nulls_equal: NullEquality, nans_equal: NanEquality, + stream: Stream | None = None, ) -> Table: ... def distinct_indices( input: Table, keep: DuplicateKeepOption, nulls_equal: NullEquality, nans_equal: NanEquality, + stream: Stream | None = None, ) -> Column: ... 
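All of the new stream parameters are optional keywords; when omitted, the
functions fall back to the default CUDA stream (via _get_stream), so existing
callers are unaffected. A minimal usage sketch, assuming pyarrow interop for
building the table and RMM's exported DEFAULT_STREAM (the data values are
illustrative only):

    import pyarrow as pa
    import pylibcudf as plc
    from rmm.pylibrmm.stream import DEFAULT_STREAM

    # Build a pylibcudf Table from host data via pyarrow interop.
    tbl = plc.interop.from_arrow(pa.table({"a": [1, None, 3, None]}))

    # Keep rows with at least one non-null value in key column 0, issuing
    # the work on an explicitly supplied stream.
    filtered = plc.stream_compaction.drop_nulls(tbl, [0], 1, stream=DEFAULT_STREAM)

    # Omitting the argument preserves the previous default-stream behavior.
    filtered_default = plc.stream_compaction.drop_nulls(tbl, [0], 1)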
def stable_distinct( input: Table, @@ -44,10 +57,17 @@ def stable_distinct( keep: DuplicateKeepOption, nulls_equal: NullEquality, nans_equal: NanEquality, + stream: Stream | None = None, ) -> Table: ... def unique_count( - source: Column, null_handling: NullPolicy, nan_handling: NanPolicy + source: Column, + null_handling: NullPolicy, + nan_handling: NanPolicy, + stream: Stream | None = None, ) -> int: ... def distinct_count( - source: Column, null_handling: NullPolicy, nan_handling: NanPolicy + source: Column, + null_handling: NullPolicy, + nan_handling: NanPolicy, + stream: Stream | None = None, ) -> int: ... diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index 8f308e3b29e..8eafe040508 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -17,9 +17,11 @@ from pylibcudf.libcudf.types cimport ( from pylibcudf.libcudf.stream_compaction import \ duplicate_keep_option as DuplicateKeepOption # no-cython-lint, isort:skip +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table +from .utils cimport _get_stream __all__ = [ "DuplicateKeepOption", @@ -34,7 +36,9 @@ __all__ = [ "unique_count", ] -cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): +cpdef Table drop_nulls( + Table source_table, list keys, size_type keep_threshold, Stream stream=None +): """Filters out rows from the input table based on the presence of nulls. For details, see :cpp:func:`drop_nulls`. @@ -55,14 +59,19 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """ cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys + + stream = _get_stream(stream) + with nogil: c_result = cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold + source_table.view(), c_keys, keep_threshold, stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) -cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): +cpdef Table drop_nans( + Table source_table, list keys, size_type keep_threshold, Stream stream=None +): """Filters out rows from the input table based on the presence of NaNs. For details, see :cpp:func:`drop_nans`. @@ -83,14 +92,19 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): """ cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys + + stream = _get_stream(stream) + with nogil: - c_result = cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold + c_result = cpp_stream_compaction.drop_nans( + source_table.view(), c_keys, keep_threshold, stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) -cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): +cpdef Table apply_boolean_mask( + Table source_table, Column boolean_mask, Stream stream=None +): """Filters out rows from the input table based on a boolean mask. For details, see :cpp:func:`apply_boolean_mask`. @@ -108,11 +122,14 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): A new table with rows removed based on the boolean mask. 
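    For example, applying the mask [True, False, True] to a three-row table
    keeps rows 0 and 2; a null mask entry is treated as False, so the
    corresponding row is dropped as well.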
""" cdef unique_ptr[table] c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_stream_compaction.apply_boolean_mask( - source_table.view(), boolean_mask.view() + source_table.view(), boolean_mask.view(), stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef Table unique( @@ -120,6 +137,7 @@ cpdef Table unique( list keys, duplicate_keep_option keep, null_equality nulls_equal, + Stream stream=None, ): """Filter duplicate consecutive rows from the input table. @@ -149,11 +167,14 @@ cpdef Table unique( """ cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys + + stream = _get_stream(stream) + with nogil: c_result = cpp_stream_compaction.unique( - input.view(), c_keys, keep, nulls_equal + input.view(), c_keys, keep, nulls_equal, stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef Table distinct( @@ -162,6 +183,7 @@ cpdef Table distinct( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + Stream stream=None, ): """Get the distinct rows from the input table. @@ -188,11 +210,14 @@ cpdef Table distinct( """ cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys + + stream = _get_stream(stream) + with nogil: c_result = cpp_stream_compaction.distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal + input.view(), c_keys, keep, nulls_equal, nans_equal, stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef Column distinct_indices( @@ -200,6 +225,7 @@ cpdef Column distinct_indices( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + Stream stream=None, ): """Get the indices of the distinct rows from the input table. @@ -222,11 +248,14 @@ cpdef Column distinct_indices( A new column with the indices of the distinct rows from the input table. """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_stream_compaction.distinct_indices( - input.view(), keep, nulls_equal, nans_equal + input.view(), keep, nulls_equal, nans_equal, stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Table stable_distinct( @@ -235,6 +264,7 @@ cpdef Table stable_distinct( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + Stream stream=None, ): """Get the distinct rows from the input table, preserving input order. @@ -261,17 +291,21 @@ cpdef Table stable_distinct( """ cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys + + stream = _get_stream(stream) + with nogil: c_result = cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal + input.view(), c_keys, keep, nulls_equal, nans_equal, stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef size_type unique_count( Column source, null_policy null_handling, - nan_policy nan_handling + nan_policy nan_handling, + Stream stream=None ): """Returns the number of unique consecutive elements in the input column. @@ -296,15 +330,18 @@ cpdef size_type unique_count( If the input column is sorted, then unique_count can produce the same result as distinct_count, but faster. 
""" + stream = _get_stream(stream) + return cpp_stream_compaction.unique_count( - source.view(), null_handling, nan_handling + source.view(), null_handling, nan_handling, stream.view() ) cpdef size_type distinct_count( Column source, null_policy null_handling, - nan_policy nan_handling + nan_policy nan_handling, + Stream stream=None ): """Returns the number of distinct elements in the input column. @@ -324,8 +361,10 @@ cpdef size_type distinct_count( size_type The number of distinct elements in the input column. """ + stream = _get_stream(stream) + return cpp_stream_compaction.distinct_count( - source.view(), null_handling, nan_handling + source.view(), null_handling, nan_handling, stream.view() ) DuplicateKeepOption.__str__ = DuplicateKeepOption.__repr__ From edf638edfb0c7592af261f00592569696e5691a1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Aug 2025 16:15:04 -0700 Subject: [PATCH 098/366] Move some test_datetime.py tests to new cudf classic test directory structure (#19505) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19505 --- .../tests/dataframe/indexing/test_setitem.py | 12 + .../tests/dataframe/methods/test_dropna.py | 34 + .../dataframe/methods/test_sort_values.py | 26 + .../tests/dataframe/methods/test_to_arrow.py | 10 + .../datetime => dateoffset}/__init__.py | 0 .../tests/dateoffset/test_constructors.py | 10 + .../general_functions/test_date_range.py | 238 ++ .../general_functions/test_to_datetime.py | 312 +++ .../tests/indexes/datetime/test_components.py | 1 - .../indexes/datetime/test_constructing.py | 1 - .../tests/indexes/datetime/test_conversion.py | 1 - .../indexes/datetimeindex/methods/__init__.py | 0 .../datetimeindex/methods/test_isocalendar.py | 30 + .../datetimeindex/methods/test_strftime.py | 24 + .../datetimeindex/methods/test_tz_convert.py | 16 + .../methods/test_tz_localize.py} | 11 +- .../indexes/datetimeindex/test_attributes.py | 106 +- .../indexes/datetimeindex/test_constructor.py | 87 + .../test_getitem.py} | 0 .../cudf/tests/series/accessors/test_dt.py | 478 ++++ .../tests/series/indexing/test_getitem.py | 5 + .../cudf/tests/series/methods/test_astype.py | 183 ++ .../cudf/tests/series/methods/test_fillna.py | 39 + .../tests/series/methods/test_first_last.py | 126 + .../cudf/tests/series/methods/test_nunique.py | 27 + .../cudf/tests/series/methods/test_query.py | 49 + .../tests/series/methods/test_to_pandas.py | 18 + .../cudf/tests/series/methods/test_unique.py | 38 + .../cudf/cudf/tests/series/test_attributes.py | 52 + .../cudf/tests/series/test_constructors.py | 99 + python/cudf/cudf/tests/test_datetime.py | 2020 +---------------- 31 files changed, 2020 insertions(+), 2033 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/indexing/test_setitem.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_dropna.py rename python/cudf/cudf/tests/{indexes/datetime => dateoffset}/__init__.py (100%) create mode 100644 python/cudf/cudf/tests/dateoffset/test_constructors.py create mode 100644 python/cudf/cudf/tests/general_functions/test_date_range.py create mode 100644 python/cudf/cudf/tests/general_functions/test_to_datetime.py delete mode 100644 python/cudf/cudf/tests/indexes/datetime/test_components.py 
delete mode 100644 python/cudf/cudf/tests/indexes/datetime/test_constructing.py delete mode 100644 python/cudf/cudf/tests/indexes/datetime/test_conversion.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/methods/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/methods/test_isocalendar.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/methods/test_strftime.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_convert.py rename python/cudf/cudf/tests/indexes/{datetime/test_time_specific.py => datetimeindex/methods/test_tz_localize.py} (62%) create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/test_constructor.py rename python/cudf/cudf/tests/indexes/{datetime/test_indexing.py => datetimeindex/test_getitem.py} (100%) create mode 100644 python/cudf/cudf/tests/series/methods/test_first_last.py create mode 100644 python/cudf/cudf/tests/series/methods/test_query.py create mode 100644 python/cudf/cudf/tests/series/methods/test_unique.py diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py new file mode 100644 index 00000000000..73f4632fafd --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py @@ -0,0 +1,12 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import pandas as pd + +from cudf import DataFrame + + +def test_setitem_datetime(): + df = DataFrame() + df["date"] = pd.date_range("20010101", "20010105").values + assert df.date.dtype.kind == "M" diff --git a/python/cudf/cudf/tests/dataframe/methods/test_dropna.py b/python/cudf/cudf/tests/dataframe/methods/test_dropna.py new file mode 100644 index 00000000000..ec27503a0ef --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_dropna.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import numpy as np +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_datetime_dataframe(): + data = { + "timearray": np.array( + [0, 1, None, 2, 20, None, 897], dtype="datetime64[ms]" + ) + } + gdf = cudf.DataFrame(data) + pdf = pd.DataFrame(data) + + assert_eq(pdf, gdf) + + assert_eq(pdf.dropna(), gdf.dropna()) + + assert_eq(pdf.isnull(), gdf.isnull()) + + data = np.array([0, 1, None, 2, 20, None, 897], dtype="datetime64[ms]") + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq(ps, gs) + + assert_eq(ps.dropna(), gs.dropna()) + + assert_eq(ps.isnull(), gs.isnull()) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py b/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py index 1c322ff67af..2a7b53d94cb 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py @@ -6,6 +6,7 @@ import pandas as pd import pytest +import cudf from cudf import DataFrame, option_context from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq @@ -192,3 +193,28 @@ def test_sort_values_by_ambiguous(): lfunc_args_and_kwargs=(["a"], {}), rfunc_args_and_kwargs=(["a"], {}), ) + + +def test_sort_values_datetime(): + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + { + "date": np.array( + [ + np.datetime64("2016-11-20"), + np.datetime64("2020-11-20"), + np.datetime64("2019-11-20"), + np.datetime64("1918-11-20"), + np.datetime64("2118-11-20"), + ] + ), + "vals": rng.random(5), + } + ) + + gdf = cudf.from_pandas(df) + + s_df = df.sort_values(by="date") + s_gdf = gdf.sort_values(by="date") + + assert_eq(s_df, s_gdf) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py index 4bbed8fab9e..c033efed0b0 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py @@ -1,9 +1,11 @@ # Copyright (c) 2023-2025, NVIDIA CORPORATION. +import pandas as pd import pyarrow as pa import pytest import cudf +from cudf.testing import assert_eq @pytest.mark.parametrize("preserve_index", [False, True, None]) @@ -13,3 +15,11 @@ def test_dataframe_to_arrow_preserve_index(preserve_index): expect = pa.Table.from_pandas(pf, preserve_index=preserve_index).schema got = df.to_arrow(preserve_index=preserve_index).schema assert expect == got + + +def test_datetime_to_arrow(datetime_types_as_str): + data = pd.date_range("2000-01-01", "2000-01-02", freq="3600s") + gdf = cudf.DataFrame({"timestamp": data.astype(datetime_types_as_str)}) + assert_eq( + gdf, cudf.DataFrame.from_arrow(gdf.to_arrow(preserve_index=False)) + ) diff --git a/python/cudf/cudf/tests/indexes/datetime/__init__.py b/python/cudf/cudf/tests/dateoffset/__init__.py similarity index 100% rename from python/cudf/cudf/tests/indexes/datetime/__init__.py rename to python/cudf/cudf/tests/dateoffset/__init__.py diff --git a/python/cudf/cudf/tests/dateoffset/test_constructors.py b/python/cudf/cudf/tests/dateoffset/test_constructors.py new file mode 100644 index 00000000000..56338f44773 --- /dev/null +++ b/python/cudf/cudf/tests/dateoffset/test_constructors.py @@ -0,0 +1,10 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd + +import cudf + + +def test_dateoffset_instance_subclass_check(): + assert not issubclass(pd.DateOffset, cudf.DateOffset) + assert not isinstance(pd.DateOffset(), cudf.DateOffset) diff --git a/python/cudf/cudf/tests/general_functions/test_date_range.py b/python/cudf/cudf/tests/general_functions/test_date_range.py new file mode 100644 index 00000000000..199ebca8eeb --- /dev/null +++ b/python/cudf/cudf/tests/general_functions/test_date_range.py @@ -0,0 +1,238 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import ( + expect_warning_if, +) + + +def test_date_range_freq_default(): + result = pd.date_range("2020-01-01", periods=2, name="foo") + expected = cudf.date_range("2020-01-01", periods=2, name="foo") + assert_eq(result, expected) + + +def test_date_range_tz(): + result = pd.date_range("2020-01-01", periods=2, tz="UTC") + expected = cudf.date_range("2020-01-01", periods=2, tz="UTC") + assert_eq(result, expected) + + result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") + expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") + assert_eq(result, expected) + + +def test_date_range_start_end_divisible_by_freq(): + result = cudf.date_range("2011-01-01", "2011-01-02", freq="h") + expected = pd.date_range("2011-01-01", "2011-01-02", freq="h") + assert_eq(result, expected) + + +@pytest.fixture( + params=[ + "2000-02-13 08:41:06", + "1996-11-21 04:05:30", + "1970-01-01 00:00:00", + "1831-05-08 15:23:21", + ], + ids=["leap_year", "non_leap_year", "unix_epoch_time_0", "random_date"], +) +def start(request): + return request.param + + +@pytest.fixture( + params=[ + "2000-02-13 08:41:06", + "1996-11-21 04:05:30", + "1970-01-01 00:00:00", + "1831-05-08 15:23:21", + ], + ids=["leap_year", "non_leap_year", "unix_epoch_time_0", "random_date"], +) +def end(request): + return request.param + + +@pytest.fixture(params=[1, 10]) +def periods(request): + return request.param + + +@pytest.fixture( + params=[ + {"months": 3, "years": 1}, + {"hours": 10, "days": 57, "nanoseconds": 3}, + "83D", + "17h", + "-680min", + "110546s", + "110546789ms", + "110546789248us", + ] +) +def freq(request): + return request.param + + +def test_date_range_start_end_periods(start, end, periods): + expect = pd.date_range(start=start, end=end, periods=periods, name="a") + got = cudf.date_range(start=start, end=end, periods=periods, name="a") + + np.testing.assert_allclose( + expect.to_numpy().astype("int64"), + got.to_pandas().to_numpy().astype("int64"), + ) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/46877", +) +def test_date_range_end_freq_periods(end, freq, periods): + if isinstance(freq, str): + _gfreq = _pfreq = freq + else: + _gfreq = cudf.DateOffset(**freq) + _pfreq = pd.DateOffset(**freq) + + expect = pd.date_range(end=end, periods=periods, freq=_pfreq, name="a") + got = cudf.date_range(end=end, periods=periods, freq=_gfreq, name="a") + + np.testing.assert_allclose( + expect.to_numpy().astype("int64"), + got.to_pandas().to_numpy().astype("int64"), + ) + + +def test_date_range_freq_does_not_divide_range(): + expect = pd.date_range( + "2001-01-01 00:00:00.000000", "2001-01-01 00:00:00.000010", freq="3us" + ) + got = cudf.date_range( + 
"2001-01-01 00:00:00.000000", "2001-01-01 00:00:00.000010", freq="3us" + ) + np.testing.assert_allclose( + expect.to_numpy().astype("int64"), + got.to_pandas().to_numpy().astype("int64"), + ) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"nanoseconds": 1}, + {"months": 1}, + ], +) +def test_date_range_raise_overflow(kwargs): + start = np.datetime64(np.iinfo("int64").max, "ns") + periods = 2 + freq = cudf.DateOffset(**kwargs) + with pytest.raises(pd.errors.OutOfBoundsDatetime): + cudf.date_range(start=start, periods=periods, freq=freq) + + +@pytest.mark.parametrize( + "freqstr_unsupported", + [ + "1ME", + "2SME", + "3MS", + "4BME", + "5CBME", + "6SMS", + "7BMS", + "8CBMS", + "QE", + "2BQE", + "3BQS", + "10YE", + "9BYE", + "8YS", + "7BYS", + "bh", + "B", + ], +) +def test_date_range_raise_unsupported(freqstr_unsupported): + if not PANDAS_GE_220 and freqstr_unsupported.endswith("E"): + pytest.skip(reason="YE, etc. support was added in pandas 2.2") + + s, e = "2001-01-01", "2008-01-31" + pd.date_range(start=s, end=e, freq=freqstr_unsupported) + with pytest.raises(ValueError, match="does not yet support"): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + # We also check that these values are unsupported when using lowercase + # characters. We exclude the value 3MS (every 3 month starts) because 3ms + # is a valid frequency for every 3 milliseconds. + if freqstr_unsupported != "3MS": + freqstr_unsupported = freqstr_unsupported.lower() + with pytest.raises(ValueError, match="does not yet support"): + with expect_warning_if( + PANDAS_GE_220 and freqstr_unsupported not in {"b", "bh"} + ): + cudf.date_range(start=s, end=e, freq=freqstr_unsupported) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_date_range_start_freq_periods(start, freq, periods): + if isinstance(freq, str): + _gfreq = _pfreq = freq + else: + _gfreq = cudf.DateOffset(**freq) + _pfreq = pd.DateOffset(**freq) + + expect = pd.date_range(start=start, periods=periods, freq=_pfreq, name="a") + got = cudf.date_range(start=start, periods=periods, freq=_gfreq, name="a") + + np.testing.assert_allclose( + expect.to_numpy().astype("int64"), + got.to_pandas().to_numpy().astype("int64"), + ) + + +def test_daterange_pandas_compatibility(): + with cudf.option_context("mode.pandas_compatible", True): + expected = pd.date_range( + "2010-01-01", "2010-02-01", periods=10, name="times" + ) + actual = cudf.date_range( + "2010-01-01", "2010-02-01", periods=10, name="times" + ) + assert_eq(expected, actual) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_date_range_start_end_freq(start, end, freq): + if isinstance(freq, str): + _gfreq = _pfreq = freq + else: + _gfreq = cudf.DateOffset(**freq) + _pfreq = pd.DateOffset(**freq) + + expect = pd.date_range(start=start, end=end, freq=_pfreq, name="a") + got = cudf.date_range(start=start, end=end, freq=_gfreq, name="a") + + np.testing.assert_allclose( + expect.to_numpy().astype("int64"), + got.to_pandas().to_numpy().astype("int64"), + ) diff --git a/python/cudf/cudf/tests/general_functions/test_to_datetime.py b/python/cudf/cudf/tests/general_functions/test_to_datetime.py new file mode 100644 index 00000000000..e8ebff29e54 --- /dev/null +++ b/python/cudf/cudf/tests/general_functions/test_to_datetime.py @@ -0,0 +1,312 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, + expect_warning_if, +) + + +@pytest.mark.parametrize( + "data", + [ + None, + [], + pd.Series([], dtype="float64"), + pd.Index([]), + pd.Series([1, 2, 3]), + pd.Series([0, 1, -1]), + pd.Series([0, 1, -1, 100.3, 200, 47637289]), + pd.Series(["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"]), + [1, 2, 3, 100, -123, -1, 0, 1000000000000679367], + pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}), + pd.DataFrame( + {"year": ["2015", "2016"], "month": ["2", "3"], "day": [4, 5]} + ), + pd.DataFrame( + { + "year": [2015, 2016], + "month": [2, 3], + "day": [4, 5], + "minute": [1, 100], + "second": [90, 10], + "hour": [1, 0.5], + }, + index=["a", "b"], + ), + pd.DataFrame( + { + "year": [], + "month": [], + "day": [], + "minute": [], + "second": [], + "hour": [], + }, + ), + ["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], + pd.Index([1, 2, 3, 4]), + pd.DatetimeIndex( + ["1970-01-01 00:00:00.000000001", "1970-01-01 00:00:00.000000002"], + dtype="datetime64[ns]", + freq=None, + ), + pd.DatetimeIndex( + [], + dtype="datetime64[ns]", + freq=None, + ), + pd.Series([1, 2, 3]).astype("datetime64[ns]"), + pd.Series([1, 2, 3]).astype("datetime64[us]"), + pd.Series([1, 2, 3]).astype("datetime64[ms]"), + pd.Series([1, 2, 3]).astype("datetime64[s]"), + pd.Series([1, 2, 3]).astype("datetime64[D]"), + 1, + 100, + 17, + 53.638435454, + np.array([1, 10, 15, 478925, 2327623467]), + np.array([0.3474673, -10, 15, 478925.34345, 2327623467]), + ], +) +@pytest.mark.parametrize("dayfirst", [True, False]) +def test_cudf_to_datetime(data, dayfirst): + pd_data = data + if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): + gd_data = cudf.from_pandas(pd_data) + else: + if type(pd_data).__module__ == np.__name__: + gd_data = cp.array(pd_data) + else: + gd_data = pd_data + + expected = pd.to_datetime(pd_data, dayfirst=dayfirst) + actual = cudf.to_datetime(gd_data, dayfirst=dayfirst) + + if isinstance(expected, pd.Series): + assert_eq(actual, expected, check_dtype=False) + else: + assert_eq(actual, expected, check_exact=False) + + +@pytest.mark.parametrize( + "data", + [ + "2", + ["1", "2", "3"], + ["1/1/1", "2/2/2", "1"], + pd.Series([1, 2, 3], dtype="timedelta64[ns]"), + pd.DataFrame( + { + "year": [2015, 2016], + "month": [2, 3], + "day": [4, 5], + "minute": [1, 100], + "second": [90, 10], + "hour": [1, 0], + "blablacol": [1, 1], + } + ), + pd.DataFrame( + { + "month": [2, 3], + "day": [4, 5], + "minute": [1, 100], + "second": [90, 10], + "hour": [1, 0], + } + ), + ], +) +@pytest.mark.filterwarnings("ignore:Could not infer format:UserWarning") +def test_to_datetime_errors(data): + pd_data = data + if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): + gd_data = cudf.from_pandas(pd_data) + else: + gd_data = pd_data + + assert_exceptions_equal( + pd.to_datetime, + cudf.to_datetime, + ([pd_data],), + ([gd_data],), + ) + + +def test_to_datetime_not_implemented(): + with pytest.raises(NotImplementedError): + cudf.to_datetime([], exact=False) + + with pytest.raises(NotImplementedError): + cudf.to_datetime([], origin="julian") + + with pytest.raises(NotImplementedError): + cudf.to_datetime([], yearfirst=True) + + +@pytest.mark.parametrize( + "data", + [ + 1, + [], + pd.Series([], dtype="float64"), + pd.Index([]), + pd.Series([1, 2, 3]), + pd.Series([1, 2.4, 3]), + pd.Series([0, 1, -1]), + 
pd.Series([0, 1, -1, 100, 200, 47637]), + [10, 12, 1200, 15003], + pd.DatetimeIndex( + [], + dtype="datetime64[ns]", + freq=None, + ), + pd.Index([1, 2, 3, 4]), + ], +) +@pytest.mark.parametrize("unit", ["D", "s", "ms", "us", "ns"]) +def test_to_datetime_units(data, unit): + pd_data = data + if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): + gd_data = cudf.from_pandas(pd_data) + else: + gd_data = pd_data + + expected = pd.to_datetime(pd_data, unit=unit) + actual = cudf.to_datetime(gd_data, unit=unit) + + if isinstance(expected, pd.Series): + assert_eq(actual, expected, check_dtype=False) + else: + assert_eq(actual, expected, exact=False, check_exact=False) + + +@pytest.mark.parametrize( + "data,format", + [ + ("2012-10-11", None), + ("2012-10-11", "%Y-%m-%d"), + ("2012-10-11", "%Y-%d-%m"), + (["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], None), + (["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], "%Y-%m-%d"), + (["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], "%Y-%d-%m"), + (["10-11-2012", "01-01-2010", "07-07-2016", "02-02-2014"], "%m-%d-%Y"), + (["10-11-2012", "01-01-2010", "07-07-2016", "02-02-2014"], "%d-%m-%Y"), + (["10-11-2012", "01-01-2010", "07-07-2016", "02-02-2014"], None), + (["2012/10/11", "2010/01/01", "2016/07/07", "2014/02/02"], None), + (["2012/10/11", "2010/01/01", "2016/07/07", "2014/02/02"], "%Y/%m/%d"), + (["2012/10/11", "2010/01/01", "2016/07/07", "2014/02/02"], "%Y/%d/%m"), + (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], "%m/%d/%Y"), + (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], "%d/%m/%Y"), + (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], None), + (["2021-04-13 12:30:04.123456789"], "%Y-%m-%d %H:%M:%S.%f"), + (pd.Series([2015, 2020, 2021]), "%Y"), + pytest.param( + pd.Series(["1", "2", "1"]), + "%m", + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/6109" + "https://github.com/pandas-dev/pandas/issues/35934" + ), + ), + pytest.param( + pd.Series(["14", "20", "10"]), + "%d", + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/6109" + "https://github.com/pandas-dev/pandas/issues/35934" + ), + ), + (pd.Series([2015, 2020.0, 2021.2]), "%Y"), + ], +) +@pytest.mark.parametrize("infer_datetime_format", [True, False]) +def test_to_datetime_format(data, format, infer_datetime_format): + pd_data = data + if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): + gd_data = cudf.from_pandas(pd_data) + else: + gd_data = pd_data + + with expect_warning_if(True, UserWarning): + expected = pd.to_datetime( + pd_data, format=format, infer_datetime_format=infer_datetime_format + ) + with expect_warning_if(not infer_datetime_format): + actual = cudf.to_datetime( + gd_data, format=format, infer_datetime_format=infer_datetime_format + ) + + if isinstance(expected, pd.Series): + assert_eq(actual, expected, check_dtype=False) + else: + assert_eq(actual, expected, check_exact=False) + + +def test_to_datetime_data_out_of_range_for_format(): + with pytest.raises(ValueError): + cudf.to_datetime("2015-02-99", format="%Y-%m-%d") + + +def test_to_datetime_different_formats_notimplemented(): + with pytest.raises(NotImplementedError): + cudf.to_datetime(["2015-02-01", "2015-02-01 10:10:10"]) + + +def test_datetime_to_datetime_error(): + assert_exceptions_equal( + lfunc=pd.to_datetime, + rfunc=cudf.to_datetime, + lfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],), + rfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],), + 
check_exception_type=False, + ) + + +@pytest.mark.parametrize("code", ["z", "Z"]) +def test_format_timezone_not_implemented(code): + with pytest.raises(NotImplementedError): + cudf.to_datetime( + ["2020-01-01 00:00:00 UTC"], format=f"%Y-%m-%d %H:%M:%S %{code}" + ) + + +@pytest.mark.parametrize("tz", ["UTC-3", "+01:00"]) +def test_utc_offset_not_implemented(tz): + with pytest.raises((NotImplementedError, ValueError)): + cudf.to_datetime([f"2020-01-01 00:00:00{tz}"]) + + +def test_Z_utc_offset(): + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + cudf.to_datetime(["2020-01-01 00:00:00Z"]) + + result = cudf.to_datetime(["2020-01-01 00:00:00Z"]) + expected = cudf.to_datetime(["2020-01-01 00:00:00"]) + assert_eq(result, expected) + + +@pytest.mark.parametrize("arg", [True, False]) +def test_args_not_datetime_typerror(arg): + with pytest.raises(TypeError): + cudf.to_datetime([arg]) + + +@pytest.mark.parametrize("errors", ["coerce", "ignore"]) +def test_to_datetime_errors_non_scalar_not_implemented(errors): + with pytest.raises(NotImplementedError): + cudf.to_datetime([1, ""], unit="s", errors=errors) + + +def test_to_datetime_errors_ignore_deprecated(): + with pytest.warns(FutureWarning): + cudf.to_datetime("2001-01-01 00:04:45", errors="ignore") diff --git a/python/cudf/cudf/tests/indexes/datetime/test_components.py b/python/cudf/cudf/tests/indexes/datetime/test_components.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/datetime/test_components.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/datetime/test_constructing.py b/python/cudf/cudf/tests/indexes/datetime/test_constructing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/datetime/test_constructing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/datetime/test_conversion.py b/python/cudf/cudf/tests/indexes/datetime/test_conversion.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/datetime/test_conversion.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/methods/__init__.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_isocalendar.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_isocalendar.py new file mode 100644 index 00000000000..e166a66f81c --- /dev/null +++ b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_isocalendar.py @@ -0,0 +1,30 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [], + [None, None], + [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + ], + ["2100-03-14 07:30:00"], + ], +) +def test_isocalendar_index(data): + ps = pd.DatetimeIndex(data, dtype="datetime64[ns]") + gs = cudf.from_pandas(ps) + + expect = ps.isocalendar() + got = gs.isocalendar() + + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_strftime.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_strftime.py new file mode 100644 index 00000000000..2c621f460a6 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_strftime.py @@ -0,0 +1,24 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "method, kwargs", + [ + ["to_pydatetime", {}], + ["to_period", {"freq": "D"}], + ["strftime", {"date_format": "%Y-%m-%d"}], + ], +) +def test_dti_methods(method, kwargs): + pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = getattr(cudf_dti, method)(**kwargs) + expected = getattr(pd_dti, method)(**kwargs) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_convert.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_convert.py new file mode 100644 index 00000000000..1c026224da3 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_convert.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import zoneinfo + +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_tz_convert(): + tz = zoneinfo.ZoneInfo("America/New_York") + pidx = pd.date_range("2023-01-01", periods=3, freq="h") + idx = cudf.from_pandas(pidx) + pidx = pidx.tz_localize("UTC") + idx = idx.tz_localize("UTC") + assert_eq(pidx.tz_convert(tz), idx.tz_convert(tz)) diff --git a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_localize.py similarity index 62% rename from python/cudf/cudf/tests/indexes/datetime/test_time_specific.py rename to python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_localize.py index 7cc629270b1..7c07b11fa97 100644 --- a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py +++ b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_localize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. 
import zoneinfo import pandas as pd @@ -16,15 +16,6 @@ def test_tz_localize(): assert_eq(pidx.tz_localize(tz), idx.tz_localize(tz)) -def test_tz_convert(): - tz = zoneinfo.ZoneInfo("America/New_York") - pidx = pd.date_range("2023-01-01", periods=3, freq="h") - idx = cudf.from_pandas(pidx) - pidx = pidx.tz_localize("UTC") - idx = idx.tz_localize("UTC") - assert_eq(pidx.tz_convert(tz), idx.tz_convert(tz)) - - def test_delocalize_naive(): pidx = pd.date_range("2023-01-01", periods=3, freq="h") idx = cudf.from_pandas(pidx) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py b/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py index 5fb8c2fb647..8d230451886 100644 --- a/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py @@ -1,9 +1,111 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import numpy as np import pandas as pd +import pandas._testing as tm import pytest -from cudf.core.index import DatetimeIndex +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "field", + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + "weekday", + "dayofweek", + "dayofyear", + "day_of_year", + ], +) +def test_dt_index(field): + data = pd.DatetimeIndex( + [pd.Timestamp(2020, 1, 2, 3, 4, 5, 6, nanosecond=7)] + ) + gdf_data = cudf.DatetimeIndex(data) + assert_eq(getattr(gdf_data, field), getattr(data, field), exact=False) + + +@pytest.mark.parametrize( + "attr", + [ + "is_month_start", + "is_month_end", + "is_quarter_end", + "is_quarter_start", + "is_year_end", + "is_year_start", + "days_in_month", + "timetz", + "time", + "date", + ], +) +def test_dti_datetime_attributes(attr): + data = [ + "2020-01-01", + "2020-01-31", + "2020-03-01", + "2020-03-31", + "2020-03-31", + "2020-12-31", + None, + ] + pd_dti = pd.DatetimeIndex(data, name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = getattr(cudf_dti, attr) + expected = getattr(pd_dti, attr) + if isinstance(result, np.ndarray): + # numpy doesn't assert object arrays with NaT correctly + tm.assert_numpy_array_equal(result, expected) + else: + assert_eq(result, expected) + + +@pytest.mark.parametrize("attr", ["freq", "unit"]) +def test_dti_properties(attr): + pd_dti = pd.DatetimeIndex( + ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" + ) + cudf_dti = cudf.DatetimeIndex( + ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" + ) + + result = getattr(cudf_dti, attr) + expected = getattr(pd_dti, attr) + assert result == expected + + +def test_writable_numpy_array(): + gi = cudf.Index([1, 2, 3], dtype="datetime64[ns]") + expected_flags = pd.Index( + [1, 2, 3], dtype="datetime64[ns]" + )._data._ndarray.flags + + actual_flags = gi.to_pandas()._data._ndarray.flags + assert expected_flags.c_contiguous == actual_flags.c_contiguous + assert expected_flags.f_contiguous == actual_flags.f_contiguous + assert expected_flags.writeable == actual_flags.writeable + assert expected_flags.aligned == actual_flags.aligned + assert expected_flags.writebackifcopy == actual_flags.writebackifcopy + + +def test_dti_asi8(): + pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = pd_dti.asi8 + expected = cudf_dti.asi8 + assert_eq(result, expected) @pytest.mark.parametrize( @@ -37,7 +139,7 @@ ], ) def test_datetime_index_is_unique_monotonic(testlist): - index = DatetimeIndex(testlist) + index = cudf.DatetimeIndex(testlist) 
index_pd = pd.DatetimeIndex(testlist) assert index.is_unique == index_pd.is_unique diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/test_constructor.py b/python/cudf/cudf/tests/indexes/datetimeindex/test_constructor.py new file mode 100644 index 00000000000..bae161521a4 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/datetimeindex/test_constructor.py @@ -0,0 +1,87 @@ +# Copyright (c) 2023-2025, NVIDIA CORPORATION. + + +import pandas as pd +import pytest + +import cudf +from cudf.core.index import DatetimeIndex +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +@pytest.mark.parametrize( + "data,dtype,freq", + [ + ([10], "datetime64[ns]", "2ns"), + ([10, 12, 14, 16], "datetime64[ns]", "2ns"), + ([10, 11, 12, 13], "datetime64[ns]", "1ns"), + ([100, 200, 300, 400], "datetime64[s]", "100s"), + ([101, 201, 301, 401], "datetime64[ms]", "100ms"), + ], +) +def test_datetime_index_with_freq(data, dtype, freq): + actual = cudf.DatetimeIndex(data, dtype=dtype, freq=freq) + expected = pd.DatetimeIndex(data, dtype=dtype, freq=freq) + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "data,dtype,freq", + [ + ([10, 1232, 13244, 13426], "datetime64[ns]", "2ns"), + ([10, 11, 12, 13], "datetime64[ns]", "1s"), + ([10000, 200, 300, 400], "datetime64[s]", "100s"), + ([107871, 201, 301, 401], "datetime64[ms]", "100ns"), + ], +) +def test_datetime_index_freq_error(data, dtype, freq): + assert_exceptions_equal( + pd.DatetimeIndex, + cudf.DatetimeIndex, + ([data], {"dtype": dtype, "freq": freq}), + ([data], {"dtype": dtype, "freq": freq}), + ) + + +def test_strings_with_utc_offset_not_implemented(): + with pytest.raises(NotImplementedError): + DatetimeIndex(["2022-07-22 00:00:00+02:00"]) + + +def test_dateimeindex_from_noniso_string(): + data = ["20160920", "20160925"] + gdti = cudf.DatetimeIndex(data) + pdti = pd.DatetimeIndex(data) + + assert_eq(gdti, pdti) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-01-31", + "2020-02-15", + "2020-02-29", + "2020-03-15", + "2020-03-31", + "2020-04-15", + "2020-04-30", + ], + [43534, 43543, 37897, 2000], + ], +) +@pytest.mark.parametrize("dtype", [None, "datetime64[ns]"]) +def test_datetime_constructor(data, dtype): + expected = pd.DatetimeIndex(data=data, dtype=dtype) + actual = cudf.DatetimeIndex(data=data, dtype=dtype) + + assert_eq(expected, actual) + + expected = pd.DatetimeIndex(data=pd.Series(data), dtype=dtype) + actual = cudf.DatetimeIndex(data=cudf.Series(data), dtype=dtype) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py b/python/cudf/cudf/tests/indexes/datetimeindex/test_getitem.py similarity index 100% rename from python/cudf/cudf/tests/indexes/datetime/test_indexing.py rename to python/cudf/cudf/tests/indexes/datetimeindex/test_getitem.py diff --git a/python/cudf/cudf/tests/series/accessors/test_dt.py b/python/cudf/cudf/tests/series/accessors/test_dt.py index 40b604bc043..401f89aed65 100644 --- a/python/cudf/cudf/tests/series/accessors/test_dt.py +++ b/python/cudf/cudf/tests/series/accessors/test_dt.py @@ -2,9 +2,14 @@ import cupy as cp import numpy as np +import pandas as pd import pytest import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) from cudf.testing import assert_eq @@ -93,3 +98,476 @@ def test_timedelta_series_total_seconds( expected = psr.dt.total_seconds() actual = gsr.dt.total_seconds() assert_eq(expected, actual) + + +@pytest.mark.parametrize("meth", 
["day_name", "month_name"]) +@pytest.mark.parametrize("klass", [cudf.Series, cudf.DatetimeIndex]) +def test_day_month_name_locale_not_implemented(meth, klass): + obj = klass(cudf.date_range("2020-01-01", periods=7)) + if klass is cudf.Series: + obj = obj.dt + with pytest.raises(NotImplementedError): + getattr(obj, meth)(locale="pt_BR.utf8") + + +@pytest.mark.parametrize("meth", ["day_name", "month_name"]) +@pytest.mark.parametrize("klass", [pd.Series, pd.DatetimeIndex]) +def test_day_month_name(meth, klass): + data = [ + "2020-05-31 08:00:00", + None, + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + None, + "1900-02-28 07:00:00", + "1800-03-14 07:30:00", + "2100-03-14 07:30:00", + "1970-01-01 00:00:00", + "1969-12-31 12:59:00", + ] + + p_obj = klass(data, dtype="datetime64[s]") + g_obj = cudf.from_pandas(p_obj) + + if klass is pd.Series: + p_obj = p_obj.dt + g_obj = g_obj.dt + + expect = getattr(p_obj, meth)() + got = getattr(g_obj, meth)() + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "freqstr", + [ + "H", + "N", + "T", + "L", + "U", + "S", + ], +) +def test_datetime_ceil_raise_warning(freqstr): + t = cudf.Series( + ["2001-01-01 00:04:45", "2001-01-01 00:04:58", "2001-01-01 00:05:04"], + dtype="datetime64[ns]", + ) + with pytest.warns(FutureWarning): + t.dt.ceil(freqstr) + + +@pytest.mark.parametrize( + "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"] +) +def test_round(datetime_types_as_str, resolution): + data = [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:10", + "2000-12-31 04:00:05", + "1900-02-28 07:00:06", + "1800-03-14 07:30:20", + "2100-03-14 07:30:20", + "1970-01-01 00:00:09", + "1969-12-31 12:59:10", + ] + gs = cudf.Series(data, dtype=datetime_types_as_str) + ps = gs.to_pandas() + + expect = ps.dt.round(resolution) + got = gs.dt.round(resolution) + assert_eq(expect, got) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/52761", +) +@pytest.mark.parametrize( + "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"] +) +def test_floor(datetime_types_as_str, resolution): + data = [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:10", + "2000-12-31 04:00:05", + "1900-02-28 07:00:06", + "1800-03-14 07:30:20", + "2100-03-14 07:30:20", + "1970-01-01 00:00:09", + "1969-12-31 12:59:10", + ] + gs = cudf.Series(data, dtype=datetime_types_as_str) + ps = gs.to_pandas() + + expect = ps.dt.floor(resolution) + got = gs.dt.floor(resolution) + assert_eq(expect, got) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/52761", +) +@pytest.mark.parametrize( + "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"] +) +def test_ceil(datetime_types_as_str, resolution): + data = [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:10", + "2000-12-31 04:00:05", + "1900-02-28 07:00:06", + "1800-03-14 07:30:20", + "2100-03-14 07:30:20", + "1970-01-01 00:00:09", + "1969-12-31 12:59:10", + ] + gs = cudf.Series(data, dtype=datetime_types_as_str) + ps = gs.to_pandas() + + expect = ps.dt.ceil(resolution) + got = gs.dt.ceil(resolution) + assert_eq(expect, got) + + +def test_days_in_months(): + data = [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + ] + ps = pd.Series(data, dtype="datetime64[ns]") + gs = cudf.from_pandas(ps) + + assert_eq(ps.dt.days_in_month, gs.dt.days_in_month) + + +def test_is_month_start(): + data = 
[ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + ] + ps = pd.Series(data, dtype="datetime64[ns]") + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_month_start + got = gs.dt.is_month_start + + assert_eq(expect, got) + + +def test_is_month_end(): + data = [ + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + ] + ps = pd.Series(data, dtype="datetime64[ns]") + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_month_end + got = gs.dt.is_month_end + + assert_eq(expect, got) + + +def test_is_year_start(): + data = [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-01-01", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + "2017-12-30", + "2017-12-31", + "2018-01-01", + ] + ps = pd.Series(data, dtype="datetime64[ns]") + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_year_start + got = gs.dt.is_year_start + + assert_eq(expect, got) + + +def test_is_year_end(): + data = [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-12-31", + "1800-03-14", + "2017-12-30", + "2017-12-31", + "2020-12-31 08:00:00", + None, + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + None, + "1800-12-14 07:30:00", + "2100-12-14 07:30:00", + "2020-05-31", + ] + ps = pd.Series(data, dtype="datetime64[ns]") + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_year_end + got = gs.dt.is_year_end + + assert_eq(expect, got) + + +def test_is_quarter_start(): + data = [ + "2020-05-01", + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-04-1", + "1970-01-01", + "1969-12-11", + "2020-12-31", + ] + ps = pd.Series(data, dtype="datetime64[ns]") + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_quarter_start + got = gs.dt.is_quarter_start + + assert_eq(expect, got) + + +def test_is_quarter_end(): + data = [ + "2020-05-01", + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-04-1", + "1970-01-01", + "1969-12-11", + "2020-12-31", + ] + ps = pd.Series(data, dtype="datetime64[ns]") + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_quarter_end + got = gs.dt.is_quarter_end + + assert_eq(expect, got) + + +def test_is_leap_year(): + data = [ + "2020-05-31 08:00:00", + None, + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + None, + "1900-02-28 07:00:00", + "1800-03-14 07:30:00", + "2100-03-14 07:30:00", + "1970-01-01 00:00:00", + "1969-12-31 12:59:00", + ] + + # Series + ps = pd.Series(data, dtype="datetime64[s]") + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_leap_year + got = gs.dt.is_leap_year + + assert_eq(expect, got) + + # DatetimeIndex + pIndex = pd.DatetimeIndex(data) + gIndex = cudf.from_pandas(pIndex) + + expect2 = pIndex.is_leap_year + got2 = gIndex.is_leap_year + + assert_eq(expect2, got2) + + +def test_quarter(): + data = [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + "1900-02-28 07:00:00", + "1800-03-14 07:30:00", + "2100-03-14 07:30:00", + "1970-01-01 00:00:00", + "1969-12-31 12:59:00", + ] + dtype = "datetime64[s]" + + # Series + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.quarter + got = gs.dt.quarter + + assert_eq(expect, got, check_dtype=False) + + # DatetimeIndex + pIndex = pd.DatetimeIndex(data) + gIndex = 
cudf.from_pandas(pIndex) + + expect2 = pIndex.quarter + got2 = gIndex.quarter + + assert_eq(expect2.values, got2.values) + + +@pytest.mark.parametrize( + "data", + [ + pd.Series([], dtype="datetime64[ns]"), + pd.Series(pd.date_range("2010-01-01", "2010-02-01")), + pd.Series([None, None], dtype="datetime64[ns]"), + pd.Series("2020-05-31 08:00:00", dtype="datetime64[s]"), + pd.Series( + pd.date_range(start="2021-07-25", end="2021-07-30"), + index=["a", "b", "c", "d", "e", "f"], + ), + ], +) +def test_isocalendar_series(data): + ps = data.copy() + gs = cudf.from_pandas(ps) + + expect = ps.dt.isocalendar() + got = gs.dt.isocalendar() + + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "data", [[1, 2, 3, None], [], [100121, 1221312, 321312321, 1232131223]] +) +@pytest.mark.parametrize( + "date_format", + [ + "%d - %m", + "%y/%H", + "%Y", + "%I - %M / %S", + "%f", + "%j", + "%p", + "%w", + "%U", + "%W", + "%G", + "%u", + "%V", + "%b", + "%B", + "%a", + "%A", + "%U_", + "_%b", + "%B*", + "%a ", + "%A1", + ], +) +def test_datetime_strftime(data, datetime_types_as_str, date_format): + gsr = cudf.Series(data, dtype=datetime_types_as_str) + psr = gsr.to_pandas() + + expected = psr.dt.strftime(date_format=date_format) + actual = gsr.dt.strftime(date_format=date_format) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("date_format", ["%c", "%x", "%X"]) +def test_datetime_strftime_not_implemented_formats(date_format): + gsr = cudf.Series([1, 2, 3], dtype="datetime64[ms]") + + with pytest.raises(NotImplementedError): + gsr.dt.strftime(date_format=date_format) + + +@pytest.mark.parametrize( + "data", + [ + pd.date_range("20010101", "20020215", freq="400h", name="times"), + pd.date_range( + "20010101", freq="243434324423423234ns", name="times", periods=10 + ), + ], +) +@pytest.mark.parametrize( + "field", + [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + "nanosecond", + "weekday", + "dayofweek", + "dayofyear", + "day_of_year", + ], +) +def test_dt_series_datetime_fields(data, field): + pd_data = pd.Series(data) + gdf_data = cudf.Series(pd_data) + base = getattr(pd_data.dt, field) + test = getattr(gdf_data.dt, field) + assert_eq(base, test, check_dtype=False) diff --git a/python/cudf/cudf/tests/series/indexing/test_getitem.py b/python/cudf/cudf/tests/series/indexing/test_getitem.py index 568d6761cdd..3ed2ef57d9d 100644 --- a/python/cudf/cudf/tests/series/indexing/test_getitem.py +++ b/python/cudf/cudf/tests/series/indexing/test_getitem.py @@ -145,6 +145,11 @@ def test_struct_getitem(series, expected): assert sr[0] == expected +def test_datetime_getitem_na(): + s = cudf.Series([1, 2, None, 3], dtype="datetime64[ns]") + assert s[2] is cudf.NaT + + def test_timedelta_getitem_na(): s = cudf.Series([1, 2, None, 3], dtype="timedelta64[ns]") assert s[2] is cudf.NaT diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py b/python/cudf/cudf/tests/series/methods/test_astype.py index 8373b173815..30b4fcbdc4e 100644 --- a/python/cudf/cudf/tests/series/methods/test_astype.py +++ b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -314,3 +314,186 @@ def test_timedelta_str_roundtrip(sr_data, sr_dtype, exp_data, exp_dtype): assert_eq(expected_series, actual_series) assert_eq(gsr, actual_series.astype(gsr.dtype)) + + +def test_typecast_from_datetime(numeric_types_as_str): + data = pd.date_range( + "2019-07-16 00:00:00", + "2019-07-16 00:00:01", + freq="5555us", + name="times", + ) + pd_data = pd.Series(data) + np_data = 
np.array(pd_data) + gdf_data = cudf.Series(pd_data) + + np_casted = np_data.astype(numeric_types_as_str) + gdf_casted = gdf_data.astype(numeric_types_as_str) + + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) + + +def test_typecast_from_datetime_to_int64_to_datetime(datetime_types_as_str): + data = pd.date_range( + "2019-07-16 00:00:00", + "2019-07-16 00:00:01", + freq="5555us", + name="times", + ) + pd_data = pd.Series(data) + np_data = np.array(pd_data) + gdf_data = cudf.Series(pd_data) + + np_casted = np_data.astype(np.int64).astype(datetime_types_as_str) + gdf_casted = gdf_data.astype(np.int64).astype(datetime_types_as_str) + + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) + + +def test_typecast_to_different_datetime_resolutions(datetime_types_as_str): + data = pd.date_range( + "2019-07-16 00:00:00", + "2019-07-16 00:00:01", + freq="5555us", + name="times", + ) + pd_data = pd.Series(data) + np_data = np.array(pd_data).astype(datetime_types_as_str) + gdf_series = cudf.Series(pd_data).astype(datetime_types_as_str) + np.testing.assert_equal(np_data, gdf_series.to_numpy()) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2019-07-16 00:00:00.333", + "2019-07-16 00:00:00.666", + "2019-07-16 00:00:00.888", + ], + [ + "2019-07-16 00:00:00.333333", + "2019-07-16 00:00:00.666666", + "2019-07-16 00:00:00.888888", + ], + [ + "2019-07-16 00:00:00.333333333", + "2019-07-16 00:00:00.666666666", + "2019-07-16 00:00:00.888888888", + ], + ], + ids=["ms_data", "us_data", "ns_data"], +) +def test_string_timstamp_typecast_to_different_datetime_resolutions( + data, datetime_types_as_str +): + pd_sr = pd.Series(data) + gdf_sr = cudf.Series.from_pandas(pd_sr) + + expect = pd_sr.values.astype(datetime_types_as_str) + got = gdf_sr.astype(datetime_types_as_str).values_host + + np.testing.assert_equal(expect, got) + + +def test_typecast_to_datetime(numeric_types_as_str, datetime_types_as_str): + data = np.arange(1, 10) + np_data = data.astype(numeric_types_as_str) + gdf_data = cudf.Series(np_data) + + np_casted = np_data.astype(datetime_types_as_str) + gdf_casted = gdf_data.astype(datetime_types_as_str) + + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) + + +def test_typecast_to_from_datetime( + numeric_types_as_str, datetime_types_as_str +): + data = np.arange(1, 10) + np_data = data.astype(numeric_types_as_str) + gdf_data = cudf.Series(np_data) + + np_casted = np_data.astype(datetime_types_as_str).astype( + numeric_types_as_str + ) + gdf_casted = gdf_data.astype(datetime_types_as_str).astype( + numeric_types_as_str + ) + + np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) + + +@pytest.fixture +def datetime_types_as_str2(datetime_types_as_str): + return datetime_types_as_str + + +def test_typecast_from_datetime_to_datetime( + datetime_types_as_str, datetime_types_as_str2 +): + data = np.arange(1, 10) + np_data = data.astype(datetime_types_as_str) + ser = cudf.Series(np_data) + + np_casted = np_data.astype(datetime_types_as_str2) + ser_casted = ser.astype(datetime_types_as_str2) + + np.testing.assert_equal(np_casted, ser_casted.to_numpy()) + + +@pytest.mark.parametrize( + "data", + [ + ["2001-01-01", "2002-02-02", "2000-01-05", "NaT"], + ["2001-01-01", "2002-02-02", "2000-01-05", None], + [None, None, None, None, None], + ], +) +def test_str_null_to_datetime(data, datetime_types_as_str): + psr = pd.Series(data) + gsr = cudf.Series(data) + + assert_eq( + psr.astype(datetime_types_as_str), gsr.astype(datetime_types_as_str) + ) + + +def test_str_to_datetime_error(): + psr 
= pd.Series(["2001-01-01", "2002-02-02", "2000-01-05", "None"]) + gsr = cudf.Series(["2001-01-01", "2002-02-02", "2000-01-05", "None"]) + + assert_exceptions_equal( + lfunc=psr.astype, + rfunc=gsr.astype, + lfunc_args_and_kwargs=(["datetime64[s]"],), + rfunc_args_and_kwargs=(["datetime64[s]"],), + check_exception_type=False, + ) + + +@pytest.mark.parametrize("timezone", ["", "Z"]) +@pytest.mark.parametrize( + "data", + [ + "2002-10-27T04:30", + "2002-10-27T04:30:00", + "2002-10-27T04:30:00.000", + "2002-10-27T04:30:00.000000", + "2002-10-27T04:30:00.000000000", + ], +) +def test_datetime_infer_format(data, timezone, datetime_types_as_str): + ts_data = [data + timezone] + sr = cudf.Series(ts_data) + psr = pd.Series(ts_data) + if not timezone: + expected = psr.astype(datetime_types_as_str) + actual = sr.astype(datetime_types_as_str) + + assert_eq(expected, actual) + else: + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + # pandas doesn't allow parsing "Z" to naive type + sr.astype(datetime_types_as_str) diff --git a/python/cudf/cudf/tests/series/methods/test_fillna.py b/python/cudf/cudf/tests/series/methods/test_fillna.py index 094b27c4fff..5dac20da52c 100644 --- a/python/cudf/cudf/tests/series/methods/test_fillna.py +++ b/python/cudf/cudf/tests/series/methods/test_fillna.py @@ -48,6 +48,45 @@ def test_fillna_categorical_with_different_categories_raises(): ser.fillna(cudf.Series([1, 2]), dtype="category") +@pytest.mark.parametrize( + "data", + [ + [], + [1, 2], + [None, 1], + [None, None], + ], +) +@pytest.mark.parametrize( + "fill_value", + [ + np.datetime64("2005-02"), + np.datetime64("2005-02-25"), + np.datetime64("2005-02-25T03:30"), + np.datetime64("nat"), + "NaT", + ], +) +def test_datetime_fillna(data, datetime_types_as_str, fill_value): + sr = cudf.Series(data, dtype=datetime_types_as_str) + psr = sr.to_pandas() + + expected = psr.dropna() + actual = sr.dropna() + + assert_eq(expected, actual) + + expected = psr.fillna(fill_value) + actual = sr.fillna(fill_value) + + assert_eq(expected, actual) + + expected = expected.dropna() + actual = actual.dropna() + + assert_eq(expected, actual) + + @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/series/methods/test_first_last.py b/python/cudf/cudf/tests/series/methods/test_first_last.py new file mode 100644 index 00000000000..ffee8b06977 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_first_last.py @@ -0,0 +1,126 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_eq + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize( + "idx", + [ + pd.DatetimeIndex([]), + pd.DatetimeIndex(["2010-05-31"]), + pd.date_range("2000-01-01", "2000-12-31", periods=21), + ], +) +@pytest.mark.parametrize( + "offset", + [ + "10Y", + "6M", + "M", + "31D", + "0H", + "44640T", + "44640min", + "2678000S", + "2678000000L", + "2678000000ms", + "2678000000000U", + "2678000000000us", + "2678000000000000N", + "2678000000000000ns", + ], +) +def test_first(idx, offset): + p = pd.Series(range(len(idx)), dtype="int64", index=idx) + g = cudf.from_pandas(p) + + with pytest.warns(FutureWarning): + expect = p.first(offset=offset) + with pytest.warns(FutureWarning): + got = g.first(offset=offset) + + assert_eq(expect, got) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +def test_first_start_at_end_of_month(): + idx = pd.DatetimeIndex( + [ + "2020-01-31", + "2020-02-15", + "2020-02-29", + "2020-03-15", + "2020-03-31", + "2020-04-15", + "2020-04-30", + ] + ) + offset = "3M" + p = pd.Series(range(len(idx)), index=idx) + g = cudf.from_pandas(p) + + with pytest.warns(FutureWarning): + expect = p.first(offset=offset) + with pytest.warns(FutureWarning): + got = g.first(offset=offset) + + assert_eq(expect, got) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize( + "idx", + [ + pd.DatetimeIndex([]), + pd.DatetimeIndex(["2010-05-31"]), + pd.date_range("2000-01-01", "2000-12-31", periods=21), + ], +) +@pytest.mark.parametrize( + "offset", + [ + "10Y", + "6M", + "M", + "31D", + "0H", + "44640T", + "44640min", + "2678000S", + "2678000000L", + "2678000000ms", + "2678000000000U", + "2678000000000us", + "2678000000000000N", + "2678000000000000ns", + ], +) +def test_last(idx, offset): + p = pd.Series(range(len(idx)), dtype="int64", index=idx) + g = cudf.from_pandas(p) + + with pytest.warns(FutureWarning): + expect = p.last(offset=offset) + with pytest.warns(FutureWarning): + got = g.last(offset=offset) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/series/methods/test_nunique.py b/python/cudf/cudf/tests/series/methods/test_nunique.py index 19ff910e316..c645fa401f4 100644 --- a/python/cudf/cudf/tests/series/methods/test_nunique.py +++ b/python/cudf/cudf/tests/series/methods/test_nunique.py @@ -1,6 +1,9 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np import pandas as pd +import pytest import cudf from cudf.testing import assert_eq @@ -23,3 +26,27 @@ def test_series_nunique(): expected = pd_s.nunique() assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + pd.Series([], dtype="datetime64[ns]"), + pd.Series(pd.date_range("2010-01-01", "2010-02-01")), + pd.Series([None, None], dtype="datetime64[ns]"), + ], +) +@pytest.mark.parametrize("nulls", ["none", "some"]) +def test_datetime_nunique(data, nulls): + psr = data.copy() + rng = np.random.default_rng(seed=0) + + if len(data) > 0: + if nulls == "some": + p = rng.integers(0, len(data), 2) + psr[p] = None + + gsr = cudf.from_pandas(psr) + expected = psr.nunique() + got = gsr.nunique() + assert_eq(got, expected) diff --git a/python/cudf/cudf/tests/series/methods/test_query.py b/python/cudf/cudf/tests/series/methods/test_query.py new file mode 100644 index 00000000000..b6b69cf8ebb --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_query.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import datetime + +import numpy as np +import pandas as pd + +from cudf import DataFrame +from cudf.testing import assert_eq + + +def test_issue_165(): + df_pandas = pd.DataFrame() + start_date = datetime.datetime.strptime("2000-10-21", "%Y-%m-%d") + data = [(start_date + datetime.timedelta(days=x)) for x in range(6)] + df_pandas["dates"] = data + df_pandas["num"] = [1, 2, 3, 4, 5, 6] + df_cudf = DataFrame.from_pandas(df_pandas) + + base = df_pandas.query("dates==@start_date") + test = df_cudf.query("dates==@start_date") + assert_eq(base, test) + assert len(test) > 0 + + mask = df_cudf.dates == start_date + base_mask = df_pandas.dates == start_date + assert_eq(mask, base_mask, check_names=False) + assert mask.to_pandas().sum() > 0 + + start_date_ts = pd.Timestamp(start_date) + test = df_cudf.query("dates==@start_date_ts") + base = df_pandas.query("dates==@start_date_ts") + assert_eq(base, test) + assert len(test) > 0 + + mask = df_cudf.dates == start_date_ts + base_mask = df_pandas.dates == start_date_ts + assert_eq(mask, base_mask, check_names=False) + assert mask.to_pandas().sum() > 0 + + start_date_np = np.datetime64(start_date_ts, "ns") + test = df_cudf.query("dates==@start_date_np") + base = df_pandas.query("dates==@start_date_np") + assert_eq(base, test) + assert len(test) > 0 + + mask = df_cudf.dates == start_date_np + base_mask = df_pandas.dates == start_date_np + assert_eq(mask, base_mask, check_names=False) + assert mask.to_pandas().sum() > 0 diff --git a/python/cudf/cudf/tests/series/methods/test_to_pandas.py b/python/cudf/cudf/tests/series/methods/test_to_pandas.py index bc78a8c7871..1768d6ccc0e 100644 --- a/python/cudf/cudf/tests/series/methods/test_to_pandas.py +++ b/python/cudf/cudf/tests/series/methods/test_to_pandas.py @@ -195,3 +195,21 @@ def test_writable_numpy_array_timedelta(): assert expected_flags.writeable == actual_flags.writeable assert expected_flags.aligned == actual_flags.aligned assert expected_flags.writebackifcopy == actual_flags.writebackifcopy + + +@pytest.mark.parametrize("nulls", ["some", "all"]) +def test_to_from_pandas_nulls(nulls): + data = np.arange(1, 10) + pd_data = pd.Series(data.astype("datetime64[ns]")) + if nulls == "some": + # Fill half the values with NaT + pd_data[list(range(0, len(pd_data), 2))] = np.datetime64("nat", "ns") + elif nulls == "all": + # Fill all the values with NaT + pd_data[:] = np.datetime64("nat", "ns") + gdf_data = cudf.Series.from_pandas(pd_data) + + expect = pd_data + got = 
gdf_data.to_pandas() + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/series/methods/test_unique.py b/python/cudf/cudf/tests/series/methods/test_unique.py new file mode 100644 index 00000000000..2aa4b78b039 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_unique.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + pd.Series([], dtype="datetime64[ns]"), + pd.Series(pd.date_range("2010-01-01", "2010-02-01")), + pd.Series([None, None], dtype="datetime64[ns]"), + ], +) +@pytest.mark.parametrize("nulls", ["none", "some"]) +def test_datetime_unique(data, nulls): + rng = np.random.default_rng(seed=0) + psr = data.copy() + + if len(data) > 0: + if nulls == "some": + p = rng.integers(0, len(data), 2) + psr[p] = None + + gsr = cudf.from_pandas(psr) + expected = psr.unique() + got = gsr.unique() + + # Unique does not provide a guarantee on ordering. + assert_eq( + pd.Series(expected).sort_values(ignore_index=True), + got.sort_values(ignore_index=True).to_pandas(), + ) diff --git a/python/cudf/cudf/tests/series/test_attributes.py b/python/cudf/cudf/tests/series/test_attributes.py index aedbf696009..0e99893f530 100644 --- a/python/cudf/cudf/tests/series/test_attributes.py +++ b/python/cudf/cudf/tests/series/test_attributes.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -241,3 +242,54 @@ def test_timedelta_contains(data, timedelta_types_as_str, scalar): actual = scalar in psr assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data, expected", + [ + [["2018-01-01", None, "2019-01-31", None, "2018-01-01"], True], + [ + [ + "2018-01-01", + "2018-01-02", + "2019-01-31", + "2018-03-01", + "2018-01-01", + ], + False, + ], + [ + np.array( + ["2018-01-01", None, "2019-12-30"], dtype="datetime64[ms]" + ), + True, + ], + ], +) +def test_datetime_has_null_test(data, expected): + data = cudf.Series(data, dtype="datetime64[ms]") + pd_data = data.to_pandas() + count = pd_data.notna().value_counts() + expected_count = 0 + if False in count.keys(): + expected_count = count[False] + + assert expected is data.has_nulls + assert expected_count == data.null_count + + +def test_datetime_has_null_test_pyarrow(): + data = cudf.Series( + pa.array( + [0, np.iinfo("int64").min, np.iinfo("int64").max, None], + type=pa.timestamp("ns"), + ) + ) + assert data.has_nulls is True + assert data.null_count == 1 + + +def test_error_values_datetime(): + s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + with pytest.raises(NotImplementedError, match="cupy does not support"): + s.values diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index d5a9ea50317..e7bdf6d415f 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -1,4 +1,5 @@ # Copyright (c) 2023-2025, NVIDIA CORPORATION. 
+import datetime import decimal import cupy as cp @@ -8,6 +9,10 @@ import pytest import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) from cudf.core.column.column import as_column from cudf.errors import MixedTypeError from cudf.testing import assert_eq @@ -757,3 +762,97 @@ def test_create_struct_series(data): expect = pd.Series(data) got = cudf.Series(data) assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + pd.date_range("20010101", "20020215", freq="400h", name="times"), + pd.date_range( + "20010101", freq="243434324423423234ns", name="times", periods=10 + ), + ], +) +def test_series_from_pandas_datetime_index(data): + pd_data = pd.Series(data) + gdf_data = cudf.Series(pd_data) + assert_eq(pd_data, gdf_data) + + +@pytest.mark.parametrize( + "dtype", + ["datetime64[D]", "datetime64[W]", "datetime64[M]", "datetime64[Y]"], +) +def test_datetime_array_timeunit_cast(dtype): + testdata = np.array( + [ + np.datetime64("2016-11-20"), + np.datetime64("2020-11-20"), + np.datetime64("2019-11-20"), + np.datetime64("1918-11-20"), + np.datetime64("2118-11-20"), + ], + dtype=dtype, + ) + + gs = cudf.Series(testdata) + ps = pd.Series(testdata) + + assert_eq(ps, gs) + + gdf = cudf.DataFrame() + gdf["a"] = np.arange(5) + gdf["b"] = testdata + + pdf = pd.DataFrame() + pdf["a"] = np.arange(5) + pdf["b"] = testdata + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize("timeunit", ["D", "W", "M", "Y"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_datetime_scalar_timeunit_cast(timeunit): + testscalar = np.datetime64("2016-11-20", timeunit) + + gs = cudf.Series(testscalar) + ps = pd.Series(testscalar) + + assert_eq(ps, gs, check_dtype=False) + + gdf = cudf.DataFrame() + gdf["a"] = np.arange(5) + gdf["b"] = testscalar + + pdf = pd.DataFrame() + pdf["a"] = np.arange(5) + pdf["b"] = testscalar + + assert gdf["b"].dtype == np.dtype("datetime64[s]") + assert_eq(pdf, gdf, check_dtype=True) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_datetime_string_to_datetime_resolution_loss_raises(): + data = ["2020-01-01 00:00:00.00001"] + dtype = "datetime64[s]" + with pytest.raises(ValueError): + cudf.Series(data, dtype=dtype) + with pytest.raises(ValueError): + pd.Series(data, dtype=dtype) + + +def test_timezone_pyarrow_array(): + pa_array = pa.array( + [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)], + type=pa.timestamp("ns", "UTC"), + ) + result = cudf.Series(pa_array) + expected = pa_array.to_pandas() + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 94fca7a2a6b..1bc6fe19d02 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2,29 +2,22 @@ import datetime import operator -import warnings import cupy as cp import numpy as np import pandas as pd -import pandas._testing as tm -import pyarrow as pa import pytest import cudf -import cudf.testing.dataset_generator as dataset_generator -from cudf import DataFrame, Series +from cudf import Series from cudf.core._compat import ( PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, PANDAS_GE_230, PANDAS_VERSION, ) -from cudf.core.index import DatetimeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( DATETIME_TYPES, - NUMERIC_TYPES, assert_exceptions_equal, 
expect_warning_if, ) @@ -56,32 +49,6 @@ def data(request): return request.param -@pytest.fixture( - params=[ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - "weekday", - "dayofweek", - "dayofyear", - "day_of_year", - ] -) -def field(request): - return request.param - - -def test_series(data): - pd_data = pd.Series(data) - gdf_data = Series(pd_data) - assert_eq(pd_data, gdf_data) - - @pytest.mark.parametrize( "lhs_dtype", ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], @@ -157,633 +124,6 @@ def test_dt_ops(data): assert_eq(pd_data > pd_data, gdf_data > gdf_data) -# libcudf doesn't respect timezones -def test_dt_series(data, field): - pd_data = pd.Series(data) - gdf_data = Series(pd_data) - base = getattr(pd_data.dt, field) - test = getattr(gdf_data.dt, field) - assert_eq(base, test, check_dtype=False) - - -def test_dt_index(data, field): - gdf_data = DatetimeIndex(data) - assert_eq(getattr(gdf_data, field), getattr(data, field), exact=False) - - -def test_setitem_datetime(): - df = DataFrame() - df["date"] = pd.date_range("20010101", "20010105").values - assert df.date.dtype.kind == "M" - - -def test_sort_datetime(): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "date": np.array( - [ - np.datetime64("2016-11-20"), - np.datetime64("2020-11-20"), - np.datetime64("2019-11-20"), - np.datetime64("1918-11-20"), - np.datetime64("2118-11-20"), - ] - ), - "vals": rng.random(5), - } - ) - - gdf = cudf.from_pandas(df) - - s_df = df.sort_values(by="date") - s_gdf = gdf.sort_values(by="date") - - assert_eq(s_df, s_gdf) - - -def test_issue_165(): - df_pandas = pd.DataFrame() - start_date = datetime.datetime.strptime("2000-10-21", "%Y-%m-%d") - data = [(start_date + datetime.timedelta(days=x)) for x in range(6)] - df_pandas["dates"] = data - df_pandas["num"] = [1, 2, 3, 4, 5, 6] - df_cudf = DataFrame.from_pandas(df_pandas) - - base = df_pandas.query("dates==@start_date") - test = df_cudf.query("dates==@start_date") - assert_eq(base, test) - assert len(test) > 0 - - mask = df_cudf.dates == start_date - base_mask = df_pandas.dates == start_date - assert_eq(mask, base_mask, check_names=False) - assert mask.to_pandas().sum() > 0 - - start_date_ts = pd.Timestamp(start_date) - test = df_cudf.query("dates==@start_date_ts") - base = df_pandas.query("dates==@start_date_ts") - assert_eq(base, test) - assert len(test) > 0 - - mask = df_cudf.dates == start_date_ts - base_mask = df_pandas.dates == start_date_ts - assert_eq(mask, base_mask, check_names=False) - assert mask.to_pandas().sum() > 0 - - start_date_np = np.datetime64(start_date_ts, "ns") - test = df_cudf.query("dates==@start_date_np") - base = df_pandas.query("dates==@start_date_np") - assert_eq(base, test) - assert len(test) > 0 - - mask = df_cudf.dates == start_date_np - base_mask = df_pandas.dates == start_date_np - assert_eq(mask, base_mask, check_names=False) - assert mask.to_pandas().sum() > 0 - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -def test_typecast_from_datetime(data, dtype): - pd_data = pd.Series(data) - np_data = np.array(pd_data) - gdf_data = Series(pd_data) - - np_casted = np_data.astype(dtype) - gdf_casted = gdf_data.astype(dtype) - - np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) - - -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_from_datetime_to_int64_to_datetime(data, dtype): - pd_data = pd.Series(data) - np_data = np.array(pd_data) - 
gdf_data = Series(pd_data) - - np_casted = np_data.astype(np.int64).astype(dtype) - gdf_casted = gdf_data.astype(np.int64).astype(dtype) - - np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) - - -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_to_different_datetime_resolutions(data, dtype): - data = pd.date_range( - "2019-07-16 00:00:00", - "2019-07-16 00:00:01", - freq="5555us", - name="times", - ) - pd_data = pd.Series(data) - np_data = np.array(pd_data).astype(dtype) - gdf_series = Series(pd_data).astype(dtype) - np.testing.assert_equal(np_data, gdf_series.to_numpy()) - - -@pytest.mark.parametrize( - "data", - [ - [ - "2019-07-16 00:00:00.333", - "2019-07-16 00:00:00.666", - "2019-07-16 00:00:00.888", - ], - [ - "2019-07-16 00:00:00.333333", - "2019-07-16 00:00:00.666666", - "2019-07-16 00:00:00.888888", - ], - [ - "2019-07-16 00:00:00.333333333", - "2019-07-16 00:00:00.666666666", - "2019-07-16 00:00:00.888888888", - ], - ], - ids=["ms_data", "us_data", "ns_data"], -) -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_string_timstamp_typecast_to_different_datetime_resolutions( - data, dtype -): - pd_sr = pd.Series(data) - gdf_sr = cudf.Series.from_pandas(pd_sr) - - expect = pd_sr.values.astype(dtype) - got = gdf_sr.astype(dtype).values_host - - np.testing.assert_equal(expect, got) - - -@pytest.mark.parametrize("from_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize( - "to_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_to_datetime(from_dtype, to_dtype): - data = np.arange(1, 10) - np_data = data.astype(from_dtype) - gdf_data = Series(np_data) - - np_casted = np_data.astype(to_dtype) - gdf_casted = gdf_data.astype(to_dtype) - - np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) - - -@pytest.mark.parametrize("from_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize( - "to_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_to_from_datetime(from_dtype, to_dtype): - data = np.arange(1, 10) - np_data = data.astype(from_dtype) - gdf_data = Series(np_data) - - np_casted = np_data.astype(to_dtype).astype(from_dtype) - gdf_casted = gdf_data.astype(to_dtype).astype(from_dtype) - - np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) - - -@pytest.mark.parametrize( - "from_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -@pytest.mark.parametrize( - "to_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_from_datetime_to_datetime(from_dtype, to_dtype): - data = np.arange(1, 10) - np_data = data.astype(from_dtype) - ser = Series(np_data) - - np_casted = np_data.astype(to_dtype) - ser_casted = ser.astype(to_dtype) - - np.testing.assert_equal(np_casted, ser_casted.to_numpy()) - - -@pytest.mark.parametrize("nulls", ["some", "all"]) -def test_to_from_pandas_nulls(data, nulls): - data = np.arange(1, 10) - pd_data = pd.Series(data.astype("datetime64[ns]")) - if nulls == "some": - # Fill half the values with NaT - pd_data[list(range(0, len(pd_data), 2))] = np.datetime64("nat", "ns") - elif nulls == "all": - # Fill all the values with NaT - pd_data[:] = np.datetime64("nat", "ns") - gdf_data = Series.from_pandas(pd_data) - - expect = pd_data - got = gdf_data.to_pandas() - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "dtype", 
- ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_datetime_to_arrow(dtype): - timestamp = ( - cudf.datasets.timeseries( - start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={} - ) - .reset_index()["timestamp"] - .reset_index(drop=True) - ) - gdf = DataFrame({"timestamp": timestamp.astype(dtype)}) - assert_eq(gdf, DataFrame.from_arrow(gdf.to_arrow(preserve_index=False))) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([], dtype="datetime64[ns]"), - pd.Series(pd.date_range("2010-01-01", "2010-02-01")), - pd.Series([None, None], dtype="datetime64[ns]"), - ], -) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_datetime_unique(data, nulls): - rng = np.random.default_rng(seed=0) - psr = data.copy() - - if len(data) > 0: - if nulls == "some": - p = rng.integers(0, len(data), 2) - psr[p] = None - - gsr = cudf.from_pandas(psr) - expected = psr.unique() - got = gsr.unique() - - # Unique does not provide a guarantee on ordering. - assert_eq( - pd.Series(expected).sort_values(ignore_index=True), - got.sort_values(ignore_index=True).to_pandas(), - ) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([], dtype="datetime64[ns]"), - pd.Series(pd.date_range("2010-01-01", "2010-02-01")), - pd.Series([None, None], dtype="datetime64[ns]"), - ], -) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_datetime_nunique(data, nulls): - psr = data.copy() - rng = np.random.default_rng(seed=0) - - if len(data) > 0: - if nulls == "some": - p = rng.integers(0, len(data), 2) - psr[p] = None - - gsr = cudf.from_pandas(psr) - expected = psr.nunique() - got = gsr.nunique() - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data, expected", - [ - [["2018-01-01", None, "2019-01-31", None, "2018-01-01"], True], - [ - [ - "2018-01-01", - "2018-01-02", - "2019-01-31", - "2018-03-01", - "2018-01-01", - ], - False, - ], - [ - np.array( - ["2018-01-01", None, "2019-12-30"], dtype="datetime64[ms]" - ), - True, - ], - ], -) -def test_datetime_has_null_test(data, expected): - data = Series(data, dtype="datetime64[ms]") - pd_data = data.to_pandas() - count = pd_data.notna().value_counts() - expected_count = 0 - if False in count.keys(): - expected_count = count[False] - - assert_eq(expected, data.has_nulls) - assert_eq(expected_count, data.null_count) - - -def test_datetime_has_null_test_pyarrow(): - data = Series( - pa.array( - [0, np.iinfo("int64").min, np.iinfo("int64").max, None], - type=pa.timestamp("ns"), - ) - ) - expected = True - expected_count = 1 - - assert_eq(expected, data.has_nulls) - assert_eq(expected_count, data.null_count) - - -def test_datetime_dataframe(): - data = { - "timearray": np.array( - [0, 1, None, 2, 20, None, 897], dtype="datetime64[ms]" - ) - } - gdf = cudf.DataFrame(data) - pdf = pd.DataFrame(data) - - assert_eq(pdf, gdf) - - assert_eq(pdf.dropna(), gdf.dropna()) - - assert_eq(pdf.isnull(), gdf.isnull()) - - data = np.array([0, 1, None, 2, 20, None, 897], dtype="datetime64[ms]") - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(ps, gs) - - assert_eq(ps.dropna(), gs.dropna()) - - assert_eq(ps.isnull(), gs.isnull()) - - -@pytest.mark.parametrize( - "data", - [ - None, - [], - pd.Series([], dtype="float64"), - pd.Index([]), - pd.Series([1, 2, 3]), - pd.Series([0, 1, -1]), - pd.Series([0, 1, -1, 100.3, 200, 47637289]), - pd.Series(["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"]), - [1, 2, 3, 100, -123, -1, 0, 1000000000000679367], - pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": 
[4, 5]}), - pd.DataFrame( - {"year": ["2015", "2016"], "month": ["2", "3"], "day": [4, 5]} - ), - pd.DataFrame( - { - "year": [2015, 2016], - "month": [2, 3], - "day": [4, 5], - "minute": [1, 100], - "second": [90, 10], - "hour": [1, 0.5], - }, - index=["a", "b"], - ), - pd.DataFrame( - { - "year": [], - "month": [], - "day": [], - "minute": [], - "second": [], - "hour": [], - }, - ), - ["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], - pd.Index([1, 2, 3, 4]), - pd.DatetimeIndex( - ["1970-01-01 00:00:00.000000001", "1970-01-01 00:00:00.000000002"], - dtype="datetime64[ns]", - freq=None, - ), - pd.DatetimeIndex( - [], - dtype="datetime64[ns]", - freq=None, - ), - pd.Series([1, 2, 3]).astype("datetime64[ns]"), - pd.Series([1, 2, 3]).astype("datetime64[us]"), - pd.Series([1, 2, 3]).astype("datetime64[ms]"), - pd.Series([1, 2, 3]).astype("datetime64[s]"), - pd.Series([1, 2, 3]).astype("datetime64[D]"), - 1, - 100, - 17, - 53.638435454, - np.array([1, 10, 15, 478925, 2327623467]), - np.array([0.3474673, -10, 15, 478925.34345, 2327623467]), - ], -) -@pytest.mark.parametrize("dayfirst", [True, False]) -def test_cudf_to_datetime(data, dayfirst): - pd_data = data - if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): - gd_data = cudf.from_pandas(pd_data) - else: - if type(pd_data).__module__ == np.__name__: - gd_data = cp.array(pd_data) - else: - gd_data = pd_data - - expected = pd.to_datetime(pd_data, dayfirst=dayfirst) - actual = cudf.to_datetime(gd_data, dayfirst=dayfirst) - - if isinstance(expected, pd.Series): - assert_eq(actual, expected, check_dtype=False) - else: - assert_eq(actual, expected, check_exact=False) - - -@pytest.mark.parametrize( - "data", - [ - "2", - ["1", "2", "3"], - ["1/1/1", "2/2/2", "1"], - pd.Series([1, 2, 3], dtype="timedelta64[ns]"), - pd.DataFrame( - { - "year": [2015, 2016], - "month": [2, 3], - "day": [4, 5], - "minute": [1, 100], - "second": [90, 10], - "hour": [1, 0], - "blablacol": [1, 1], - } - ), - pd.DataFrame( - { - "month": [2, 3], - "day": [4, 5], - "minute": [1, 100], - "second": [90, 10], - "hour": [1, 0], - } - ), - ], -) -def test_to_datetime_errors(data): - pd_data = data - if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): - gd_data = cudf.from_pandas(pd_data) - else: - gd_data = pd_data - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - assert_exceptions_equal( - pd.to_datetime, - cudf.to_datetime, - ([pd_data],), - ([gd_data],), - ) - - -def test_to_datetime_not_implemented(): - with pytest.raises(NotImplementedError): - cudf.to_datetime([], exact=False) - - with pytest.raises(NotImplementedError): - cudf.to_datetime([], origin="julian") - - with pytest.raises(NotImplementedError): - cudf.to_datetime([], yearfirst=True) - - -@pytest.mark.parametrize( - "data", - [ - 1, - [], - pd.Series([], dtype="float64"), - pd.Index([]), - pd.Series([1, 2, 3]), - pd.Series([1, 2.4, 3]), - pd.Series([0, 1, -1]), - pd.Series([0, 1, -1, 100, 200, 47637]), - [10, 12, 1200, 15003], - pd.DatetimeIndex( - [], - dtype="datetime64[ns]", - freq=None, - ), - pd.Index([1, 2, 3, 4]), - ], -) -@pytest.mark.parametrize("unit", ["D", "s", "ms", "us", "ns"]) -def test_to_datetime_units(data, unit): - pd_data = data - if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): - gd_data = cudf.from_pandas(pd_data) - else: - gd_data = pd_data - - expected = pd.to_datetime(pd_data, unit=unit) - actual = cudf.to_datetime(gd_data, unit=unit) - - if isinstance(expected, pd.Series): - assert_eq(actual, expected, check_dtype=False) - else: 
- assert_eq(actual, expected, exact=False, check_exact=False) - - -@pytest.mark.parametrize( - "data,format", - [ - ("2012-10-11", None), - ("2012-10-11", "%Y-%m-%d"), - ("2012-10-11", "%Y-%d-%m"), - (["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], None), - (["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], "%Y-%m-%d"), - (["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], "%Y-%d-%m"), - (["10-11-2012", "01-01-2010", "07-07-2016", "02-02-2014"], "%m-%d-%Y"), - (["10-11-2012", "01-01-2010", "07-07-2016", "02-02-2014"], "%d-%m-%Y"), - (["10-11-2012", "01-01-2010", "07-07-2016", "02-02-2014"], None), - (["2012/10/11", "2010/01/01", "2016/07/07", "2014/02/02"], None), - (["2012/10/11", "2010/01/01", "2016/07/07", "2014/02/02"], "%Y/%m/%d"), - (["2012/10/11", "2010/01/01", "2016/07/07", "2014/02/02"], "%Y/%d/%m"), - (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], "%m/%d/%Y"), - (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], "%d/%m/%Y"), - (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], None), - (["2021-04-13 12:30:04.123456789"], "%Y-%m-%d %H:%M:%S.%f"), - (pd.Series([2015, 2020, 2021]), "%Y"), - pytest.param( - pd.Series(["1", "2", "1"]), - "%m", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/6109" - "https://github.com/pandas-dev/pandas/issues/35934" - ), - ), - pytest.param( - pd.Series(["14", "20", "10"]), - "%d", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/6109" - "https://github.com/pandas-dev/pandas/issues/35934" - ), - ), - (pd.Series([2015, 2020.0, 2021.2]), "%Y"), - ], -) -@pytest.mark.parametrize("infer_datetime_format", [True, False]) -def test_to_datetime_format(data, format, infer_datetime_format): - pd_data = data - if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): - gd_data = cudf.from_pandas(pd_data) - else: - gd_data = pd_data - - with expect_warning_if(True, UserWarning): - expected = pd.to_datetime( - pd_data, format=format, infer_datetime_format=infer_datetime_format - ) - with expect_warning_if(not infer_datetime_format): - actual = cudf.to_datetime( - gd_data, format=format, infer_datetime_format=infer_datetime_format - ) - - if isinstance(expected, pd.Series): - assert_eq(actual, expected, check_dtype=False) - else: - assert_eq(actual, expected, check_exact=False) - - -def test_to_datetime_data_out_of_range_for_format(): - with pytest.raises(ValueError): - cudf.to_datetime("2015-02-99", format="%Y-%m-%d") - - -def test_to_datetime_different_formats_notimplemented(): - with pytest.raises(NotImplementedError): - cudf.to_datetime(["2015-02-01", "2015-02-01 10:10:10"]) - - @pytest.mark.skipif( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, reason="Fails in older versions of pandas.", @@ -801,92 +141,6 @@ def test_datetime_can_cast_safely(): assert sr._column.can_cast_safely(np.dtype("datetime64[ns]")) is False -# Cudf autocasts unsupported time_units -@pytest.mark.parametrize( - "dtype", - ["datetime64[D]", "datetime64[W]", "datetime64[M]", "datetime64[Y]"], -) -def test_datetime_array_timeunit_cast(dtype): - testdata = np.array( - [ - np.datetime64("2016-11-20"), - np.datetime64("2020-11-20"), - np.datetime64("2019-11-20"), - np.datetime64("1918-11-20"), - np.datetime64("2118-11-20"), - ], - dtype=dtype, - ) - - gs = Series(testdata) - ps = pd.Series(testdata) - - assert_eq(ps, gs) - - gdf = DataFrame() - gdf["a"] = np.arange(5) - gdf["b"] = testdata - - pdf = pd.DataFrame() - pdf["a"] = np.arange(5) - pdf["b"] = testdata - assert_eq(pdf, 
gdf) - - -@pytest.mark.parametrize("timeunit", ["D", "W", "M", "Y"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_datetime_scalar_timeunit_cast(timeunit): - testscalar = np.datetime64("2016-11-20", timeunit) - - gs = Series(testscalar) - ps = pd.Series(testscalar) - - assert_eq(ps, gs, check_dtype=False) - - gdf = DataFrame() - gdf["a"] = np.arange(5) - gdf["b"] = testscalar - - pdf = pd.DataFrame() - pdf["a"] = np.arange(5) - pdf["b"] = testscalar - - assert gdf["b"].dtype == cudf.dtype("datetime64[s]") - assert_eq(pdf, gdf, check_dtype=True) - - -@pytest.mark.parametrize( - "data", - [ - ["2001-01-01", "2002-02-02", "2000-01-05", "NaT"], - ["2001-01-01", "2002-02-02", "2000-01-05", None], - [None, None, None, None, None], - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_str_null_to_datetime(data, dtype): - psr = pd.Series(data) - gsr = Series(data) - - assert_eq(psr.astype(dtype), gsr.astype(dtype)) - - -def test_str_to_datetime_error(): - psr = pd.Series(["2001-01-01", "2002-02-02", "2000-01-05", "None"]) - gsr = Series(["2001-01-01", "2002-02-02", "2000-01-05", "None"]) - - assert_exceptions_equal( - lfunc=psr.astype, - rfunc=gsr.astype, - lfunc_args_and_kwargs=(["datetime64[s]"],), - rfunc_args_and_kwargs=(["datetime64[s]"],), - check_exception_type=False, - ) - - @pytest.mark.parametrize( "data", [ @@ -1130,97 +384,7 @@ def test_datetime_invalid_ops(): ) -@pytest.mark.parametrize( - "data", - [ - [], - [1, 2, 3], - [None, 1, 10, 11, None], - [None, None, None, None, None], - [None], - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize( - "fill_value", - [ - np.datetime64("2005-02"), - np.datetime64("2005-02-25"), - np.datetime64("2005-02-25T03:30"), - np.datetime64("nat"), - "NaT", - ], -) -def test_datetime_fillna(data, dtype, fill_value): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - expected = psr.dropna() - actual = sr.dropna() - - assert_eq(expected, actual) - - expected = psr.fillna(fill_value) - actual = sr.fillna(fill_value) - - assert_eq(expected, actual) - - expected = expected.dropna() - actual = actual.dropna() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", [[1, 2, 3, None], [], [100121, 1221312, 321312321, 1232131223]] -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize( - "date_format", - [ - "%d - %m", - "%y/%H", - "%Y", - "%I - %M / %S", - "%f", - "%j", - "%p", - "%w", - "%U", - "%W", - "%G", - "%u", - "%V", - "%b", - "%B", - "%a", - "%A", - "%U_", - "_%b", - "%B*", - "%a ", - "%A1", - ], -) -def test_datetime_strftime(data, dtype, date_format): - gsr = cudf.Series(data, dtype=dtype) - psr = gsr.to_pandas() - - expected = psr.dt.strftime(date_format=date_format) - actual = gsr.dt.strftime(date_format=date_format) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("date_format", ["%c", "%x", "%X"]) -def test_datetime_strftime_not_implemented_formats(date_format): - gsr = cudf.Series([1, 2, 3], dtype="datetime64[ms]") - - with pytest.raises(NotImplementedError): - gsr.dt.strftime(date_format=date_format) - - -@pytest.mark.parametrize("data", [[1, 2, 3], [], [1, 20, 1000, None]]) +@pytest.mark.parametrize("data", [[1, 2, 3], [], [1, 20, 1000, None]]) @pytest.mark.parametrize("dtype", DATETIME_TYPES) @pytest.mark.parametrize("stat", ["mean", "quantile"]) def test_datetime_stats(data, dtype, stat): @@ -1274,764 +438,6 @@ def 
test_datetime_reductions(data, op, dtype): assert_eq(expected, actual) -@pytest.mark.parametrize("timezone", ["", "Z"]) -@pytest.mark.parametrize( - "data", - [ - "2002-10-27T04:30", - "2002-10-27T04:30:00", - "2002-10-27T04:30:00.000", - "2002-10-27T04:30:00.000000", - "2002-10-27T04:30:00.000000000", - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_datetime_infer_format(data, timezone, dtype): - ts_data = [data + timezone] - sr = cudf.Series(ts_data) - psr = pd.Series(ts_data) - if not timezone: - expected = psr.astype(dtype) - actual = sr.astype(dtype) - - assert_eq(expected, actual) - else: - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - # pandas doesn't allow parsing "Z" to naive type - sr.astype(dtype) - - -def test_dateoffset_instance_subclass_check(): - assert not issubclass(pd.DateOffset, cudf.DateOffset) - assert not isinstance(pd.DateOffset(), cudf.DateOffset) - - -def test_datetime_to_datetime_error(): - assert_exceptions_equal( - lfunc=pd.to_datetime, - rfunc=cudf.to_datetime, - lfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],), - rfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],), - check_exception_type=False, - ) - - -def test_is_leap_year(): - data = [ - "2020-05-31 08:00:00", - None, - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - None, - "1900-02-28 07:00:00", - "1800-03-14 07:30:00", - "2100-03-14 07:30:00", - "1970-01-01 00:00:00", - "1969-12-31 12:59:00", - ] - - # Series - ps = pd.Series(data, dtype="datetime64[s]") - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_leap_year - got = gs.dt.is_leap_year - - assert_eq(expect, got) - - # DatetimeIndex - pIndex = pd.DatetimeIndex(data) - gIndex = cudf.from_pandas(pIndex) - - expect2 = pIndex.is_leap_year - got2 = gIndex.is_leap_year - - assert_eq(expect2, got2) - - -def test_quarter(): - data = [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - "1900-02-28 07:00:00", - "1800-03-14 07:30:00", - "2100-03-14 07:30:00", - "1970-01-01 00:00:00", - "1969-12-31 12:59:00", - ] - dtype = "datetime64[s]" - - # Series - ps = pd.Series(data, dtype=dtype) - gs = cudf.from_pandas(ps) - - expect = ps.dt.quarter - got = gs.dt.quarter - - assert_eq(expect, got, check_dtype=False) - - # DatetimeIndex - pIndex = pd.DatetimeIndex(data) - gIndex = cudf.from_pandas(pIndex) - - expect2 = pIndex.quarter - got2 = gIndex.quarter - - assert_eq(expect2.values, got2.values) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([], dtype="datetime64[ns]"), - pd.Series(pd.date_range("2010-01-01", "2010-02-01")), - pd.Series([None, None], dtype="datetime64[ns]"), - pd.Series("2020-05-31 08:00:00", dtype="datetime64[s]"), - pd.Series( - pd.date_range(start="2021-07-25", end="2021-07-30"), - index=["a", "b", "c", "d", "e", "f"], - ), - ], -) -def test_isocalendar_series(data): - ps = data.copy() - gs = cudf.from_pandas(ps) - - expect = ps.dt.isocalendar() - got = gs.dt.isocalendar() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - [], - [None, None], - [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - ], - ["2100-03-14 07:30:00"], - ], -) -def test_isocalendar_index(data): - ps = pd.DatetimeIndex(data, dtype="datetime64[ns]") - gs = cudf.from_pandas(ps) - - expect = ps.isocalendar() - got = gs.isocalendar() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_days_in_months(dtype): - nrows 
= 1000 - - data = dataset_generator.rand_dataframe( - dtypes_meta=[ - {"dtype": dtype, "null_frequency": 0.4, "cardinality": nrows} - ], - rows=nrows, - use_threads=False, - seed=23, - ) - - ps = data.to_pandas()["0"] - gs = cudf.from_pandas(ps) - - assert_eq(ps.dt.days_in_month, gs.dt.days_in_month) - - -def test_is_month_start(): - data = [ - "2020-05-31", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-01-01", - "1969-12-11", - ] - ps = pd.Series(data, dtype="datetime64[ns]") - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_month_start - got = gs.dt.is_month_start - - assert_eq(expect, got) - - -################################################################## -# Date Range Tests # -################################################################## - -date_range_test_freq = [ - {"months": 3, "years": 1}, - {"hours": 10, "days": 57, "nanoseconds": 3}, - "83D", - "17h", - "-680min", - "110546s", - "110546789ms", - "110546789248us", -] - - -@pytest.fixture( - params=[ - "2000-02-13 08:41:06", - "1996-11-21 04:05:30", - "1970-01-01 00:00:00", - "1831-05-08 15:23:21", - ], - ids=["leap_year", "non_leap_year", "unix_epoch_time_0", "random_date"], -) -def start(request): - return request.param - - -@pytest.fixture( - params=[ - "2000-02-13 08:41:06", - "1996-11-21 04:05:30", - "1970-01-01 00:00:00", - "1831-05-08 15:23:21", - ], - ids=["leap_year", "non_leap_year", "unix_epoch_time_0", "random_date"], -) -def end(request): - return request.param - - -@pytest.fixture(params=[1, 10]) -def periods(request): - return request.param - - -@pytest.fixture( - params=[ - {"months": 3, "years": 1}, - {"hours": 10, "days": 57, "nanoseconds": 3}, - "83D", - "17h", - "-680min", - "110546s", - "110546789ms", - "110546789248us", - ] -) -def freq(request): - return request.param - - -def test_date_range_start_end_periods(start, end, periods): - expect = pd.date_range(start=start, end=end, periods=periods, name="a") - got = cudf.date_range(start=start, end=end, periods=periods, name="a") - - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_date_range_start_end_freq(start, end, freq): - if isinstance(freq, str): - _gfreq = _pfreq = freq - else: - _gfreq = cudf.DateOffset(**freq) - _pfreq = pd.DateOffset(**freq) - - expect = pd.date_range(start=start, end=end, freq=_pfreq, name="a") - got = cudf.date_range(start=start, end=end, freq=_gfreq, name="a") - - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_date_range_start_freq_periods(start, freq, periods): - if isinstance(freq, str): - _gfreq = _pfreq = freq - else: - _gfreq = cudf.DateOffset(**freq) - _pfreq = pd.DateOffset(**freq) - - expect = pd.date_range(start=start, periods=periods, freq=_pfreq, name="a") - got = cudf.date_range(start=start, periods=periods, freq=_gfreq, name="a") - - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/46877", -) -def test_date_range_end_freq_periods(end, freq, periods): - if 
isinstance(freq, str): - _gfreq = _pfreq = freq - else: - _gfreq = cudf.DateOffset(**freq) - _pfreq = pd.DateOffset(**freq) - - expect = pd.date_range(end=end, periods=periods, freq=_pfreq, name="a") - got = cudf.date_range(end=end, periods=periods, freq=_gfreq, name="a") - - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -def test_date_range_freq_does_not_divide_range(): - expect = pd.date_range( - "2001-01-01 00:00:00.000000", "2001-01-01 00:00:00.000010", freq="3us" - ) - got = cudf.date_range( - "2001-01-01 00:00:00.000000", "2001-01-01 00:00:00.000010", freq="3us" - ) - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -@pytest.mark.parametrize( - "kwargs", - [ - {"nanoseconds": 1}, - {"months": 1}, - ], -) -def test_date_range_raise_overflow(kwargs): - start = np.datetime64(np.iinfo("int64").max, "ns") - periods = 2 - freq = cudf.DateOffset(**kwargs) - with pytest.raises(pd.errors.OutOfBoundsDatetime): - cudf.date_range(start=start, periods=periods, freq=freq) - - -@pytest.mark.parametrize( - "freqstr_unsupported", - [ - "1ME", - "2SME", - "3MS", - "4BME", - "5CBME", - "6SMS", - "7BMS", - "8CBMS", - "QE", - "2BQE", - "3BQS", - "10YE", - "9BYE", - "8YS", - "7BYS", - "bh", - "B", - ], -) -def test_date_range_raise_unsupported(freqstr_unsupported): - if not PANDAS_GE_220 and freqstr_unsupported.endswith("E"): - pytest.skip(reason="YE, etc. support was added in pandas 2.2") - - s, e = "2001-01-01", "2008-01-31" - pd.date_range(start=s, end=e, freq=freqstr_unsupported) - with pytest.raises(ValueError, match="does not yet support"): - cudf.date_range(start=s, end=e, freq=freqstr_unsupported) - - # We also check that these values are unsupported when using lowercase - # characters. We exclude the value 3MS (every 3 month starts) because 3ms - # is a valid frequency for every 3 milliseconds. 
- if freqstr_unsupported != "3MS": - freqstr_unsupported = freqstr_unsupported.lower() - with pytest.raises(ValueError, match="does not yet support"): - with expect_warning_if( - PANDAS_GE_220 and freqstr_unsupported not in {"b", "bh"} - ): - cudf.date_range(start=s, end=e, freq=freqstr_unsupported) - - -################################################################## -# End of Date Range Test # -################################################################## - - -def test_is_month_end(): - data = [ - "2020-05-31", - "2020-02-29", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-01-01", - "1969-12-11", - ] - ps = pd.Series(data, dtype="datetime64[ns]") - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_month_end - got = gs.dt.is_month_end - - assert_eq(expect, got) - - -def test_is_year_start(): - data = [ - "2020-05-31", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-01-01", - "1800-03-14", - "2100-03-10", - "1970-01-01", - "1969-12-11", - "2017-12-30", - "2017-12-31", - "2018-01-01", - ] - ps = pd.Series(data, dtype="datetime64[ns]") - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_year_start - got = gs.dt.is_year_start - - assert_eq(expect, got) - - -def test_is_year_end(): - data = [ - "2020-05-31", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-12-31", - "1800-03-14", - "2017-12-30", - "2017-12-31", - "2020-12-31 08:00:00", - None, - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - None, - "1800-12-14 07:30:00", - "2100-12-14 07:30:00", - "2020-05-31", - ] - ps = pd.Series(data, dtype="datetime64[ns]") - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_year_end - got = gs.dt.is_year_end - - assert_eq(expect, got) - - -def test_is_quarter_start(): - data = [ - "2020-05-01", - "2020-05-31", - "2020-02-29", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-04-1", - "1970-01-01", - "1969-12-11", - "2020-12-31", - ] - ps = pd.Series(data, dtype="datetime64[ns]") - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_quarter_start - got = gs.dt.is_quarter_start - - assert_eq(expect, got) - - -def test_is_quarter_end(): - data = [ - "2020-05-01", - "2020-05-31", - "2020-02-29", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-04-1", - "1970-01-01", - "1969-12-11", - "2020-12-31", - ] - ps = pd.Series(data, dtype="datetime64[ns]") - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_quarter_end - got = gs.dt.is_quarter_end - - assert_eq(expect, got) - - -def test_error_values(): - s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - with pytest.raises(NotImplementedError, match="cupy does not support"): - s.values - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/52761", -) -@pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"] -) -def test_ceil(data, time_type, resolution): - data = [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:10", - "2000-12-31 04:00:05", - "1900-02-28 07:00:06", - "1800-03-14 07:30:20", - "2100-03-14 07:30:20", - "1970-01-01 00:00:09", - "1969-12-31 12:59:10", - ] - gs = cudf.Series(data, dtype=time_type) - ps = gs.to_pandas() - - expect = ps.dt.ceil(resolution) - got = gs.dt.ceil(resolution) - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - 
reason="https://github.com/pandas-dev/pandas/issues/52761", -) -@pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"] -) -def test_floor(time_type, resolution): - data = [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:10", - "2000-12-31 04:00:05", - "1900-02-28 07:00:06", - "1800-03-14 07:30:20", - "2100-03-14 07:30:20", - "1970-01-01 00:00:09", - "1969-12-31 12:59:10", - ] - gs = cudf.Series(data, dtype=time_type) - ps = gs.to_pandas() - - expect = ps.dt.floor(resolution) - got = gs.dt.floor(resolution) - assert_eq(expect, got) - - -@pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"] -) -def test_round(time_type, resolution): - data = [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:10", - "2000-12-31 04:00:05", - "1900-02-28 07:00:06", - "1800-03-14 07:30:20", - "2100-03-14 07:30:20", - "1970-01-01 00:00:09", - "1969-12-31 12:59:10", - ] - gs = cudf.Series(data, dtype=time_type) - ps = gs.to_pandas() - - expect = ps.dt.round(resolution) - got = gs.dt.round(resolution) - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "idx", - [ - pd.DatetimeIndex([]), - pd.DatetimeIndex(["2010-05-31"]), - pd.date_range("2000-01-01", "2000-12-31", periods=21), - ], -) -@pytest.mark.parametrize( - "offset", - [ - "10Y", - "6M", - "M", - "31D", - "0H", - "44640T", - "44640min", - "2678000S", - "2678000000L", - "2678000000ms", - "2678000000000U", - "2678000000000us", - "2678000000000000N", - "2678000000000000ns", - ], -) -def test_first(idx, offset): - p = pd.Series(range(len(idx)), dtype="int64", index=idx) - g = cudf.from_pandas(p) - - with pytest.warns(FutureWarning): - expect = p.first(offset=offset) - with pytest.warns(FutureWarning): - got = g.first(offset=offset) - - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -def test_first_start_at_end_of_month(): - idx = pd.DatetimeIndex( - [ - "2020-01-31", - "2020-02-15", - "2020-02-29", - "2020-03-15", - "2020-03-31", - "2020-04-15", - "2020-04-30", - ] - ) - offset = "3M" - p = pd.Series(range(len(idx)), index=idx) - g = cudf.from_pandas(p) - - with pytest.warns(FutureWarning): - expect = p.first(offset=offset) - with pytest.warns(FutureWarning): - got = g.first(offset=offset) - - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "idx", - [ - pd.DatetimeIndex([]), - pd.DatetimeIndex(["2010-05-31"]), - pd.date_range("2000-01-01", "2000-12-31", periods=21), - ], -) -@pytest.mark.parametrize( - "offset", - [ - "10Y", - "6M", - "M", - "31D", - "0H", - "44640T", - "44640min", - "2678000S", - "2678000000L", - "2678000000ms", - "2678000000000U", - "2678000000000us", - "2678000000000000N", - "2678000000000000ns", - ], -) -def test_last(idx, offset): - p = pd.Series(range(len(idx)), dtype="int64", index=idx) - g = cudf.from_pandas(p) - - with pytest.warns(FutureWarning): - expect = p.last(offset=offset) - with pytest.warns(FutureWarning): - got = g.last(offset=offset) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - "2020-01-31", - "2020-02-15", - "2020-02-29", - 
"2020-03-15", - "2020-03-31", - "2020-04-15", - "2020-04-30", - ], - [43534, 43543, 37897, 2000], - ], -) -@pytest.mark.parametrize("dtype", [None, "datetime64[ns]"]) -def test_datetime_constructor(data, dtype): - expected = pd.DatetimeIndex(data=data, dtype=dtype) - actual = cudf.DatetimeIndex(data=data, dtype=dtype) - - assert_eq(expected, actual) - - expected = pd.DatetimeIndex(data=pd.Series(data), dtype=dtype) - actual = cudf.DatetimeIndex(data=cudf.Series(data), dtype=dtype) - - assert_eq(expected, actual) - - def test_datetime_binop_tz_timestamp(op): s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") @@ -2059,391 +465,6 @@ def test_datetime_series_cmpops_pandas_compatibility(op): assert_eq(expect, got) -def test_datetime_getitem_na(): - s = cudf.Series([1, 2, None, 3], dtype="datetime64[ns]") - assert s[2] is cudf.NaT - - -def test_daterange_pandas_compatibility(): - with cudf.option_context("mode.pandas_compatible", True): - expected = pd.date_range( - "2010-01-01", "2010-02-01", periods=10, name="times" - ) - actual = cudf.date_range( - "2010-01-01", "2010-02-01", periods=10, name="times" - ) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,dtype,freq", - [ - ([10], "datetime64[ns]", "2ns"), - ([10, 12, 14, 16], "datetime64[ns]", "2ns"), - ([10, 11, 12, 13], "datetime64[ns]", "1ns"), - ([100, 200, 300, 400], "datetime64[s]", "100s"), - ([101, 201, 301, 401], "datetime64[ms]", "100ms"), - ], -) -def test_datetime_index_with_freq(data, dtype, freq): - actual = cudf.DatetimeIndex(data, dtype=dtype, freq=freq) - expected = pd.DatetimeIndex(data, dtype=dtype, freq=freq) - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data,dtype,freq", - [ - ([10, 1232, 13244, 13426], "datetime64[ns]", "2ns"), - ([10, 11, 12, 13], "datetime64[ns]", "1s"), - ([10000, 200, 300, 400], "datetime64[s]", "100s"), - ([107871, 201, 301, 401], "datetime64[ms]", "100ns"), - ], -) -def test_datetime_index_freq_error(data, dtype, freq): - assert_exceptions_equal( - pd.DatetimeIndex, - cudf.DatetimeIndex, - ([data], {"dtype": dtype, "freq": freq}), - ([data], {"dtype": dtype, "freq": freq}), - ) - - -def test_strings_with_utc_offset_not_implemented(): - with pytest.raises(NotImplementedError): - DatetimeIndex(["2022-07-22 00:00:00+02:00"]) - - -@pytest.mark.parametrize("code", ["z", "Z"]) -def test_format_timezone_not_implemented(code): - with pytest.raises(NotImplementedError): - cudf.to_datetime( - ["2020-01-01 00:00:00 UTC"], format=f"%Y-%m-%d %H:%M:%S %{code}" - ) - - -@pytest.mark.parametrize("tz", ["UTC-3", "+01:00"]) -def test_utc_offset_not_implemented(tz): - with pytest.raises((NotImplementedError, ValueError)): - cudf.to_datetime([f"2020-01-01 00:00:00{tz}"]) - - -def test_Z_utc_offset(): - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.to_datetime(["2020-01-01 00:00:00Z"]) - - result = cudf.to_datetime(["2020-01-01 00:00:00Z"]) - expected = cudf.to_datetime(["2020-01-01 00:00:00"]) - assert_eq(result, expected) - - -@pytest.mark.parametrize("arg", [True, False]) -def test_args_not_datetime_typerror(arg): - with pytest.raises(TypeError): - cudf.to_datetime([arg]) - - -@pytest.mark.parametrize( - "data, dtype", - [ - [ - [ - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000000000", - ], - "datetime64[s]", - ], - [ - [ - "2000-01-01 00:00:00.000000000", - None, - "2000-01-01 00:00:00.000000000", 
- ], - "datetime64[s]", - ], - [ - [ - "2000-01-01 00:00:00.001000000", - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000000000", - ], - "datetime64[us]", - ], - [ - [ - "2000-01-01 00:00:00.010000000", - "2000-01-01 00:00:00.020000000", - "2000-01-01 00:00:00.030000000", - ], - "datetime64[ms]", - ], - [ - [ - "2000-01-01 00:00:00.010000000", - "2000-01-01 00:00:00.020000000", - None, - ], - "datetime64[ms]", - ], - [ - [ - "2000-01-01 00:00:00.000001000", - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000004000", - ], - "datetime64[us]", - ], - [ - [ - None, - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000004000", - ], - "datetime64[us]", - ], - [ - [ - "2000-01-01 00:00:00.000000010", - "2000-01-01 00:00:00.000000002", - "2000-01-01 00:00:00.000000000", - ], - "datetime64[ns]", - ], - [ - [ - "2000-01-01 00:00:00.000000010", - None, - "2000-01-01 00:00:00.000000000", - ], - "datetime64[ns]", - ], - [ - [ - "2000-01-01 00:00:01.000000000", - "2000-01-01 00:00:40.000000000", - "2000-01-01 00:00:59.000000000", - ], - "datetime64[s]", - ], - [ - [ - "2000-01-01 00:10:00.000000000", - "2000-01-01 00:30:40.000000000", - "2000-01-01 00:59:00.000000000", - ], - "datetime64[s]", - ], - [ - [ - "2000-01-01 07:00:00.000000000", - "2000-01-01 08:00:00.000000000", - None, - ], - "datetime64[s]", - ], - [[None, None, None], "datetime64[s]"], - [[], "datetime64[s]"], - [ - [ - "2000-01-01 00:10:00.123456789", - "2000-01-01 00:30:40.123123456", - "2000-01-01 00:59:00.675347634", - ], - "datetime64[ns]", - ], - ], -) -def test_datetime_to_str(data, dtype): - gs = cudf.Series(data, dtype=dtype) - ps = gs.to_pandas() - - with cudf.option_context("mode.pandas_compatible", True): - actual = gs.astype("str") - - expected = ps.astype("string") - - assert_eq(actual.to_pandas(nullable=True), expected) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_datetime_string_to_datetime_resolution_loss_raises(): - data = ["2020-01-01 00:00:00.00001"] - dtype = "datetime64[s]" - with pytest.raises(ValueError): - cudf.Series(data, dtype=dtype) - with pytest.raises(ValueError): - pd.Series(data, dtype=dtype) - - -def test_dateimeindex_from_noniso_string(): - data = ["20160920", "20160925"] - gdti = cudf.DatetimeIndex(data) - pdti = pd.DatetimeIndex(data) - - assert_eq(gdti, pdti) - - -@pytest.mark.parametrize("errors", ["coerce", "ignore"]) -def test_to_datetime_errors_non_scalar_not_implemented(errors): - with pytest.raises(NotImplementedError): - cudf.to_datetime([1, ""], unit="s", errors=errors) - - -@pytest.mark.parametrize( - "freqstr", - [ - "H", - "N", - "T", - "L", - "U", - "S", - ], -) -def test_datetime_raise_warning(freqstr): - t = cudf.Series( - ["2001-01-01 00:04:45", "2001-01-01 00:04:58", "2001-01-01 00:05:04"], - dtype="datetime64[ns]", - ) - with pytest.warns(FutureWarning): - t.dt.ceil(freqstr) - - -def test_timezone_pyarrow_array(): - pa_array = pa.array( - [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)], - type=pa.timestamp("ns", "UTC"), - ) - result = cudf.Series(pa_array) - expected = pa_array.to_pandas() - assert_eq(result, expected) - - -def test_to_datetime_errors_ignore_deprecated(): - with pytest.warns(FutureWarning): - cudf.to_datetime("2001-01-01 00:04:45", errors="ignore") - - -def test_date_range_freq_default(): - result = pd.date_range("2020-01-01", periods=2, name="foo") - expected = cudf.date_range("2020-01-01", periods=2, name="foo") - assert_eq(result, 
expected) - - -def test_date_range_tz(): - result = pd.date_range("2020-01-01", periods=2, tz="UTC") - expected = cudf.date_range("2020-01-01", periods=2, tz="UTC") - assert_eq(result, expected) - - result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") - expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") - assert_eq(result, expected) - - -@pytest.mark.parametrize("meth", ["day_name", "month_name"]) -@pytest.mark.parametrize("klass", [pd.Series, pd.DatetimeIndex]) -def test_day_month_name(meth, klass): - data = [ - "2020-05-31 08:00:00", - None, - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - None, - "1900-02-28 07:00:00", - "1800-03-14 07:30:00", - "2100-03-14 07:30:00", - "1970-01-01 00:00:00", - "1969-12-31 12:59:00", - ] - - p_obj = klass(data, dtype="datetime64[s]") - g_obj = cudf.from_pandas(p_obj) - - if klass is pd.Series: - p_obj = p_obj.dt - g_obj = g_obj.dt - - expect = getattr(p_obj, meth)() - got = getattr(g_obj, meth)() - - assert_eq(expect, got) - - -@pytest.mark.parametrize("meth", ["day_name", "month_name"]) -@pytest.mark.parametrize("klass", [cudf.Series, cudf.DatetimeIndex]) -def test_day_month_name_locale_not_implemented(meth, klass): - obj = klass(cudf.date_range("2020-01-01", periods=7)) - if klass is cudf.Series: - obj = obj.dt - with pytest.raises(NotImplementedError): - getattr(obj, meth)(locale="pt_BR.utf8") - - -@pytest.mark.parametrize( - "attr", - [ - "is_month_start", - "is_month_end", - "is_quarter_end", - "is_quarter_start", - "is_year_end", - "is_year_start", - "days_in_month", - "timetz", - "time", - "date", - ], -) -def test_dti_datetime_attributes(attr): - data = [ - "2020-01-01", - "2020-01-31", - "2020-03-01", - "2020-03-31", - "2020-03-31", - "2020-12-31", - None, - ] - pd_dti = pd.DatetimeIndex(data, name="foo") - cudf_dti = cudf.from_pandas(pd_dti) - - result = getattr(cudf_dti, attr) - expected = getattr(pd_dti, attr) - if isinstance(result, np.ndarray): - # numpy doesn't assert object arrays with NaT correctly - tm.assert_numpy_array_equal(result, expected) - else: - assert_eq(result, expected) - - -@pytest.mark.parametrize("attr", ["freq", "unit"]) -def test_dti_properties(attr): - pd_dti = pd.DatetimeIndex( - ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" - ) - cudf_dti = cudf.DatetimeIndex( - ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" - ) - - result = getattr(cudf_dti, attr) - expected = getattr(pd_dti, attr) - assert result == expected - - -def test_dti_asi8(): - pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") - cudf_dti = cudf.from_pandas(pd_dti) - - result = pd_dti.asi8 - expected = cudf_dti.asi8 - assert_eq(result, expected) - - @pytest.mark.parametrize( "method, kwargs", [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], @@ -2455,40 +476,3 @@ def test_dti_reduction(method, kwargs): result = getattr(cudf_dti, method)(**kwargs) expected = getattr(pd_dti, method)(**kwargs) assert result == expected - - -@pytest.mark.parametrize( - "method, kwargs", - [ - ["to_pydatetime", {}], - ["to_period", {"freq": "D"}], - ["strftime", {"date_format": "%Y-%m-%d"}], - ], -) -def test_dti_methods(method, kwargs): - pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") - cudf_dti = cudf.from_pandas(pd_dti) - - result = getattr(cudf_dti, method)(**kwargs) - expected = getattr(pd_dti, method)(**kwargs) - assert_eq(result, expected) - - -def test_date_range_start_end_divisible_by_freq(): - result = cudf.date_range("2011-01-01", "2011-01-02", freq="h") - expected = 
pd.date_range("2011-01-01", "2011-01-02", freq="h") - assert_eq(result, expected) - - -def test_writable_numpy_array(): - gi = cudf.Index([1, 2, 3], dtype="datetime64[ns]") - expected_flags = pd.Index( - [1, 2, 3], dtype="datetime64[ns]" - )._data._ndarray.flags - - actual_flags = gi.to_pandas()._data._ndarray.flags - assert expected_flags.c_contiguous == actual_flags.c_contiguous - assert expected_flags.f_contiguous == actual_flags.f_contiguous - assert expected_flags.writeable == actual_flags.writeable - assert expected_flags.aligned == actual_flags.aligned - assert expected_flags.writebackifcopy == actual_flags.writebackifcopy From bc9d4bb91b7200578980461b70db53f3109a2958 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 11 Aug 2025 19:25:59 -0400 Subject: [PATCH 099/366] Skip managed memory test if managed memory not supported in cudf-polars (#19653) Skips the test: `test_cudf_polars_enable_disable_managed_memory` if managed memory is not supported. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19653 --- python/cudf_polars/tests/test_config.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 52651fbe5c8..3de18e0a4e4 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -10,6 +10,7 @@ import polars as pl from polars.testing.asserts import assert_frame_equal +import pylibcudf as plc import rmm import cudf_polars.utils.config @@ -83,17 +84,21 @@ def test_invalid_memory_resource_raises(mr): q.collect(engine=pl.GPUEngine(memory_resource=mr)) -@pytest.mark.parametrize("disable_managed_memory", ["1", "0"]) -def test_cudf_polars_enable_disable_managed_memory(monkeypatch, disable_managed_memory): +@pytest.mark.skipif( + not plc.utils._is_concurrent_managed_access_supported(), + reason="managed memory not supported", +) +@pytest.mark.parametrize("enable_managed_memory", ["1", "0"]) +def test_cudf_polars_enable_disable_managed_memory(monkeypatch, enable_managed_memory): q = pl.LazyFrame({"a": [1, 2, 3]}) with monkeypatch.context() as monkeycontext: monkeycontext.setenv( - "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", disable_managed_memory + "POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", enable_managed_memory ) result = q.collect(engine=pl.GPUEngine()) - mr = default_memory_resource(0, bool(disable_managed_memory == "1")) - if disable_managed_memory == "1": + mr = default_memory_resource(0, bool(enable_managed_memory == "1")) + if enable_managed_memory == "1": assert isinstance(mr, rmm.mr.PrefetchResourceAdaptor) assert isinstance(mr.upstream_mr, rmm.mr.PoolMemoryResource) else: From 1d6aaf12bcc5a57bff3c0cd882c2019425b63279 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Aug 2025 17:02:13 -0700 Subject: [PATCH 100/366] Add streams support to datetime APIs (#19654) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19654 --- python/pylibcudf/pylibcudf/datetime.pxd | 24 +++-- python/pylibcudf/pylibcudf/datetime.pyi | 30 ++++--- python/pylibcudf/pylibcudf/datetime.pyx | 87 ++++++++++++------- .../pylibcudf/pylibcudf/libcudf/datetime.pxd | 34 +++++--- 4 files changed, 116 insertions(+), 59 deletions(-) diff --git 
a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd index ce295990d26..7e92acaaff8 100644 --- a/python/pylibcudf/pylibcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/datetime.pxd @@ -3,6 +3,7 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.datetime cimport datetime_component, rounding_frequency from pylibcudf.scalar cimport Scalar +from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnOrScalar: Column @@ -10,35 +11,40 @@ ctypedef fused ColumnOrScalar: cpdef Column extract_datetime_component( Column input, - datetime_component component + datetime_component component, + Stream stream = * ) cpdef Column ceil_datetimes( Column input, - rounding_frequency freq + rounding_frequency freq, + Stream stream = * ) cpdef Column floor_datetimes( Column input, - rounding_frequency freq + rounding_frequency freq, + Stream stream = * ) cpdef Column round_datetimes( Column input, - rounding_frequency freq + rounding_frequency freq, + Stream stream = * ) cpdef Column add_calendrical_months( Column timestamps, ColumnOrScalar months, + Stream stream = * ) -cpdef Column day_of_year(Column input) +cpdef Column day_of_year(Column input, Stream stream = *) -cpdef Column is_leap_year(Column input) +cpdef Column is_leap_year(Column input, Stream stream = *) -cpdef Column last_day_of_month(Column input) +cpdef Column last_day_of_month(Column input, Stream stream = *) -cpdef Column extract_quarter(Column input) +cpdef Column extract_quarter(Column input, Stream stream = *) -cpdef Column days_in_month(Column input) +cpdef Column days_in_month(Column input, Stream stream = *) diff --git a/python/pylibcudf/pylibcudf/datetime.pyi b/python/pylibcudf/pylibcudf/datetime.pyi index 8eedaeefe61..3b464d3bf11 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyi +++ b/python/pylibcudf/pylibcudf/datetime.pyi @@ -2,6 +2,8 @@ from enum import IntEnum +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.scalar import Scalar @@ -27,16 +29,24 @@ class RoundingFrequency(IntEnum): NANOSECOND = ... def extract_datetime_component( - input: Column, component: DatetimeComponent + input: Column, component: DatetimeComponent, stream: Stream | None = None +) -> Column: ... +def ceil_datetimes( + input: Column, freq: RoundingFrequency, stream: Stream | None = None +) -> Column: ... +def floor_datetimes( + input: Column, freq: RoundingFrequency, stream: Stream | None = None +) -> Column: ... +def round_datetimes( + input: Column, freq: RoundingFrequency, stream: Stream | None = None ) -> Column: ... -def ceil_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... -def floor_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... -def round_datetimes(input: Column, freq: RoundingFrequency) -> Column: ... def add_calendrical_months( - input: Column, months: Column | Scalar + input: Column, months: Column | Scalar, stream: Stream | None = None +) -> Column: ... +def day_of_year(input: Column, stream: Stream | None = None) -> Column: ... +def is_leap_year(input: Column, stream: Stream | None = None) -> Column: ... +def last_day_of_month( + input: Column, stream: Stream | None = None ) -> Column: ... -def day_of_year(input: Column) -> Column: ... -def is_leap_year(input: Column) -> Column: ... -def last_day_of_month(input: Column) -> Column: ... -def extract_quarter(input: Column) -> Column: ... -def days_in_month(input: Column) -> Column: ... +def extract_quarter(input: Column, stream: Stream | None = None) -> Column: ... 
+def days_in_month(input: Column, stream: Stream | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index da736755848..0aa13917c97 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -23,8 +23,11 @@ from pylibcudf.libcudf.datetime import \ rounding_frequency as RoundingFrequency # no-cython-lint from cython.operator cimport dereference +from rmm.pylibrmm.stream cimport Stream from .column cimport Column +from .scalar cimport Scalar +from .utils cimport _get_stream __all__ = [ "DatetimeComponent", @@ -43,7 +46,8 @@ __all__ = [ cpdef Column extract_datetime_component( Column input, - datetime_component component + datetime_component component, + Stream stream=None ): """ Extract a datetime component from a datetime column. @@ -64,13 +68,16 @@ cpdef Column extract_datetime_component( """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_extract_datetime_component(input.view(), component) - return Column.from_libcudf(move(result)) + result = cpp_extract_datetime_component(input.view(), component, stream.view()) + return Column.from_libcudf(move(result), stream) cpdef Column ceil_datetimes( Column input, - rounding_frequency freq + rounding_frequency freq, + Stream stream=None ): """ Round datetimes up to the nearest multiple of the given frequency. @@ -91,13 +98,16 @@ cpdef Column ceil_datetimes( """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_ceil_datetimes(input.view(), freq) - return Column.from_libcudf(move(result)) + result = cpp_ceil_datetimes(input.view(), freq, stream.view()) + return Column.from_libcudf(move(result), stream) cpdef Column floor_datetimes( Column input, - rounding_frequency freq + rounding_frequency freq, + Stream stream=None ): """ Round datetimes down to the nearest multiple of the given frequency. @@ -118,13 +128,16 @@ cpdef Column floor_datetimes( """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_floor_datetimes(input.view(), freq) - return Column.from_libcudf(move(result)) + result = cpp_floor_datetimes(input.view(), freq, stream.view()) + return Column.from_libcudf(move(result), stream) cpdef Column round_datetimes( Column input, - rounding_frequency freq + rounding_frequency freq, + Stream stream=None ): """ Round datetimes to the nearest multiple of the given frequency. 
@@ -145,13 +158,16 @@ cpdef Column round_datetimes( """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_round_datetimes(input.view(), freq) - return Column.from_libcudf(move(result)) + result = cpp_round_datetimes(input.view(), freq, stream.view()) + return Column.from_libcudf(move(result), stream) cpdef Column add_calendrical_months( Column input, ColumnOrScalar months, + Stream stream=None ): """ Adds or subtracts a number of months from the datetime @@ -177,15 +193,18 @@ cpdef Column add_calendrical_months( cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: result = cpp_add_calendrical_months( input.view(), months.view() if ColumnOrScalar is Column else - dereference(months.get()) + dereference(months.get()), + stream.view() ) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) -cpdef Column day_of_year(Column input): +cpdef Column day_of_year(Column input, Stream stream=None): """ Computes the day number since the start of the year from the datetime. The value is between @@ -205,11 +224,13 @@ cpdef Column day_of_year(Column input): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_day_of_year(input.view()) - return Column.from_libcudf(move(result)) + result = cpp_day_of_year(input.view(), stream.view()) + return Column.from_libcudf(move(result), stream) -cpdef Column is_leap_year(Column input): +cpdef Column is_leap_year(Column input, Stream stream=None): """ Check if the year of the given date is a leap year. @@ -228,11 +249,13 @@ cpdef Column is_leap_year(Column input): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_is_leap_year(input.view()) - return Column.from_libcudf(move(result)) + result = cpp_is_leap_year(input.view(), stream.view()) + return Column.from_libcudf(move(result), stream) -cpdef Column last_day_of_month(Column input): +cpdef Column last_day_of_month(Column input, Stream stream=None): """ Computes the last day of the month. @@ -251,11 +274,13 @@ cpdef Column last_day_of_month(Column input): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_last_day_of_month(input.view()) - return Column.from_libcudf(move(result)) + result = cpp_last_day_of_month(input.view(), stream.view()) + return Column.from_libcudf(move(result), stream) -cpdef Column extract_quarter(Column input): +cpdef Column extract_quarter(Column input, Stream stream=None): """ Returns the quarter (ie. a value from {1, 2, 3, 4}) that the date is in. @@ -274,11 +299,13 @@ cpdef Column extract_quarter(Column input): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_extract_quarter(input.view()) - return Column.from_libcudf(move(result)) + result = cpp_extract_quarter(input.view(), stream.view()) + return Column.from_libcudf(move(result), stream) -cpdef Column days_in_month(Column input): +cpdef Column days_in_month(Column input, Stream stream=None): """ Extract the number of days in the month. 
@@ -296,9 +323,11 @@ cpdef Column days_in_month(Column input): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_days_in_month(input.view()) - return Column.from_libcudf(move(result)) + result = cpp_days_in_month(input.view(), stream.view()) + return Column.from_libcudf(move(result), stream) DatetimeComponent.__str__ = DatetimeComponent.__repr__ RoundingFrequency.__str__ = RoundingFrequency.__repr__ diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd index 7dacab668b6..25ee571044a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -6,6 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: @@ -23,7 +24,8 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_datetime_component( const column_view& column, - datetime_component component + datetime_component component, + cuda_stream_view stream ) except +libcudf_exception_handler cpdef enum class rounding_frequency(int32_t): @@ -36,35 +38,45 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: NANOSECOND cdef unique_ptr[column] ceil_datetimes( - const column_view& column, rounding_frequency freq + const column_view& column, rounding_frequency freq, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] floor_datetimes( - const column_view& column, rounding_frequency freq + const column_view& column, rounding_frequency freq, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] round_datetimes( - const column_view& column, rounding_frequency freq + const column_view& column, rounding_frequency freq, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, - const column_view& months + const column_view& months, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, - const scalar& months + const scalar& months, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] day_of_year( - const column_view& column + const column_view& column, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] is_leap_year( - const column_view& column + const column_view& column, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] last_day_of_month( - const column_view& column + const column_view& column, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] extract_quarter( - const column_view& column + const column_view& column, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] days_in_month( - const column_view& column + const column_view& column, + cuda_stream_view stream ) except +libcudf_exception_handler From e7482c1f5a19f2783103de323be0ef148070bb3b Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 11 Aug 2025 20:20:06 -0400 Subject: [PATCH 101/366] Use rapids_cuda_enable_fatbin_compression (#19650) Standardize 
compression flags via rapids_cuda_enable_fatbin_compression Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/19650 --- cpp/cmake/Modules/ConfigureCUDA.cmake | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index a6987803c3b..e0c5cedf2ee 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -37,13 +37,8 @@ if(DISABLE_DEPRECATION_WARNINGS) endif() # make sure we produce smallest binary size -list(APPEND CUDF_CUDA_FLAGS -Xfatbin=-compress-all) -if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" - AND (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9 AND CMAKE_CUDA_COMPILER_VERSION - VERSION_LESS 13.0) -) - list(APPEND CUDF_CUDA_FLAGS -Xfatbin=--compress-level=3) -endif() +include(${rapids-cmake-dir}/cuda/enable_fatbin_compression.cmake) +rapids_cuda_enable_fatbin_compression(VARIABLE CUDF_CUDA_FLAGS TUNE_FOR rapids) # Option to enable line info in CUDA device compilation to allow introspection when profiling / # memchecking From c867802d8c640e91756309fe0419212955259077 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Aug 2025 17:40:28 -0700 Subject: [PATCH 102/366] Add streams to transform and unary (#19613) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19613 --- .../pylibcudf/pylibcudf/libcudf/transform.pxd | 29 ++++-- python/pylibcudf/pylibcudf/libcudf/unary.pxd | 19 ++-- python/pylibcudf/pylibcudf/transform.pxd | 21 +++-- python/pylibcudf/pylibcudf/transform.pyi | 27 ++++-- python/pylibcudf/pylibcudf/transform.pyx | 93 ++++++++++++++----- python/pylibcudf/pylibcudf/unary.pxd | 15 +-- python/pylibcudf/pylibcudf/unary.pyi | 18 ++-- python/pylibcudf/pylibcudf/unary.pyx | 50 ++++++---- 8 files changed, 191 insertions(+), 81 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index 9be137a077e..ce55a4a841a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.optional cimport optional from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.vector cimport vector @@ -13,43 +14,53 @@ from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type from rmm.librmm.device_buffer cimport device_buffer +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: cdef pair[unique_ptr[device_buffer], size_type] bools_to_mask ( - column_view input + column_view input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] mask_to_bools ( - bitmask_type* bitmask, size_type begin_bit, size_type end_bit + bitmask_type* bitmask, size_type begin_bit, size_type end_bit, + cuda_stream_view stream ) except +libcudf_exception_handler cdef pair[unique_ptr[device_buffer], size_type] nans_to_nulls( - column_view input + column_view input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] compute_column( table_view table, - expression expr + expression expr, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] transform( const vector[column_view] & inputs, const string & transform_udf, data_type output_type, - bool is_ptx + bool is_ptx, + optional[void *] user_data, + cuda_stream_view stream ) except +libcudf_exception_handler cdef pair[unique_ptr[table], unique_ptr[column]] encode( - table_view input + table_view input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef pair[unique_ptr[column], table_view] one_hot_encode( column_view input_column, - column_view categories - ) except + + column_view categories, + cuda_stream_view stream + ) except +libcudf_exception_handler cdef unique_ptr[column] compute_column( const table_view table, - const expression& expr + const expression& expr, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/unary.pxd b/python/pylibcudf/pylibcudf/libcudf/unary.pxd index 5027e528660..3a68b948fea 100644 --- a/python/pylibcudf/pylibcudf/libcudf/unary.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/unary.pxd @@ -6,6 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/unary.hpp" namespace "cudf" nogil: @@ -38,21 +39,27 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil: cdef extern unique_ptr[column] unary_operation( column_view input, - unary_operator op) except +libcudf_exception_handler + unary_operator op, + cuda_stream_view stream) except +libcudf_exception_handler cdef extern unique_ptr[column] is_null( - column_view input + column_view input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef extern unique_ptr[column] is_valid( - column_view input + column_view input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef extern unique_ptr[column] cast( column_view input, - data_type out_type) except +libcudf_exception_handler + data_type out_type, + cuda_stream_view stream) except +libcudf_exception_handler cdef extern bool is_supported_cast(data_type from_, data_type to) noexcept cdef extern unique_ptr[column] is_nan( - column_view input + column_view 
input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef extern unique_ptr[column] is_not_nan( - column_view input + column_view input, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index 45f79158055..09fc4cf72bd 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool from pylibcudf.libcudf.types cimport bitmask_type, data_type +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .expressions cimport Expression @@ -9,19 +10,25 @@ from .table cimport Table from .types cimport DataType -cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) +cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input, Stream stream = *) -cpdef Column compute_column(Table input, Expression expr) +cpdef Column compute_column(Table input, Expression expr, Stream stream = *) -cpdef tuple[gpumemoryview, int] bools_to_mask(Column input) +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input, Stream stream = *) -cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit) +cpdef Column mask_to_bools( + Py_ssize_t bitmask, + int begin_bit, + int end_bit, + Stream stream = *, +) cpdef Column transform(list[Column] inputs, str transform_udf, DataType output_type, - bool is_ptx) + bool is_ptx, + Stream stream = *) -cpdef tuple[Table, Column] encode(Table input) +cpdef tuple[Table, Column] encode(Table input, Stream stream = *) -cpdef Table one_hot_encode(Column input_column, Column categories) +cpdef Table one_hot_encode(Column input_column, Column categories, Stream stream = *) diff --git a/python/pylibcudf/pylibcudf/transform.pyi b/python/pylibcudf/pylibcudf/transform.pyi index ff7c43115bd..b38d19d732a 100644 --- a/python/pylibcudf/pylibcudf/transform.pyi +++ b/python/pylibcudf/pylibcudf/transform.pyi @@ -1,19 +1,34 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.expressions import Expression from pylibcudf.gpumemoryview import gpumemoryview from pylibcudf.table import Table from pylibcudf.types import DataType -def nans_to_nulls(input: Column) -> tuple[gpumemoryview, int]: ... -def compute_column(input: Table, expr: Expression) -> Column: ... -def bools_to_mask(input: Column) -> tuple[gpumemoryview, int]: ... -def mask_to_bools(bitmask: int, begin_bit: int, end_bit: int) -> Column: ... +def nans_to_nulls( + input: Column, stream: Stream | None = None +) -> tuple[gpumemoryview, int]: ... +def compute_column( + input: Table, expr: Expression, stream: Stream | None = None +) -> Column: ... +def bools_to_mask( + input: Column, stream: Stream | None = None +) -> tuple[gpumemoryview, int]: ... +def mask_to_bools( + bitmask: int, begin_bit: int, end_bit: int, stream: Stream | None = None +) -> Column: ... def transform( inputs: list[Column], transform_udf: str, output_type: DataType, is_ptx: bool, + stream: Stream | None = None, ) -> Column: ... -def encode(input: Table) -> tuple[Table, Column]: ... -def one_hot_encode(input: Column, categories: Column) -> Table: ... +def encode( + input: Table, stream: Stream | None = None +) -> tuple[Table, Column]: ... +def one_hot_encode( + input: Column, categories: Column, stream: Stream | None = None +) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index a06e6e43dd8..0581bef2b19 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -3,6 +3,7 @@ from cython.operator cimport dereference from libcpp.memory cimport unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector from libcpp.utility cimport move, pair @@ -16,10 +17,12 @@ from pylibcudf.libcudf.types cimport bitmask_type, size_type from rmm.librmm.device_buffer cimport device_buffer from rmm.pylibrmm.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .gpumemoryview cimport gpumemoryview from .types cimport DataType +from .utils cimport _get_stream __all__ = [ "bools_to_mask", @@ -31,7 +34,10 @@ __all__ = [ "transform", ] -cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): +cpdef tuple[gpumemoryview, int] nans_to_nulls( + Column input, + Stream stream=None, +): """Create a null mask preserving existing nulls and converting nans to null. For details, see :cpp:func:`nans_to_nulls`. @@ -47,16 +53,18 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): """ cdef pair[unique_ptr[device_buffer], size_type] c_result + stream = _get_stream(stream) + with nogil: - c_result = cpp_transform.nans_to_nulls(input.view()) + c_result = cpp_transform.nans_to_nulls(input.view(), stream.view()) return ( - gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first), stream)), c_result.second ) -cpdef Column compute_column(Table input, Expression expr): +cpdef Column compute_column(Table input, Expression expr, Stream stream=None): """Create a column by evaluating an expression on a table. For details see :cpp:func:`compute_column`. @@ -74,15 +82,20 @@ cpdef Column compute_column(Table input, Expression expr): """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) + with nogil: c_result = cpp_transform.compute_column( - input.view(), dereference(expr.c_obj.get()) + input.view(), dereference(expr.c_obj.get()), stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): +cpdef tuple[gpumemoryview, int] bools_to_mask( + Column input, + Stream stream=None, +): """Create a bitmask from a column of boolean elements Parameters @@ -97,16 +110,23 @@ cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): """ cdef pair[unique_ptr[device_buffer], size_type] c_result + stream = _get_stream(stream) + with nogil: - c_result = cpp_transform.bools_to_mask(input.view()) + c_result = cpp_transform.bools_to_mask(input.view(), stream.view()) return ( - gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first), stream)), c_result.second ) -cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): +cpdef Column mask_to_bools( + Py_ssize_t bitmask, + int begin_bit, + int end_bit, + Stream stream=None, +): """Creates a boolean column from given bitmask. 
Parameters @@ -126,16 +146,24 @@ cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): cdef unique_ptr[column] c_result cdef bitmask_type * bitmask_ptr = bitmask + stream = _get_stream(stream) + with nogil: - c_result = cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit) + c_result = cpp_transform.mask_to_bools( + bitmask_ptr, + begin_bit, + end_bit, + stream.view(), + ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column transform(list[Column] inputs, str transform_udf, DataType output_type, - bool is_ptx): + bool is_ptx, + Stream stream=None): """Create a new column by applying a transform function against multiple input columns. @@ -160,18 +188,26 @@ cpdef Column transform(list[Column] inputs, cdef unique_ptr[column] c_result cdef string c_transform_udf = transform_udf.encode() cdef bool c_is_ptx = is_ptx + cdef optional[void *] user_data + + stream = _get_stream(stream) for input in inputs: c_inputs.push_back((input).view()) with nogil: c_result = cpp_transform.transform( - c_inputs, c_transform_udf, output_type.c_obj, c_is_ptx + c_inputs, + c_transform_udf, + output_type.c_obj, + c_is_ptx, + user_data, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef tuple[Table, Column] encode(Table input): +cpdef tuple[Table, Column] encode(Table input, Stream stream=None): """Encode the rows of the given table as integers. Parameters @@ -187,15 +223,21 @@ cpdef tuple[Table, Column] encode(Table input): """ cdef pair[unique_ptr[table], unique_ptr[column]] c_result + stream = _get_stream(stream) + with nogil: - c_result = cpp_transform.encode(input.view()) + c_result = cpp_transform.encode(input.view(), stream.view()) return ( - Table.from_libcudf(move(c_result.first)), - Column.from_libcudf(move(c_result.second)) + Table.from_libcudf(move(c_result.first), stream), + Column.from_libcudf(move(c_result.second), stream) ) -cpdef Table one_hot_encode(Column input, Column categories): +cpdef Table one_hot_encode( + Column input, + Column categories, + Stream stream=None, +): """Encodes `input` by generating a new column for each value in `categories` indicating the presence of that value in `input`. @@ -215,11 +257,18 @@ cpdef Table one_hot_encode(Column input, Column categories): cdef pair[unique_ptr[column], table_view] c_result cdef Table owner_table + stream = _get_stream(stream) + with nogil: - c_result = cpp_transform.one_hot_encode(input.view(), categories.view()) + c_result = cpp_transform.one_hot_encode( + input.view(), + categories.view(), + stream.view(), + ) owner_table = Table( - [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() + [Column.from_libcudf(move(c_result.first), stream)] + * c_result.second.num_columns() ) return Table.from_table_view(c_result.second, owner_table) diff --git a/python/pylibcudf/pylibcudf/unary.pxd b/python/pylibcudf/pylibcudf/unary.pxd index 9ee08653599..b2414458d1c 100644 --- a/python/pylibcudf/pylibcudf/unary.pxd +++ b/python/pylibcudf/pylibcudf/unary.pxd @@ -1,22 +1,23 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libcpp cimport bool from pylibcudf.libcudf.unary cimport unary_operator +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .types cimport DataType -cpdef Column unary_operation(Column input, unary_operator op) +cpdef Column unary_operation(Column input, unary_operator op, Stream stream = *) -cpdef Column is_null(Column input) +cpdef Column is_null(Column input, Stream stream = *) -cpdef Column is_valid(Column input) +cpdef Column is_valid(Column input, Stream stream = *) -cpdef Column cast(Column input, DataType data_type) +cpdef Column cast(Column input, DataType data_type, Stream stream = *) -cpdef Column is_nan(Column input) +cpdef Column is_nan(Column input, Stream stream = *) -cpdef Column is_not_nan(Column input) +cpdef Column is_not_nan(Column input, Stream stream = *) cpdef bool is_supported_cast(DataType from_, DataType to) diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi index 4959e163125..28e0bb59327 100644 --- a/python/pylibcudf/pylibcudf/unary.pyi +++ b/python/pylibcudf/pylibcudf/unary.pyi @@ -2,6 +2,8 @@ from enum import IntEnum +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.types import DataType @@ -31,10 +33,14 @@ class UnaryOperator(IntEnum): NOT = ... NEGATE = ... -def unary_operation(input: Column, op: UnaryOperator) -> Column: ... -def is_null(input: Column) -> Column: ... -def is_valid(input: Column) -> Column: ... -def cast(input: Column, data_type: DataType) -> Column: ... -def is_nan(input: Column) -> Column: ... -def is_not_nan(input: Column) -> Column: ... +def unary_operation( + input: Column, op: UnaryOperator, stream: Stream | None = None +) -> Column: ... +def is_null(input: Column, stream: Stream | None = None) -> Column: ... +def is_valid(input: Column, stream: Stream | None = None) -> Column: ... +def cast( + input: Column, data_type: DataType, stream: Stream | None = None +) -> Column: ... +def is_nan(input: Column, stream: Stream | None = None) -> Column: ... +def is_not_nan(input: Column, stream: Stream | None = None) -> Column: ... def is_supported_cast(from_: DataType, to: DataType) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx index 3915ed8274a..e68b780fdc1 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -6,12 +6,14 @@ from libcpp.utility cimport move from pylibcudf.libcudf cimport unary as cpp_unary from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.unary cimport unary_operator +from rmm.pylibrmm.stream cimport Stream from pylibcudf.libcudf.unary import \ unary_operator as UnaryOperator # no-cython-lint from .column cimport Column from .types cimport DataType +from .utils cimport _get_stream __all__ = [ "UnaryOperator", @@ -24,7 +26,7 @@ __all__ = [ "unary_operation", ] -cpdef Column unary_operation(Column input, unary_operator op): +cpdef Column unary_operation(Column input, unary_operator op, Stream stream=None): """Perform a unary operation on a column. For details, see :cpp:func:`unary_operation`. 
@@ -43,13 +45,15 @@ cpdef Column unary_operation(Column input, unary_operator op): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_unary.unary_operation(input.view(), op) + result = cpp_unary.unary_operation(input.view(), op, stream.view()) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) -cpdef Column is_null(Column input): +cpdef Column is_null(Column input, Stream stream=None): """Check whether elements of a column are null. For details, see :cpp:func:`is_null`. @@ -66,13 +70,15 @@ cpdef Column is_null(Column input): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_unary.is_null(input.view()) + result = cpp_unary.is_null(input.view(), stream.view()) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) -cpdef Column is_valid(Column input): +cpdef Column is_valid(Column input, Stream stream=None): """Check whether elements of a column are valid. For details, see :cpp:func:`is_valid`. @@ -89,13 +95,15 @@ cpdef Column is_valid(Column input): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_unary.is_valid(input.view()) + result = cpp_unary.is_valid(input.view(), stream.view()) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) -cpdef Column cast(Column input, DataType data_type): +cpdef Column cast(Column input, DataType data_type, Stream stream=None): """Cast a column to a different data type. For details, see :cpp:func:`cast`. @@ -114,13 +122,15 @@ cpdef Column cast(Column input, DataType data_type): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_unary.cast(input.view(), data_type.c_obj) + result = cpp_unary.cast(input.view(), data_type.c_obj, stream.view()) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) -cpdef Column is_nan(Column input): +cpdef Column is_nan(Column input, Stream stream=None): """Check whether elements of a column are nan. For details, see :cpp:func:`is_nan`. @@ -137,13 +147,15 @@ cpdef Column is_nan(Column input): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_unary.is_nan(input.view()) + result = cpp_unary.is_nan(input.view(), stream.view()) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) -cpdef Column is_not_nan(Column input): +cpdef Column is_not_nan(Column input, Stream stream=None): """Check whether elements of a column are not nan. For details, see :cpp:func:`is_not_nan`. @@ -160,10 +172,12 @@ cpdef Column is_not_nan(Column input): """ cdef unique_ptr[column] result + stream = _get_stream(stream) + with nogil: - result = cpp_unary.is_not_nan(input.view()) + result = cpp_unary.is_not_nan(input.view(), stream.view()) - return Column.from_libcudf(move(result)) + return Column.from_libcudf(move(result), stream) cpdef bool is_supported_cast(DataType from_, DataType to): """Check if a cast between datatypes is supported. From 91aeaa747401a89d9d9888effa78ab3951fd3b79 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 12 Aug 2025 08:13:26 -0400 Subject: [PATCH 103/366] Require `--scale` for PDS-DS benchmarks (due to nonlinear scaling) (#19631) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contributes to #19200. 
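
For context, the PDS-H inference that stays in place recovers the scale
factor from a single table's row count. A rough sketch of that approach
(illustrative path; the 10,000-rows-per-scale-factor constant is the TPC-H
supplier cardinality, not code from this PR):

```python
# Rough sketch of row-count-based scale inference; valid for PDS-H only.
import polars as pl

# TPC-H generates exactly 10,000 supplier rows per unit of scale factor,
# so one row count is enough to recover the scale factor.
supplier = pl.scan_parquet("/data/pdsh/scale-10/supplier.parquet")
num_rows = supplier.select(pl.len()).collect().item(0, 0)
scale_factor = num_rows / 10_000
```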
Unlike PDS-H, which scales linearly with row counts, PDS-DS tables scale nonlinearly, so we cannot infer the scale factor from a single table’s row count. In the future, we can relax this requirement by maintaining a map of scale factors to expected row counts for PDS-DS tables. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19631 --- .../experimental/benchmarks/pdsds.py | 1 + .../experimental/benchmarks/pdsh.py | 2 ++ .../experimental/benchmarks/utils.py | 26 +++++++++++++------ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py index 904ae633d73..b9daef51ec4 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py @@ -72,6 +72,7 @@ class PDSDSQueries(metaclass=PDSDSQueriesMeta): """Base class for query loading.""" q_impl: str + name: str = "pdsds" class PDSDSPolarsQueries(PDSDSQueries): diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsh.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsh.py index 4cb5911ecae..960d8449589 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsh.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsh.py @@ -39,6 +39,8 @@ class PDSHQueries: """PDS-H query definitions.""" + name: str = "pdsh" + @staticmethod def q0(run_config: RunConfig) -> pl.LazyFrame: """Query 0.""" diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py index 11e6442baab..c85fbf8319e 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py @@ -16,7 +16,6 @@ import time from collections import defaultdict from datetime import datetime, timezone -from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, assert_never import nvtx @@ -41,6 +40,7 @@ if TYPE_CHECKING: from collections.abc import Callable, Sequence + from pathlib import Path ExecutorType = Literal["in-memory", "streaming", "cpu"] @@ -155,9 +155,7 @@ def collect(cls) -> HardwareInfo: return cls(gpus=gpus) -def _infer_scale_factor(path: str | Path, suffix: str) -> int | float: - name = Path(sys.argv[0]).name - +def _infer_scale_factor(name: str, path: str | Path, suffix: str) -> int | float: if "pdsh" in name: supplier = get_data(path, "supplier", suffix) num_rows = supplier.select(pl.len()).collect().item(0, 0) @@ -176,7 +174,7 @@ def _infer_scale_factor(path: str | Path, suffix: str) -> int | float: @dataclasses.dataclass(kw_only=True) class RunConfig: - """Results for a PDS-H query run.""" + """Results for a PDS-H or PDS-DS query run.""" queries: list[int] suffix: str @@ -203,6 +201,7 @@ class RunConfig: rapidsmpf_oom_protection: bool rapidsmpf_spill: bool spill_device: float + query_set: str @classmethod def from_args(cls, args: argparse.Namespace) -> RunConfig: @@ -214,12 +213,21 @@ def from_args(cls, args: argparse.Namespace) -> RunConfig: scheduler = None path = args.path - if (scale_factor := args.scale) is None: + name = args.query_set + scale_factor = args.scale + + if scale_factor is None: + if "pdsds" in name: + raise ValueError( + "--scale is required for PDS-DS benchmarks.\n" + "TODO: This will be inferred once we maintain a 
map of scale factors to row counts."
+                )
             if path is None:
                 raise ValueError(
                     "Must specify --root and --scale if --path is not specified."
                 )
-            scale_factor = _infer_scale_factor(path, args.suffix)
+            # For PDS-H, infer scale factor based on row count
+            scale_factor = _infer_scale_factor(name, path, args.suffix)
         if path is None:
             path = f"{args.root}/scale-{scale_factor}"
         try:
@@ -229,7 +237,7 @@ class RunConfig:
 
         if args.scale is not None:
             # Validate the user-supplied scale factor
-            sf_inf = _infer_scale_factor(path, args.suffix)
+            sf_inf = _infer_scale_factor(name, path, args.suffix)
             rel_error = abs((scale_factor - sf_inf) / sf_inf)
             if rel_error > 0.01:
                 raise ValueError(
@@ -255,6 +263,7 @@ class RunConfig:
             spill_device=args.spill_device,
             rapidsmpf_spill=args.rapidsmpf_spill,
             max_rows_per_partition=args.max_rows_per_partition,
+            query_set=args.query_set,
         )
 
     def serialize(self, engine: pl.GPUEngine | None) -> dict:
@@ -681,6 +690,7 @@ def run_polars(
 ) -> None:
     """Run the queries using the given benchmark and executor options."""
     args = parse_args(options, num_queries=num_queries)
+    vars(args).update({"query_set": benchmark.name})
     run_config = RunConfig.from_args(args)
 
     validation_failures: list[int] = []

From bf63a7d37cb2a1ee9900ebea4f281d9292b4a958 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 12 Aug 2025 10:37:00 -0400
Subject: [PATCH 104/366] Use public Arrow functions for TDigest in
 PercentileApproxInputTypesTests (#19648)

Changes the `PercentileApproxInputTypesTests` logic in
`percentile_approx_test.cpp` to use the public Arrow functions to compute
the tdigest values instead of `internal` or `detail` functions.
This required enabling `ARROW_COMPUTE=ON` in `get_arrow.cmake` so that
libarrow.so is built with the Arrow compute functions.
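
For reference, the same public `tdigest` kernel that the test now invokes
through `arrow::compute::CallFunction` is also reachable from Python via
pyarrow; a minimal sketch (values and percentiles are illustrative, not
taken from this change):

```python
# Minimal sketch of Arrow's public t-digest kernel via pyarrow.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1.0, 2.0, 3.0, 4.0, 5.0], type=pa.float64())
# Approximate the 25th/50th/75th percentiles from the t-digest sketch.
quantiles = pc.tdigest(arr, q=[0.25, 0.5, 0.75])
print(quantiles.to_pylist())  # roughly [2.0, 3.0, 4.0]
```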
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/19648 --- cpp/cmake/thirdparty/get_arrow.cmake | 14 +++++++++-- .../quantiles/percentile_approx_test.cpp | 24 ++++++++++++++----- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index c519fa687c3..8293f96fb5b 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -82,8 +82,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static - parquet_static arrow_acero_static arrow_dataset_static + GLOBAL_TARGETS + arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_compute_shared + arrow_static parquet_static arrow_acero_static arrow_dataset_static arrow_compute_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -91,6 +92,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P EXCLUDE_FROM_ALL ${EXCLUDE_FROM_ALL} OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" "ARROW_ACERO ON" + "ARROW_COMPUTE ON" "ARROW_IPC ON" "ARROW_DATASET ON" "ARROW_WITH_BACKTRACE ON" @@ -145,6 +147,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P # us set(ArrowDataset_DIR "${Arrow_DIR}") find_package(ArrowDataset REQUIRED QUIET) + # Set this to enable `find_package(ArrowCompute)` + set(ArrowCompute_DIR "${Arrow_DIR}") + find_package(ArrowCompute REQUIRED QUIET) endif() # Arrow_ADDED: set if CPM downloaded Arrow from Github elseif(Arrow_ADDED) @@ -288,6 +293,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P NAMESPACE cudf:: FINAL_CODE_BLOCK arrow_dataset_code_string ) + set(parquet_code_string [=[ if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) @@ -320,6 +326,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P rapids_export_package(BUILD Parquet cudf-exports) rapids_export_package(BUILD ArrowDataset cudf-exports) endif() + rapids_export_package(BUILD ArrowCompute cudf-exports) include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( @@ -335,6 +342,9 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P EXPORT_SET cudf-exports CONDITION ENABLE_PARQUET ) + rapids_export_find_package_root( + BUILD ArrowCompute [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) endif() set(ARROW_LIBRARIES diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 086f97c03af..a65f4766159 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -31,7 +31,8 @@ #include #include -#include +#include +#include namespace { std::unique_ptr arrow_percentile_approx(cudf::column_view const& _values, @@ -59,17 +60,28 @@ std::unique_ptr arrow_percentile_approx(cudf::column_view const& _ } // generate the tdigest - arrow::internal::TDigest atd(delta, sorted_values.size() * 2); + arrow::DoubleBuilder builder; for (size_t idx = 0; idx < h_values.size(); idx++) { - if (sorted_values.null_mask() == nullptr || h_validity[idx]) { 
atd.Add(h_values[idx]); }
+    if (sorted_values.null_mask() == nullptr || h_validity[idx]) {
+      EXPECT_TRUE(builder.Append(h_values[idx]).ok());
+    }
   }
+  std::shared_ptr<arrow::Array> array;
+  EXPECT_TRUE(builder.Finish(&array).ok());
+
+  auto const udelta = static_cast<uint32_t>(delta);
+  auto const usize  = static_cast<uint32_t>(h_values.size()) * 2;
+  arrow::compute::TDigestOptions options{percentages, udelta, usize};
+
+  auto arrow_result = arrow::compute::CallFunction("tdigest", {array}, &options);
+  auto result_array = arrow_result.ValueOrDie().array_as<arrow::DoubleArray>();

-  // generate the percentiles and stuff them into a list column
+  // copy the percentiles and stuff them into a list column
   std::vector<double> h_result;
   h_result.reserve(percentages.size());
   std::transform(
-    percentages.begin(), percentages.end(), std::back_inserter(h_result), [&atd](double p) {
-      return atd.Quantile(p);
+    result_array->begin(), result_array->end(), std::back_inserter(h_result), [](auto p) {
+      return p.value();
     });
   cudf::test::fixed_width_column_wrapper<double> result(h_result.begin(), h_result.end());
   cudf::test::fixed_width_column_wrapper offsets{

From d6533a294e8fd3ce75a6e149c8c0aabd15d86f14 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 12 Aug 2025 10:57:34 -0400
Subject: [PATCH 105/366] Set scalar to valid in range_window_bounds
 unbounded/current_row (#19622)

Fixes the `cudf::range_window_bounds` `current_row()` and `unbounded()`
functions to return valid scalars.
Also adds a check for an invalid scalar parameter in the
`range_comparable_value` function.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)
  - MithunR (https://github.com/mythrocks)
  - Basit Ayantunde (https://github.com/lamarrr)

URL: https://github.com/rapidsai/cudf/pull/19622
---
 cpp/src/rolling/detail/range_window_bounds.hpp |  4 +++-
 cpp/src/rolling/range_window_bounds.cpp        | 10 +++++++---
 cpp/tests/rolling/range_window_bounds_test.cpp | 11 +++++------
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/cpp/src/rolling/detail/range_window_bounds.hpp b/cpp/src/rolling/detail/range_window_bounds.hpp
index 62e28cf47c4..0536b450209 100644
--- a/cpp/src/rolling/detail/range_window_bounds.hpp
+++ b/cpp/src/rolling/detail/range_window_bounds.hpp
@@ -139,7 +139,9 @@ range_rep_type range_comparable_value(range_window_bounds const& ra
   rmm::cuda_stream_view stream)
 {
   auto const& range_scalar = range_bounds.range_scalar();
-  using range_type = cudf::detail::range_type;
+  CUDF_EXPECTS(
+    range_scalar.is_valid(stream), "Range bounds scalar must be valid.", std::invalid_argument);
+  using range_type = cudf::detail::range_type;
   CUDF_EXPECTS(range_scalar.type().id() == cudf::type_to_id(),
                "Range bounds scalar must match the type of the orderby column.");

diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp
index 7f698dfcd6b..2e4a26f720b 100644
--- a/cpp/src/rolling/range_window_bounds.cpp
+++ b/cpp/src/rolling/range_window_bounds.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License. 
@@ -78,12 +78,16 @@ range_window_bounds range_window_bounds::unbounded(data_type type, rmm::cuda_stream_view stream)
 {
-  return {extent_type::UNBOUNDED, make_default_constructed_scalar(type, stream), stream};
+  auto s = make_default_constructed_scalar(type, stream);
+  s->set_valid_async(true, stream);
+  return {extent_type::UNBOUNDED, std::move(s), stream};
 }
 
 range_window_bounds range_window_bounds::current_row(data_type type, rmm::cuda_stream_view stream)
 {
-  return {extent_type::CURRENT_ROW, make_default_constructed_scalar(type, stream), stream};
+  auto s = make_default_constructed_scalar(type, stream);
+  s->set_valid_async(true, stream);
+  return {extent_type::CURRENT_ROW, std::move(s), stream};
 }
 
 range_window_bounds range_window_bounds::get(scalar const& boundary, rmm::cuda_stream_view stream)
diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp
index a67555280f4..df551f96e50 100644
--- a/cpp/tests/rolling/range_window_bounds_test.cpp
+++ b/cpp/tests/rolling/range_window_bounds_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -116,21 +116,20 @@ TYPED_TEST(NumericRangeWindowBoundsTest, BoundsConstruction)
   using range_type = cudf::detail::range_type<TypeParam>;
   using rep_type   = cudf::detail::range_rep_type<TypeParam>;
   auto const dtype = cudf::data_type{cudf::type_to_id<TypeParam>()};
+  auto const stream = cudf::get_default_stream();
 
   static_assert(std::is_integral_v<range_type>);
   auto range_3 = cudf::range_window_bounds::get(cudf::numeric_scalar<TypeParam>{3, true});
   EXPECT_FALSE(range_3.is_unbounded() &&
               "range_window_bounds constructed from scalar cannot be unbounded.");
-  EXPECT_EQ(
-    cudf::detail::range_comparable_value<TypeParam>(range_3, dtype, cudf::get_default_stream()),
-    rep_type{3});
+  EXPECT_EQ(cudf::detail::range_comparable_value<TypeParam>(range_3, dtype, stream), rep_type{3});
 
   auto range_unbounded =
     cudf::range_window_bounds::unbounded(cudf::data_type{cudf::type_to_id<TypeParam>()});
   EXPECT_TRUE(range_unbounded.is_unbounded() &&
              "range_window_bounds::unbounded() must return an unbounded range.");
-  EXPECT_EQ(cudf::detail::range_comparable_value<TypeParam>(
-              range_unbounded, dtype, cudf::get_default_stream()),
+  EXPECT_TRUE(range_unbounded.range_scalar().is_valid(stream));
+  EXPECT_EQ(cudf::detail::range_comparable_value<TypeParam>(range_unbounded, dtype, stream),
             rep_type{});
 }

From 4bb819bd0cb9b30227f7cf59dbbdc1b60183ad05 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 12 Aug 2025 09:18:30 -0700
Subject: [PATCH 106/366] Clean and move test_join_order/interpolate/onehot.py
 to new cudf classic test directory structure (#19662)

Towards https://github.com/rapidsai/cudf/issues/9999
Towards https://github.com/rapidsai/cudf/issues/15723

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/19662
---
 .../dataframe/methods/test_interpolate.py    |  64 ++++
 .../test_get_dummies.py}                     |  40 ++-
 python/cudf/cudf/tests/reshape/test_merge.py | 270 +++++++++++++++-
 .../{ => series/methods}/test_interpolate.py |  73 +----
 python/cudf/cudf/tests/test_join_order.py    | 287 ------------------
 5 files changed, 368 insertions(+), 366 deletions(-)
 create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_interpolate.py
rename python/cudf/cudf/tests/{test_onehot.py => general_functions/test_get_dummies.py} (82%) rename python/cudf/cudf/tests/{ => series/methods}/test_interpolate.py (50%) delete mode 100644 python/cudf/cudf/tests/test_join_order.py diff --git a/python/cudf/cudf/tests/dataframe/methods/test_interpolate.py b/python/cudf/cudf/tests/dataframe/methods/test_interpolate.py new file mode 100644 index 00000000000..506d0677178 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_interpolate.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize( + "data", + [ + # basics + {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]}, + {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]}, + {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]}, + ], +) +def test_interpolate_dataframe(data): + # Pandas interpolate methods do not seem to work + # with nullable dtypes yet, so this method treats + # NAs as NaNs + # https://github.com/pandas-dev/pandas/issues/40252 + axis = 0 + method = "linear" + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expect = pdf.interpolate(method=method, axis=axis) + got = gdf.interpolate(method=method, axis=axis) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data,kwargs", + [ + ( + {"A": ["a", "b", "c"], "B": ["d", "e", "f"]}, + {"axis": 0, "method": "linear"}, + ), + ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "forward"}), + ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "forward"}), + ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "backward"}), + ( + {"A": [1, 2, 3]}, + {"method": "backfill", "limit_direction": "backward"}, + ), + ], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not fail on older versions of pandas", +) +def test_interpolate_dataframe_error_cases(data, kwargs): + gsr = cudf.DataFrame(data) + psr = gsr.to_pandas() + + assert_exceptions_equal( + lfunc=psr.interpolate, + rfunc=gsr.interpolate, + lfunc_args_and_kwargs=([], kwargs), + rfunc_args_and_kwargs=([], kwargs), + ) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/general_functions/test_get_dummies.py similarity index 82% rename from python/cudf/cudf/tests/test_onehot.py rename to python/cudf/cudf/tests/general_functions/test_get_dummies.py index b85882a79f5..6108f4b7b7f 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/general_functions/test_get_dummies.py @@ -22,13 +22,16 @@ (range(10), [1, 2, 3, 4, 5] * 2), ], ) -@pytest.mark.parametrize("dtype", ["bool", "uint8"]) -def test_get_dummies(data, index, dtype): +def test_get_dummies(data, index, numeric_and_bool_types_as_str): pdf = pd.DataFrame({"x": data}, index=index) gdf = cudf.from_pandas(pdf) - encoded_expected = pd.get_dummies(pdf, prefix="test", dtype=dtype) - encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=dtype) + encoded_expected = pd.get_dummies( + pdf, prefix="test", dtype=numeric_and_bool_types_as_str + ) + encoded_actual = cudf.get_dummies( + gdf, prefix="test", dtype=numeric_and_bool_types_as_str + ) assert_eq( encoded_expected, @@ -37,8 +40,8 @@ def test_get_dummies(data, index, dtype): ) -@pytest.mark.parametrize("n_cols", [5, 10, 20]) -def test_onehot_get_dummies_multicol(n_cols): +def test_onehot_get_dummies_multicol(): + n_cols = 5 n_categories 
= 5 data = dict( zip( @@ -57,9 +60,15 @@ def test_onehot_get_dummies_multicol(n_cols): assert_eq(encoded_expected, encoded_actual) -@pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize("dummy_na", [True, False]) -def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na): +def test_get_dummies_dummy_na(request, nan_as_null, dummy_na): + request.applymarker( + pytest.mark.xfail( + nan_as_null is None, + reason=f"Incorrect cuDF result with {nan_as_null=}", + ) + ) + df = cudf.DataFrame({"a": [0, 1, np.nan]}, nan_as_null=nan_as_null) pdf = df.to_pandas(nullable=nan_as_null) @@ -133,17 +142,24 @@ def test_get_dummies_with_nan(): ) @pytest.mark.parametrize("prefix_sep", ["-", "#"]) @pytest.mark.parametrize("prefix", [None, "hi"]) -@pytest.mark.parametrize("dtype", ["uint8", "int16"]) -def test_get_dummies_array_like(data, prefix_sep, prefix, dtype): +def test_get_dummies_array_like( + data, prefix_sep, prefix, numeric_and_bool_types_as_str +): data = data() pd_data = data.to_pandas() expected = pd.get_dummies( - pd_data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype + pd_data, + prefix=prefix, + prefix_sep=prefix_sep, + dtype=numeric_and_bool_types_as_str, ) actual = cudf.get_dummies( - data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype + data, + prefix=prefix, + prefix_sep=prefix_sep, + dtype=numeric_and_bool_types_as_str, ) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/reshape/test_merge.py b/python/cudf/cudf/tests/reshape/test_merge.py index f661eb4b587..cd11f36448b 100644 --- a/python/cudf/cudf/tests/reshape/test_merge.py +++ b/python/cudf/cudf/tests/reshape/test_merge.py @@ -1,11 +1,19 @@ # Copyright (c) 2025, NVIDIA CORPORATION. - +import itertools +import operator +import string +from collections import defaultdict import numpy as np import pandas as pd import pytest import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -31,6 +39,11 @@ def how(request): return request.param +@pytest.fixture(params=[False, True]) +def sort(request): + return request.param + + def assert_join_results_equal(expect, got, how, **kwargs): if how == "right": got = got[expect.columns] @@ -1348,3 +1361,258 @@ def test_merge_datetime_timedelta_error(temporal_types_as_str): with pytest.raises(TypeError): df1.merge(df2) + + +if PANDAS_GE_220: + # Behaviour in sort=False case didn't match documentation in many + # cases prior to https://github.com/pandas-dev/pandas/pull/54611 + # (released as part of pandas 2.2) + def expected(left, right, sort, *, how): + left = left.to_pandas() + right = right.to_pandas() + return left.merge(right, on="key", how=how, sort=sort) + +else: + + def expect_inner(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val, strict=True): + if k not in right_have: + continue + for i in right_have[k]: + keys.append(k) + val_x.append(v) + val_y.append(right_val[i]) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. 
+ keys, val_x, val_y = zip( + *sorted( + zip(keys, val_x, val_y, strict=True), + key=operator.itemgetter(0), + ), + strict=True, + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_left(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val, strict=True): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. + keys, val_x, val_y = zip( + *sorted( + zip(keys, val_x, val_y, strict=True), + key=operator.itemgetter(0), + ), + strict=True, + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_outer(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val, strict=True): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + left_have = set(left_key) + for k, v in zip(right_key, right_val, strict=True): + if k not in left_have: + keys.append(k) + val_x.append(None) + val_y.append(v) + + # Python sort is stable, so this will preserve input order for + # equal items. 
+ # outer joins are always sorted, but we test both sort values + keys, val_x, val_y = zip( + *sorted( + zip(keys, val_x, val_y, strict=True), + key=operator.itemgetter(0), + ), + strict=True, + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expected(left, right, sort, *, how): + if how == "inner": + return expect_inner(left, right, sort) + elif how == "outer": + return expect_outer(left, right, sort) + elif how == "left": + return expect_left(left, right, sort) + elif how == "right": + return expect_left(right, left, sort).rename( + {"val_x": "val_y", "val_y": "val_x"}, axis=1 + ) + else: + raise NotImplementedError() + + +def test_join_ordering_pandas_compat(request, sort, how): + if how in ["leftanti", "leftsemi", "cross"]: + pytest.skip(f"Test not applicable for {how}") + request.applymarker( + pytest.mark.xfail( + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION + and how == "right", + reason="TODO: Result ordering of suffixed columns is incorrect", + ) + ) + left_key = [1, 3, 2, 1, 1, 2, 5, 1, 4, 5, 8, 12, 12312, 1] * 100 + left_val = range(len(left_key)) + left = cudf.DataFrame({"key": left_key, "val": left_val}) + right_key = [12312, 12312, 3, 2, 1, 1, 5, 7, 2] * 200 + right_val = list( + itertools.islice(itertools.cycle(string.ascii_letters), len(right_key)) + ) + right = cudf.DataFrame({"key": right_key, "val": right_val}) + with cudf.option_context("mode.pandas_compatible", True): + actual = left.merge(right, on="key", how=how, sort=sort) + expect = expected(left, right, sort, how=how) + assert_eq(expect, actual) + + +@pytest.mark.parametrize("on_index", [True, False]) +@pytest.mark.parametrize("left_unique", [True, False]) +@pytest.mark.parametrize("left_monotonic", [True, False]) +@pytest.mark.parametrize("right_unique", [True, False]) +@pytest.mark.parametrize("right_monotonic", [True, False]) +def test_merge_combinations( + request, + how, + sort, + on_index, + left_unique, + left_monotonic, + right_unique, + right_monotonic, +): + if how in ["leftanti", "leftsemi", "cross"]: + pytest.skip(f"Test not applicable for {how}") + request.applymarker( + pytest.mark.xfail( + condition=how == "outer" + and on_index + and left_unique + and not left_monotonic + and right_unique + and not right_monotonic, + reason="https://github.com/pandas-dev/pandas/issues/55992", + ) + ) + left = [2, 3] + if left_unique: + left.append(4 if left_monotonic else 1) + else: + left.append(3 if left_monotonic else 2) + + right = [2, 3] + if right_unique: + right.append(4 if right_monotonic else 1) + else: + right.append(3 if right_monotonic else 2) + + left = cudf.DataFrame({"key": left}) + right = cudf.DataFrame({"key": right}) + + if on_index: + left = left.set_index("key") + right = right.set_index("key") + on_kwargs = {"left_index": True, "right_index": True} + else: + on_kwargs = {"on": "key"} + + with cudf.option_context("mode.pandas_compatible", True): + result = cudf.merge(left, right, how=how, sort=sort, **on_kwargs) + if on_index: + left = left.reset_index() + right = right.reset_index() + + if how in ["left", "right", "inner"]: + if how in ["left", "inner"]: + expected, other, other_unique = left, right, right_unique + else: + expected, other, other_unique = right, left, left_unique + if how == "inner": + keep_values = set(left["key"].values_host).intersection( + right["key"].values_host + ) + keep_mask = expected["key"].isin(keep_values) + expected = expected[keep_mask] + if sort: + expected = expected.sort_values("key") + if not other_unique:
other_value_counts = other["key"].value_counts() + repeats = other_value_counts.reindex( + expected["key"].values, fill_value=1 + ) + repeats = repeats.astype(np.intp) + expected = expected["key"].repeat(repeats.values) + expected = expected.to_frame() + elif how == "outer": + if on_index and left_unique and left["key"].equals(right["key"]): + expected = cudf.DataFrame({"key": left["key"]}) + else: + left_counts = left["key"].value_counts() + right_counts = right["key"].value_counts() + expected_counts = left_counts.mul(right_counts, fill_value=1) + expected_counts = expected_counts.astype(np.intp) + expected = expected_counts.index.values_host.repeat( + expected_counts.values_host + ) + expected = cudf.DataFrame({"key": expected}) + expected = expected.sort_values("key") + + if on_index: + expected = expected.set_index("key") + else: + expected = expected.reset_index(drop=True) + + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/series/methods/test_interpolate.py similarity index 50% rename from python/cudf/cudf/tests/test_interpolate.py rename to python/cudf/cudf/tests/series/methods/test_interpolate.py index c76a49103e2..dd7acb2f3b7 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/series/methods/test_interpolate.py @@ -1,35 +1,11 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import pytest import cudf from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -@pytest.mark.parametrize( - "data", - [ - # basics - {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]}, - {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]}, - {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]}, - ], -) -@pytest.mark.parametrize("method", ["linear"]) -@pytest.mark.parametrize("axis", [0]) -def test_interpolate_dataframe(data, method, axis): - # Pandas interpolate methods do not seem to work - # with nullable dtypes yet, so this method treats - # NAs as NaNs - # https://github.com/pandas-dev/pandas/issues/40252 - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expect = pdf.interpolate(method=method, axis=axis) - got = gdf.interpolate(method=method, axis=axis) - assert_eq(expect, got) +from cudf.testing._utils import expect_warning_if @pytest.mark.skipif( @@ -49,9 +25,9 @@ def test_interpolate_dataframe(data, method, axis): [0.1, 0.2, 0.3], ], ) -@pytest.mark.parametrize("method", ["linear"]) -@pytest.mark.parametrize("axis", [0]) -def test_interpolate_series(data, method, axis): +def test_interpolate_series(data): + axis = 0 + method = "linear" gsr = cudf.Series(data) psr = gsr.to_pandas() @@ -64,11 +40,8 @@ def test_interpolate_series(data, method, axis): assert_eq(expect, got, check_dtype=psr.dtype != "object") -@pytest.mark.parametrize( - "data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])] -) -def test_interpolate_series_unsorted_index(data, index): - gsr = cudf.Series(data, index=index) +def test_interpolate_series_unsorted_index(): + gsr = cudf.Series([2.0, None, 4.0, None, 2.0], index=[1, 2, 3, 2, 1]) psr = gsr.to_pandas() expect = psr.interpolate(method="values") @@ -109,38 +82,6 @@ def test_interpolate_series_values_or_index(data, index, method): assert_eq(expect, got, check_dtype=psr.dtype != "object") -@pytest.mark.parametrize( - "data,kwargs", - [ - ( - {"A": ["a", "b", "c"], "B": ["d", "e", "f"]}, - {"axis": 0, "method": 
"linear"}, - ), - ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "forward"}), - ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "forward"}), - ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "backward"}), - ( - {"A": [1, 2, 3]}, - {"method": "backfill", "limit_direction": "backward"}, - ), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Does not fail on older versions of pandas", -) -def test_interpolate_dataframe_error_cases(data, kwargs): - gsr = cudf.DataFrame(data) - psr = gsr.to_pandas() - - assert_exceptions_equal( - lfunc=psr.interpolate, - rfunc=gsr.interpolate, - lfunc_args_and_kwargs=([], kwargs), - rfunc_args_and_kwargs=([], kwargs), - ) - - def test_interpolate_noop_new_column(): ser = cudf.Series([1.0, 2.0, 3.0]) result = ser.interpolate() diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py deleted file mode 100644 index 60ec93f5040..00000000000 --- a/python/cudf/cudf/tests/test_join_order.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. - -import itertools -import operator -import string -from collections import defaultdict - -import numpy as np -import pytest - -import cudf -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.testing import assert_eq - - -@pytest.fixture(params=[False, True], ids=["unsorted", "sorted"]) -def sort(request): - return request.param - - -@pytest.fixture -def left(): - left_key = [1, 3, 2, 1, 1, 2, 5, 1, 4, 5, 8, 12, 12312, 1] * 100 - left_val = list(range(len(left_key))) - return cudf.DataFrame({"key": left_key, "val": left_val}) - - -@pytest.fixture -def right(): - right_key = [12312, 12312, 3, 2, 1, 1, 5, 7, 2] * 200 - right_val = list( - itertools.islice(itertools.cycle(string.ascii_letters), len(right_key)) - ) - return cudf.DataFrame({"key": right_key, "val": right_val}) - - -# Behaviour in sort=False case didn't match documentation in many -# cases prior to https://github.com/pandas-dev/pandas/pull/54611 -# (released as part of pandas 2.2) -if PANDAS_GE_220: - # Behaviour in sort=False case didn't match documentation in many - # cases prior to https://github.com/pandas-dev/pandas/pull/54611 - # (released as part of pandas 2.2) - def expected(left, right, sort, *, how): - left = left.to_pandas() - right = right.to_pandas() - return left.merge(right, on="key", how=how, sort=sort) - -else: - - def expect_inner(left, right, sort): - left_key = left.key.values_host.tolist() - left_val = left.val.values_host.tolist() - right_key = right.key.values_host.tolist() - right_val = right.val.values_host.tolist() - - right_have = defaultdict(list) - for i, k in enumerate(right_key): - right_have[k].append(i) - keys = [] - val_x = [] - val_y = [] - for k, v in zip(left_key, left_val, strict=True): - if k not in right_have: - continue - for i in right_have[k]: - keys.append(k) - val_x.append(v) - val_y.append(right_val[i]) - - if sort: - # Python sort is stable, so this will preserve input order for - # equal items. 
- keys, val_x, val_y = zip( - *sorted( - zip(keys, val_x, val_y, strict=True), - key=operator.itemgetter(0), - ), - strict=True, - ) - return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) - - def expect_left(left, right, sort): - left_key = left.key.values_host.tolist() - left_val = left.val.values_host.tolist() - right_key = right.key.values_host.tolist() - right_val = right.val.values_host.tolist() - - right_have = defaultdict(list) - for i, k in enumerate(right_key): - right_have[k].append(i) - keys = [] - val_x = [] - val_y = [] - for k, v in zip(left_key, left_val, strict=True): - if k not in right_have: - right_vals = [None] - else: - right_vals = [right_val[i] for i in right_have[k]] - - for rv in right_vals: - keys.append(k) - val_x.append(v) - val_y.append(rv) - - if sort: - # Python sort is stable, so this will preserve input order for - # equal items. - keys, val_x, val_y = zip( - *sorted( - zip(keys, val_x, val_y, strict=True), - key=operator.itemgetter(0), - ), - strict=True, - ) - return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) - - def expect_outer(left, right, sort): - left_key = left.key.values_host.tolist() - left_val = left.val.values_host.tolist() - right_key = right.key.values_host.tolist() - right_val = right.val.values_host.tolist() - right_have = defaultdict(list) - for i, k in enumerate(right_key): - right_have[k].append(i) - keys = [] - val_x = [] - val_y = [] - for k, v in zip(left_key, left_val, strict=True): - if k not in right_have: - right_vals = [None] - else: - right_vals = [right_val[i] for i in right_have[k]] - for rv in right_vals: - keys.append(k) - val_x.append(v) - val_y.append(rv) - left_have = set(left_key) - for k, v in zip(right_key, right_val, strict=True): - if k not in left_have: - keys.append(k) - val_x.append(None) - val_y.append(v) - - # Python sort is stable, so this will preserve input order for - # equal items. 
- # outer joins are always sorted, but we test both sort values - keys, val_x, val_y = zip( - *sorted( - zip(keys, val_x, val_y, strict=True), - key=operator.itemgetter(0), - ), - strict=True, - ) - return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) - - def expected(left, right, sort, *, how): - if how == "inner": - return expect_inner(left, right, sort) - elif how == "outer": - return expect_outer(left, right, sort) - elif how == "left": - return expect_left(left, right, sort) - elif how == "right": - return expect_left(right, left, sort).rename( - {"val_x": "val_y", "val_y": "val_x"}, axis=1 - ) - else: - raise NotImplementedError() - - -@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"]) -def test_join_ordering_pandas_compat(request, left, right, sort, how): - request.applymarker( - pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and how == "right", - reason="TODO: Result ording of suffix'ed columns is incorrect", - ) - ) - with cudf.option_context("mode.pandas_compatible", True): - actual = left.merge(right, on="key", how=how, sort=sort) - expect = expected(left, right, sort, how=how) - assert_eq(expect, actual) - - -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("on_index", [True, False]) -@pytest.mark.parametrize("left_unique", [True, False]) -@pytest.mark.parametrize("left_monotonic", [True, False]) -@pytest.mark.parametrize("right_unique", [True, False]) -@pytest.mark.parametrize("right_monotonic", [True, False]) -def test_merge_combinations( - request, - how, - sort, - on_index, - left_unique, - left_monotonic, - right_unique, - right_monotonic, -): - request.applymarker( - pytest.mark.xfail( - condition=how == "outer" - and on_index - and left_unique - and not left_monotonic - and right_unique - and not right_monotonic, - reason="https://github.com/pandas-dev/pandas/issues/55992", - ) - ) - left = [2, 3] - if left_unique: - left.append(4 if left_monotonic else 1) - else: - left.append(3 if left_monotonic else 2) - - right = [2, 3] - if right_unique: - right.append(4 if right_monotonic else 1) - else: - right.append(3 if right_monotonic else 2) - - left = cudf.DataFrame({"key": left}) - right = cudf.DataFrame({"key": right}) - - if on_index: - left = left.set_index("key") - right = right.set_index("key") - on_kwargs = {"left_index": True, "right_index": True} - else: - on_kwargs = {"on": "key"} - - with cudf.option_context("mode.pandas_compatible", True): - result = cudf.merge(left, right, how=how, sort=sort, **on_kwargs) - if on_index: - left = left.reset_index() - right = right.reset_index() - - if how in ["left", "right", "inner"]: - if how in ["left", "inner"]: - expected, other, other_unique = left, right, right_unique - else: - expected, other, other_unique = right, left, left_unique - if how == "inner": - keep_values = set(left["key"].values_host).intersection( - right["key"].values_host - ) - keep_mask = expected["key"].isin(keep_values) - expected = expected[keep_mask] - if sort: - expected = expected.sort_values("key") - if not other_unique: - other_value_counts = other["key"].value_counts() - repeats = other_value_counts.reindex( - expected["key"].values, fill_value=1 - ) - repeats = repeats.astype(np.intp) - expected = expected["key"].repeat(repeats.values) - expected = expected.to_frame() - elif how == "outer": - if on_index and left_unique and left["key"].equals(right["key"]): - expected = cudf.DataFrame({"key": 
left["key"]}) - else: - left_counts = left["key"].value_counts() - right_counts = right["key"].value_counts() - expected_counts = left_counts.mul(right_counts, fill_value=1) - expected_counts = expected_counts.astype(np.intp) - expected = expected_counts.index.values_host.repeat( - expected_counts.values_host - ) - expected = cudf.DataFrame({"key": expected}) - expected = expected.sort_values("key") - - if on_index: - expected = expected.set_index("key") - else: - expected = expected.reset_index(drop=True) - - assert_eq(result, expected) From 52433f1745dd31d29c47b1d66531b2b3c8fd4794 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 12 Aug 2025 09:26:03 -0700 Subject: [PATCH 107/366] Add streams to hashing APIs (#19663) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19663 --- python/pylibcudf/pylibcudf/hashing.pxd | 25 +++-- python/pylibcudf/pylibcudf/hashing.pyi | 30 ++++-- python/pylibcudf/pylibcudf/hashing.pyx | 104 ++++++++++++++------ python/pylibcudf/pylibcudf/libcudf/hash.pxd | 31 ++++-- 4 files changed, 128 insertions(+), 62 deletions(-) diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd index fbd478f963f..c95c5a995bb 100644 --- a/python/pylibcudf/pylibcudf/hashing.pxd +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table @@ -8,27 +9,31 @@ from .table cimport Table cpdef Column murmurhash3_x86_32( Table input, - uint32_t seed=* + uint32_t seed=*, + Stream stream=* ) cpdef Table murmurhash3_x64_128( Table input, - uint64_t seed=* + uint64_t seed=*, + Stream stream=* ) cpdef Column xxhash_32( Table input, - uint32_t seed=* + uint32_t seed=*, + Stream stream=* ) cpdef Column xxhash_64( Table input, - uint64_t seed=* + uint64_t seed=*, + Stream stream=* ) -cpdef Column md5(Table input) -cpdef Column sha1(Table input) -cpdef Column sha224(Table input) -cpdef Column sha256(Table input) -cpdef Column sha384(Table input) -cpdef Column sha512(Table input) +cpdef Column md5(Table input, Stream stream=*) +cpdef Column sha1(Table input, Stream stream=*) +cpdef Column sha224(Table input, Stream stream=*) +cpdef Column sha256(Table input, Stream stream=*) +cpdef Column sha384(Table input, Stream stream=*) +cpdef Column sha512(Table input, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi index d535d842a18..5eb217146b9 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyi +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -2,18 +2,28 @@ from typing import Final +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.table import Table LIBCUDF_DEFAULT_HASH_SEED: Final[int] -def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... -def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... -def xxhash_32(input: Table, seed: int = ...) -> Column: ... -def xxhash_64(input: Table, seed: int = ...) -> Column: ... -def md5(input: Table) -> Column: ... -def sha1(input: Table) -> Column: ... -def sha224(input: Table) -> Column: ... -def sha256(input: Table) -> Column: ... -def sha384(input: Table) -> Column: ... -def sha512(input: Table) -> Column: ... 
+def murmurhash3_x86_32( + input: Table, seed: int = ..., stream: Stream | None = None +) -> Column: ... +def murmurhash3_x64_128( + input: Table, seed: int = ..., stream: Stream | None = None +) -> Table: ... +def xxhash_32( + input: Table, seed: int = ..., stream: Stream | None = None +) -> Column: ... +def xxhash_64( + input: Table, seed: int = ..., stream: Stream | None = None +) -> Column: ... +def md5(input: Table, stream: Stream | None = None) -> Column: ... +def sha1(input: Table, stream: Stream | None = None) -> Column: ... +def sha224(input: Table, stream: Stream | None = None) -> Column: ... +def sha256(input: Table, stream: Stream | None = None) -> Column: ... +def sha384(input: Table, stream: Stream | None = None) -> Column: ... +def sha512(input: Table, stream: Stream | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 1f093b20c6b..fe4cc14f4c2 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -17,9 +17,11 @@ from pylibcudf.libcudf.hash cimport ( xxhash_64 as cpp_xxhash_64, ) from pylibcudf.libcudf.table.table cimport table +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table +from .utils cimport _get_stream __all__ = [ "LIBCUDF_DEFAULT_HASH_SEED", @@ -39,7 +41,8 @@ LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED cpdef Column murmurhash3_x86_32( Table input, - uint32_t seed=DEFAULT_HASH_SEED + uint32_t seed=DEFAULT_HASH_SEED, + Stream stream=None ): """Computes the MurmurHash3 32-bit hash value of each row in the given table. @@ -58,18 +61,23 @@ cpdef Column murmurhash3_x86_32( A column where each row is the hash of a row from the input """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_murmurhash3_x86_32( input.view(), - seed + seed, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Table murmurhash3_x64_128( Table input, - uint64_t seed=DEFAULT_HASH_SEED + uint64_t seed=DEFAULT_HASH_SEED, + Stream stream=None ): """Computes the MurmurHash3 64-bit hash value of each row in the given table. @@ -88,18 +96,23 @@ cpdef Table murmurhash3_x64_128( A table of two UINT64 columns """ cdef unique_ptr[table] c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_murmurhash3_x64_128( input.view(), - seed + seed, + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef Column xxhash_32( Table input, - uint32_t seed=DEFAULT_HASH_SEED + uint32_t seed=DEFAULT_HASH_SEED, + Stream stream=None ): """Computes the xxHash 32-bit hash value of each row in the given table. @@ -119,18 +132,23 @@ cpdef Column xxhash_32( """ cdef unique_ptr[column] c_result - with nogil: + + stream = _get_stream(stream) + + with nogil: c_result = cpp_xxhash_32( input.view(), - seed + seed, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column xxhash_64( Table input, - uint64_t seed=DEFAULT_HASH_SEED + uint64_t seed=DEFAULT_HASH_SEED, + Stream stream=None ): """Computes the xxHash 64-bit hash value of each row in the given table. 
@@ -150,16 +168,20 @@ cpdef Column xxhash_64( """ cdef unique_ptr[column] c_result - with nogil: + + stream = _get_stream(stream) + + with nogil: c_result = cpp_xxhash_64( input.view(), - seed + seed, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column md5(Table input): +cpdef Column md5(Table input, Stream stream=None): """Computes the MD5 hash value of each row in the given table. For details, see :cpp:func:`md5`. @@ -177,11 +199,14 @@ cpdef Column md5(Table input): """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: - c_result = cpp_md5(input.view()) - return Column.from_libcudf(move(c_result)) + c_result = cpp_md5(input.view(), stream.view()) + return Column.from_libcudf(move(c_result), stream) -cpdef Column sha1(Table input): +cpdef Column sha1(Table input, Stream stream=None): """Computes the SHA-1 hash value of each row in the given table. For details, see :cpp:func:`sha1`. @@ -197,12 +222,15 @@ cpdef Column sha1(Table input): A column where each row is the hash of a row from the input """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: - c_result = cpp_sha1(input.view()) - return Column.from_libcudf(move(c_result)) + c_result = cpp_sha1(input.view(), stream.view()) + return Column.from_libcudf(move(c_result), stream) -cpdef Column sha224(Table input): +cpdef Column sha224(Table input, Stream stream=None): """Computes the SHA-224 hash value of each row in the given table. For details, see :cpp:func:`sha224`. @@ -218,12 +246,15 @@ cpdef Column sha224(Table input): A column where each row is the hash of a row from the input """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: - c_result = cpp_sha224(input.view()) - return Column.from_libcudf(move(c_result)) + c_result = cpp_sha224(input.view(), stream.view()) + return Column.from_libcudf(move(c_result), stream) -cpdef Column sha256(Table input): +cpdef Column sha256(Table input, Stream stream=None): """Computes the SHA-256 hash value of each row in the given table. For details, see :cpp:func:`sha256`. @@ -239,12 +270,15 @@ cpdef Column sha256(Table input): A column where each row is the hash of a row from the input """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: - c_result = cpp_sha256(input.view()) - return Column.from_libcudf(move(c_result)) + c_result = cpp_sha256(input.view(), stream.view()) + return Column.from_libcudf(move(c_result), stream) -cpdef Column sha384(Table input): +cpdef Column sha384(Table input, Stream stream=None): """Computes the SHA-384 hash value of each row in the given table. For details, see :cpp:func:`sha384`. @@ -260,12 +294,15 @@ cpdef Column sha384(Table input): A column where each row is the hash of a row from the input """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: - c_result = cpp_sha384(input.view()) - return Column.from_libcudf(move(c_result)) + c_result = cpp_sha384(input.view(), stream.view()) + return Column.from_libcudf(move(c_result), stream) -cpdef Column sha512(Table input): +cpdef Column sha512(Table input, Stream stream=None): """Computes the SHA-512 hash value of each row in the given table. For details, see :cpp:func:`sha512`. 
@@ -281,6 +318,9 @@ cpdef Column sha512(Table input): A column where each row is the hash of a row from the input """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: - c_result = cpp_sha512(input.view()) - return Column.from_libcudf(move(c_result)) + c_result = cpp_sha512(input.view(), stream.view()) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 46fdf62cd6b..7a3dec20f24 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -6,52 +6,63 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: cdef unique_ptr[column] murmurhash3_x86_32( const table_view& input, - const uint32_t seed + const uint32_t seed, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] murmurhash3_x64_128( const table_view& input, - const uint64_t seed + const uint64_t seed, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] md5( - const table_view& input + const table_view& input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] sha1( - const table_view& input + const table_view& input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] sha224( - const table_view& input + const table_view& input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] sha256( - const table_view& input + const table_view& input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] sha384( - const table_view& input + const table_view& input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] sha512( - const table_view& input + const table_view& input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] xxhash_32( const table_view& input, - const uint32_t seed + const uint32_t seed, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] xxhash_64( const table_view& input, - const uint64_t seed + const uint64_t seed, + cuda_stream_view stream ) except +libcudf_exception_handler cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: From aa0858be9214617607aabc00f1841419cc2e406b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 12 Aug 2025 09:31:46 -0700 Subject: [PATCH 108/366] Improve documentation around why we need no_gc_clear on pylibcudf Scalars (#19661) I decided there wasn't a great place in the developer docs for this information, but the existing comment didn't have much information so I expanded it substantially. 
Resolves #14249 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19661 --- python/pylibcudf/pylibcudf/scalar.pyx | 40 ++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 7ba769e6060..0d533c960a4 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -78,10 +78,42 @@ except ImportError as err: __all__ = ["Scalar"] -# The DeviceMemoryResource attribute could be released prematurely -# by the gc if the Scalar is in a reference cycle. Removing the tp_clear -# function with the no_gc_clear decoration prevents that. See -# https://github.com/rapidsai/rmm/pull/931 for details. +# The no_gc_clear decorator on this class is necessary for the following reason: +# +# The object underlying a Scalar is a libcudf scalar. The underlying storage +# type within the scalar depends on the scalar's data type, but regardless of +# the proximate storage class all of the types ultimately store their data in +# an rmm::device_buffer that has an associated rmm memory resource used for +# allocation and deallocation. That memory resource must therefore still be +# alive when the Scalar is destroyed. With the current architecture of cudf we +# do not know exactly what mr was used to construct the scalar, so until then +# the best we can do is to grab the current memory resource at the time of +# construction and keep it alive until the Scalar is destroyed (for potential +# problems with this approach, see https://github.com/rapidsai/rmm/issues/1515; +# the solution will be to address https://github.com/rapidsai/cudf/issues/15170 +# and also pass mrs all the way down to every rmm Python API to avoid its +# default mrs). This is done in the `__cinit__` method below. +# +# However, even in the most common case where this approach gives us the +# correct mr, we still have a problem. If a Scalar participates in a reference +# cycle, then when the garbage collector goes to clear that cycle its default +# behavior will be to clear all attributes of the object, including the mr +# attribute (see +# https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#dealloc-intro). +# This is fine in the immediate Cython code because Scalar does not define a +# `__dealloc__` method, so there is no need for the mr in the Cython code. +# However, if after the Scalar was created some other code called +# `set_current_device_resource`, then there may be no other references left to +# the mr used to create the scalar. In that case, the reference count of the +# Python DeviceMemoryResource will drop to zero and it will immediately be +# destroyed, resulting in the destruction of the underlying C++ memory resource +# as well (rmm::device_buffer only has a non-owning reference to it because all +# mrs in rmm are managed with unique_ptr semantics). That will result in a +# segmentation fault when the device_buffer goes to deallocate its memory using +# a freed memory resources. To prevent this, we use the `no_gc_clear` decorator +# to prevent the garbage collector from clearing the `mr` attribute when it +# clears the Scalar object as described in +# https://cython.readthedocs.io/en/latest/src/userguide/extension_types.html#disabling-cycle-breaking-tp-clear. 
@no_gc_clear cdef class Scalar: """A scalar value in device memory. From 1dd127ceb20b81969e245953bb01fd67b9b04f67 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 12 Aug 2025 09:44:21 -0700 Subject: [PATCH 109/366] Move test_resampling/query/pickling to new cudf classic directory structure (#19615) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19615 --- .../{ => dataframe/methods}/test_query.py | 65 ++++++++++--------- .../groupby/test_function_application.py | 1 - .../test_resample.py} | 0 python/cudf/cudf/tests/groupby/test_stats.py | 1 - .../tests/{ => input_output}/test_pickling.py | 14 ++-- 5 files changed, 39 insertions(+), 42 deletions(-) rename python/cudf/cudf/tests/{ => dataframe/methods}/test_query.py (82%) delete mode 100644 python/cudf/cudf/tests/groupby/test_function_application.py rename python/cudf/cudf/tests/{test_resampling.py => groupby/test_resample.py} (100%) delete mode 100644 python/cudf/cudf/tests/groupby/test_stats.py rename python/cudf/cudf/tests/{ => input_output}/test_pickling.py (93%) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/dataframe/methods/test_query.py similarity index 82% rename from python/cudf/cudf/tests/test_query.py rename to python/cudf/cudf/tests/dataframe/methods/test_query.py index ddb8a7ffd37..a2776fb144e 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_query.py @@ -30,7 +30,6 @@ def test_query_parser(text, expect_args): assert tuple(argspec.args) == tuple(expect_args) -@pytest.mark.parametrize("nelem", [1, 10]) @pytest.mark.parametrize( "fn", [ @@ -40,20 +39,22 @@ def test_query_parser(text, expect_args): ], ) @pytest.mark.parametrize("nulls", [True, False]) -def test_query(nelem, fn, nulls): - # prepare +def test_query(fn, nulls): + n = 5 expect_fn, query_expr = fn rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame() - pdf["a"] = np.arange(nelem) - pdf["b"] = rng.random(nelem) * nelem + pdf = pd.DataFrame( + { + "a": np.arange(n), + "b": rng.random(n) * n, + } + ) if nulls: pdf.loc[::2, "a"] = None gdf = cudf.from_pandas(pdf) assert_eq(pdf.query(query_expr), gdf.query(query_expr)) -@pytest.mark.parametrize("nelem", [1, 10]) @pytest.mark.parametrize( "fn", [ @@ -64,13 +65,13 @@ def test_query(nelem, fn, nulls): ), ], ) -def test_query_ref_env(nelem, fn): - # prepare +def test_query_ref_env(fn): + n = 5 expect_fn, query_expr = fn rng = np.random.default_rng(seed=0) df = DataFrame() - df["a"] = aa = np.arange(nelem) - df["b"] = bb = rng.random(nelem) * nelem + df["a"] = aa = np.arange(n) + df["b"] = bb = rng.random(n) * n c = 2.3 d = 1.2 # udt @@ -104,15 +105,22 @@ def test_query_local_dict(): got = df.query(expr, local_dict={"val": 10}) np.testing.assert_array_equal(aa[aa < 10], got["a"].to_numpy()) - # test for datetime - df = DataFrame() - data = np.array(["2018-10-07", "2018-10-08"], dtype="datetime64") - df["datetimes"] = data + +def test_query_local_dict_datetime(): + df = DataFrame( + { + "datetimes": np.array( + ["2018-10-07", "2018-10-08"], dtype="datetime64" + ) + } + ) search_date = datetime.datetime.strptime("2018-10-08", "%Y-%m-%d") expr = "datetimes==@search_date" got = df.query(expr, local_dict={"search_date": search_date}) - np.testing.assert_array_equal(data[1], 
got["datetimes"].to_numpy()) + np.testing.assert_array_equal( + np.datetime64("2018-10-08"), got["datetimes"].to_numpy() + ) def test_query_global_dict(): @@ -166,7 +174,6 @@ def test_query_empty_frames(): assert_eq(got, expect) -@pytest.mark.parametrize(("a_val", "b_val", "c_val"), [(4, 3, 15)]) @pytest.mark.parametrize("index", ["a", ["a", "b"]]) @pytest.mark.parametrize( "query", @@ -176,7 +183,10 @@ def test_query_empty_frames(): "(a < @a_val and b >@b_val) or c >@c_val", ], ) -def test_query_with_index_name(index, query, a_val, b_val, c_val): +def test_query_with_index_name(index, query): + a_val = 4 # noqa: F841 + b_val = 3 # noqa: F841 + c_val = 15 # noqa: F841 pdf = pd.DataFrame( { "a": [1, None, 3, 4, 5], @@ -194,7 +204,6 @@ def test_query_with_index_name(index, query, a_val, b_val, c_val): assert_eq(out, expect) -@pytest.mark.parametrize(("a_val", "b_val", "c_val"), [(4, 3, 15)]) @pytest.mark.parametrize( "query", [ @@ -203,7 +212,10 @@ def test_query_with_index_name(index, query, a_val, b_val, c_val): "(index < @a_val and b >@b_val) or c >@c_val", ], ) -def test_query_with_index_keyword(query, a_val, b_val, c_val): +def test_query_with_index_keyword(query): + a_val = 4 # noqa: F841 + b_val = 3 # noqa: F841 + c_val = 15 # noqa: F841 pdf = pd.DataFrame( { "a": [1, None, 3, 4, 5], @@ -237,15 +249,6 @@ def test_query_unsupported_dtypes(): gdf.query(query) -@pytest.mark.parametrize( - "values", - [ - [0, 1.0, 2.0, None, np.nan, None, 3, 5], - [0, 1.0, 2.0, None, 3, np.nan, None, 4], - [0, 1.0, 2.0, None, 3, np.nan, None, 4, None, 9], - ], -) -@pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize( "query", [ @@ -260,8 +263,8 @@ def test_query_unsupported_dtypes(): "a >= 3", ], ) -def test_query_mask(values, nan_as_null, query): - data = {"a": values} +def test_query_mask(nan_as_null, query): + data = {"a": [0, 1.0, 2.0, None, 3, np.nan, None, 4]} pdf = pd.DataFrame(data) gdf = cudf.DataFrame(data, nan_as_null=nan_as_null) diff --git a/python/cudf/cudf/tests/groupby/test_function_application.py b/python/cudf/cudf/tests/groupby/test_function_application.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/groupby/test_function_application.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/groupby/test_resample.py similarity index 100% rename from python/cudf/cudf/tests/test_resampling.py rename to python/cudf/cudf/tests/groupby/test_resample.py diff --git a/python/cudf/cudf/tests/groupby/test_stats.py b/python/cudf/cudf/tests/groupby/test_stats.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/groupby/test_stats.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/input_output/test_pickling.py similarity index 93% rename from python/cudf/cudf/tests/test_pickling.py rename to python/cudf/cudf/tests/input_output/test_pickling.py index ac13056fa7c..ed3483f296b 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/input_output/test_pickling.py @@ -70,7 +70,7 @@ def test_pickle_index(): idx = Index(np.arange(nelem), name="a") pickled = pickle.dumps(idx) out = pickle.loads(pickled) - assert (idx == out).all() + assert_eq(idx, out) def test_pickle_buffer(): @@ -83,17 +83,13 @@ def test_pickle_buffer(): assert unpacked.size == arr.nbytes -@pytest.mark.parametrize("named", [True, False]) -def test_pickle_series(named): +@pytest.mark.parametrize("name", [None, "a"]) +def test_pickle_series(name): rng = np.random.default_rng(seed=0) - if named: - ser = Series(rng.random(10), name="a") - else: - ser = Series(rng.random(10)) - + ser = Series(rng.random(10), name=name) pickled = pickle.dumps(ser) out = pickle.loads(pickled) - assert (ser == out).all() + assert_eq(ser, out) @pytest.mark.parametrize( From 0fbd451151a3f5226624448a27de75a3d81b6e4e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 12 Aug 2025 10:09:44 -0700 Subject: [PATCH 110/366] Re-enable Disabled Join Tests (#19649) This PR re-enables previously disabled join tests following the recent cuco version bump, along with the necessary fixes. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/cudf/pull/19649 --- cpp/tests/join/distinct_join_tests.cpp | 3 +-- cpp/tests/join/join_tests.cpp | 11 ++++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index b5c02bd562e..ce27a3fbc0a 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -507,8 +507,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); } -// Disabled for now, waiting on upstream cuco updates -TEST_F(DistinctJoinTest, DISABLED_InvalidLoadFactor) +TEST_F(DistinctJoinTest, InvalidLoadFactor) { column_wrapper col0_0{{3, 1, 2, 0, 3}}; strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 3ea65232c81..1f371576d55 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -40,6 +40,8 @@ #include +#include + #include namespace { @@ -185,8 +187,7 @@ struct JoinTest : public cudf::test::BaseFixture { } }; -// Disabled for now, waiting on upstream cuco updates -TEST_F(JoinTest, DISABLED_InvalidLoadFactor) +TEST_F(JoinTest, InvalidLoadFactor) { column_wrapper col0_0{{3, 1, 2, 0, 3}}; strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); @@ -199,13 +200,13 @@ TEST_F(JoinTest, DISABLED_InvalidLoadFactor) // Test load factor of -0.1 EXPECT_THROW(cudf::hash_join(t0, cudf::nullable_join::NO, cudf::null_equality::EQUAL, -0.1), - std::invalid_argument); + cuco::logic_error); // Test load factor of 0 EXPECT_THROW(cudf::hash_join(t0, cudf::nullable_join::NO, cudf::null_equality::EQUAL, 0.0), - std::invalid_argument); + cuco::logic_error); // Test load factor > 1 EXPECT_THROW(cudf::hash_join(t0, cudf::nullable_join::NO, cudf::null_equality::EQUAL, 1.5), - 
std::invalid_argument); + cuco::logic_error); } struct JoinParameterizedTest : public JoinTest, public testing::WithParamInterface {}; From f82828f9163b384e5012ef334a3c97f62a605637 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 12 Aug 2025 15:42:45 -0400 Subject: [PATCH 111/366] Fix broken links in 10min notebook (#19665) Fixed some outdated links in the 10min to cuDF notebook. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19665 --- docs/cudf/source/user_guide/10min.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 87782cd7fb5..23c947ecf95 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -15,11 +15,11 @@ "\n", "[Dask](https://dask.org/) is a flexible library for parallel computing in Python that makes scaling out your workflow smooth and simple. On the CPU, Dask uses Pandas to execute operations in parallel on DataFrame partitions.\n", "\n", - "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n", + "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/api/cudf.read_csv.html).\n", "\n", "\n", "
\n", - "Note: This notebook uses the explicit Dask cuDF API (dask_cudf) for clarity. However, we strongly recommend that you use Dask's configuration infrastructure to set the \"dataframe.backend\" option to \"cudf\", and work with the Dask DataFrame API directly. Please see the Dask cuDF documentation for more information.\n", + "Note: This notebook uses the explicit Dask cuDF API (dask_cudf) for clarity. However, we strongly recommend that you use Dask's configuration infrastructure to set the \"dataframe.backend\" option to \"cudf\", and work with the Dask DataFrame API directly. Please see the Dask cuDF documentation for more information.\n", "
\n", "\n", "\n", @@ -2572,7 +2572,7 @@ "id": "fd3fc4f3", "metadata": {}, "source": [ - "Like pandas, cuDF provides string processing methods in the `str` attribute of `Series`. Full documentation of string methods is a work in progress. Please see the [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/api_docs/series.html#string-handling) for more information." + "Like pandas, cuDF provides string processing methods in the `str` attribute of `Series`. Full documentation of string methods is a work in progress. Please see the [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/series.html#string-handling) for more information." ] }, { @@ -2637,7 +2637,7 @@ "id": "44fe1243", "metadata": {}, "source": [ - "As well as simple manipulation, We can also match strings using [regular expressions](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.core.column.string.StringMethods.match.html)." + "As well as simple manipulation, We can also match strings using [regular expressions](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/api/cudf.core.accessors.string.StringMethods.match.html)." ] }, { From 5cd9ea07bbafa36288d5939fdc29d7d2dbe1cae8 Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Tue, 12 Aug 2025 21:23:35 +0100 Subject: [PATCH 112/366] [FEA] Remove excessive copies of JITIFY's ProgramData during JIT kernel launch (#19667) This MR removes the excessive copies of the programdata, which was being copied on every kernel launch. The program data contains compressed un-compiled C++ code of CUDF's headers, which is very large. This impacts both small and large columns/tables. Follows-up https://github.com/rapidsai/cudf/issues/19625 Authors: - Basit Ayantunde (https://github.com/lamarrr) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/19667 --- cpp/src/jit/cache.cpp | 4 ++-- cpp/src/jit/cache.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 70cf5f3150e..958d071ad1f 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -112,7 +112,7 @@ std::size_t try_parse_numeric_env_var(char const* const env_name, std::size_t de } } // namespace -jitify2::ProgramCache<>& jit::program_cache::get(jitify2::PreprocessedProgramData preprog) +jitify2::ProgramCache<>& jit::program_cache::get(jitify2::PreprocessedProgramData const& preprog) { std::lock_guard const caches_lock(_caches_mutex); @@ -138,7 +138,7 @@ jitify2::ProgramCache<>& jit::program_cache::get(jitify2::PreprocessedProgramDat return *(existing_cache->second); } -jitify2::ProgramCache<>& jit::get_program_cache(jitify2::PreprocessedProgramData preprog) +jitify2::ProgramCache<>& jit::get_program_cache(jitify2::PreprocessedProgramData const& preprog) { return cudf::get_context().program_cache().get(preprog); } diff --git a/cpp/src/jit/cache.hpp b/cpp/src/jit/cache.hpp index da51e573679..1772134bb90 100644 --- a/cpp/src/jit/cache.hpp +++ b/cpp/src/jit/cache.hpp @@ -39,10 +39,10 @@ class program_cache { program_cache& operator=(program_cache&&) = delete; ~program_cache() = default; - jitify2::ProgramCache<>& get(jitify2::PreprocessedProgramData preprog); + jitify2::ProgramCache<>& get(jitify2::PreprocessedProgramData const& preprog); }; -jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData preprog); +jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData const& preprog); } 
// namespace jit } // namespace cudf From 23b59a9b3f828ffb04f87f074c1b057fca5dbd4d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:22:38 -0400 Subject: [PATCH 113/366] Fix integer overflow in warp-per-row grid calculation (#19638) Fixed int overflow for `size * warp_size` calculation for the number of threads given to `grid1d` utility in several strings and nvtext functions. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/19638 --- cpp/src/strings/attributes.cu | 5 +++-- cpp/src/strings/case.cu | 3 ++- cpp/src/strings/like.cu | 5 +++-- cpp/src/strings/search/find.cu | 12 +++++++----- cpp/src/text/vocabulary_tokenize.cu | 3 ++- cpp/src/text/wordpiece_tokenize.cu | 5 +++-- 6 files changed, 20 insertions(+), 13 deletions(-) diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index ac1f32160ed..46360ee8663 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -152,8 +152,9 @@ std::unique_ptr count_characters_parallel(strings_column_view const& inp auto const d_strings = cudf::column_device_view::create(input.parent(), stream); // fill in the lengths - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr thread_index_type block_size = 256; + constexpr thread_index_type warp_size = cudf::detail::warp_size; + cudf::detail::grid_1d grid{input.size() * warp_size, block_size}; count_characters_parallel_fn<<>>( *d_strings, d_lengths); diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 490d8ce8a5c..6b3fe0a6cd6 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -456,7 +456,8 @@ std::unique_ptr convert_case(strings_column_view const& input, // note: tried to use segmented-reduce approach instead here and it was consistently slower auto [offsets, bytes] = [&] { rmm::device_uvector sizes(input.size(), stream); - auto grid = cudf::detail::grid_1d(input.size() * cudf::detail::warp_size, block_size); + constexpr thread_index_type warp_size = cudf::detail::warp_size; + auto grid = cudf::detail::grid_1d(input.size() * warp_size, block_size); count_bytes_kernel <<>>( ccfn, *d_strings, sizes.data()); diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu index 508f5055b86..6d0c963fc74 100644 --- a/cpp/src/strings/like.cu +++ b/cpp/src/strings/like.cu @@ -343,8 +343,9 @@ std::unique_ptr like(strings_column_view const& input, like_fn{*d_strings, patterns_itr, d_escape}); } else { // warp-parallel for longer strings - constexpr auto block_size = 512; - auto const grid = cudf::detail::grid_1d(input.size() * cudf::detail::warp_size, block_size); + constexpr thread_index_type block_size = 512; + constexpr thread_index_type warp_size = cudf::detail::warp_size; + auto const grid = cudf::detail::grid_1d(input.size() * warp_size, block_size); like_kernel<<>>( *d_strings, patterns_itr, d_escape, results->mutable_view().data()); } diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index cb862bcdeda..2991a12235a 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -181,8 +181,9 @@ void find_utility(strings_column_view const& input, auto d_results = output.mutable_view().data(); if ((input.chars_size(stream) / (input.size() - input.null_count())) > AVG_CHAR_BYTES_THRESHOLD) { // 
warp-per-string runs faster for longer strings (but not shorter ones) - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr auto block_size = 256; + constexpr thread_index_type warp_size = cudf::detail::warp_size; + cudf::detail::grid_1d grid{input.size() * warp_size, block_size}; finder_warp_parallel_fn <<>>( *d_strings, target_itr, start, stop, d_results); @@ -398,9 +399,10 @@ std::unique_ptr contains_warp_parallel(strings_column_view const& input, rmm::exec_policy_nosync(stream), results_view.begin(), results_view.end(), true); } else { // launch warp per string - auto const d_strings = column_device_view::create(input.parent(), stream); - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + auto const d_strings = column_device_view::create(input.parent(), stream); + constexpr thread_index_type block_size = 256; + constexpr thread_index_type warp_size = cudf::detail::warp_size; + cudf::detail::grid_1d grid{input.size() * warp_size, block_size}; contains_warp_parallel_fn<<>>( *d_strings, d_target, results_view.data()); } diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index e8e65848913..15069a8f01b 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -420,7 +420,8 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view stream.value()>>>(d_input_chars, chars_size, d_delimiter, d_marks.data()); // launch warp per string to compute token counts - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr cudf::thread_index_type warp_size = cudf::detail::warp_size; + cudf::detail::grid_1d grid{input.size() * warp_size, block_size}; token_counts_fn<<>>( *d_strings, d_delimiter, d_token_counts.data(), d_marks.data()); auto [token_offsets, total_count] = cudf::detail::make_offsets_child_column( diff --git a/cpp/src/text/wordpiece_tokenize.cu b/cpp/src/text/wordpiece_tokenize.cu index 02f9d38940c..b27bd379356 100644 --- a/cpp/src/text/wordpiece_tokenize.cu +++ b/cpp/src/text/wordpiece_tokenize.cu @@ -791,8 +791,9 @@ rmm::device_uvector compute_some_tokens( // find start/end for each row up to max_words_per_row words; // store word positions in start_words and sizes in word_sizes - cudf::detail::grid_1d grid_find{input.size() * cudf::detail::warp_size, block_size}; - find_words_kernel + constexpr cudf::thread_index_type warp_size = cudf::detail::warp_size; + cudf::detail::grid_1d grid_find{input.size() * warp_size, block_size}; + find_words_kernel <<>>( *d_strings, d_input_chars, max_word_offsets.data(), start_words.data(), word_sizes.data()); From 6a7134c9a26168140eff7c2fdef9a701ae756d40 Mon Sep 17 00:00:00 2001 From: Jigao Luo Date: Wed, 13 Aug 2025 00:43:11 +0200 Subject: [PATCH 114/366] Replace `rmm::device_scalar` with `cudf::detail::device_scalar` due to unnecessary synchronization (Part 3 of miss-sync) (#19119) For issue #18967, this PR is one part of merging the PR Draft #18968. In this PR, almost all `rmm::device_scalar` calls in libcudf are replaced with `cudf::detail::device_scalar` due to its internal host-pinned bounce buffer. This is also a call to action to use host-pinned memory globally in libcudf, with arguments stated in #18967 and #18968. 
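
To make the substitution concrete, here is a minimal sketch of the pattern this change applies throughout libcudf. The kernel `write_answer` and function `read_answer` are hypothetical, for illustration only; they are not code from this PR:

```c++
#include <cudf/detail/device_scalar.hpp>

#include <rmm/cuda_stream_view.hpp>

// Hypothetical kernel that writes a single result value.
__global__ void write_answer(int* out) { *out = 42; }

int read_answer(rmm::cuda_stream_view stream)
{
  // Drop-in replacement for rmm::device_scalar<int>: transfers are staged
  // through a host-pinned bounce buffer instead of pageable host memory, so
  // reading the value back synchronizes only this stream rather than forcing
  // a device-wide synchronization.
  cudf::detail::device_scalar<int> result{0, stream};
  write_answer<<<1, 1, 0, stream.value()>>>(result.data());
  return result.value(stream);  // stream-ordered device-to-host copy
}
```

Call sites keep the same constructor, `data()`, and `value(stream)` shape as `rmm::device_scalar`; only the staging memory differs.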
Authors: - Jigao Luo (https://github.com/JigaoLuo) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/19119 --- cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md | 15 +++++++++++++++ .../cudf/detail/sizes_to_offsets_iterator.cuh | 6 +++--- cpp/include/cudf/reduction/detail/reduction.cuh | 7 +++---- cpp/include/cudf_test/nanoarrow_utils.hpp | 2 +- cpp/src/join/sort_merge_join.cu | 4 ++-- .../iterator/sizes_to_offsets_iterator_test.cu | 8 ++++---- cpp/tests/scalar/scalar_device_view_test.cu | 6 +++--- 7 files changed, 31 insertions(+), 17 deletions(-) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 9c319b9048e..52e3a47cb9a 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -655,6 +655,21 @@ kernel<<<...>>>(int_scalar.data(),...); int host_value = int_scalar.value(); ``` +##### cudf::detail::device_scalar +Acts as a drop-in replacement for `rmm::device_scalar`, with the key difference +being the use of pinned host memory as a bounce buffer for data transfers. +It is recommended for internal use to avoid the implicit synchronization overhead caused by +memcpy operations on pageable host memory. + +```c++ +// Same as the case with rmm::device_scalar above +cudf::detail::device_scalar int_scalar{42, stream, mr}; +kernel<<<...>>>(int_scalar.data(),...); + +// Note: This device-to-host transfer uses host-pinned bounce buffer for efficient memcpy +int host_value = int_scalar.value(); +``` + #### rmm::device_vector Allocates a specified number of elements of the specified type. 
If no initialization value is diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 67a2da31d82..8dc7213edbf 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -17,12 +17,12 @@ #pragma once #include +#include #include #include #include #include -#include #include #include @@ -203,7 +203,7 @@ struct sizes_to_offsets_iterator { * auto begin = // begin input iterator * auto end = // end input iterator * auto result = rmm::device_uvector(std::distance(begin,end), stream); - * auto last = rmm::device_scalar(0, stream); + * auto last = cudf::detail::device_scalar(0, stream); * auto itr = make_sizes_to_offsets_iterator(result.begin(), * result.end(), * last.data()); @@ -270,7 +270,7 @@ auto sizes_to_offsets(SizesIterator begin, "Only numeric types are supported by sizes_to_offsets"); using LastType = std::conditional_t, int64_t, uint64_t>; - auto last_element = rmm::device_scalar(0, stream); + auto last_element = cudf::detail::device_scalar(0, stream); auto output_itr = make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data()); // This function uses the type of the initialization parameter as the accumulator type diff --git a/cpp/include/cudf/reduction/detail/reduction.cuh b/cpp/include/cudf/reduction/detail/reduction.cuh index 6b15c8fb4c0..89eb49e8e8a 100644 --- a/cpp/include/cudf/reduction/detail/reduction.cuh +++ b/cpp/include/cudf/reduction/detail/reduction.cuh @@ -19,13 +19,13 @@ #include "reduction_operators.cuh" #include +#include #include #include #include #include #include -#include #include #include @@ -123,7 +123,7 @@ std::unique_ptr reduce(InputIterator d_in, { auto const binary_op = cudf::detail::cast_functor(op.get_binary_op()); auto const initial_value = init.value_or(op.template get_identity()); - auto dev_result = rmm::device_scalar{initial_value, stream}; + auto dev_result = cudf::detail::device_scalar{initial_value, stream}; // Allocate temporary storage rmm::device_buffer d_temp_storage; @@ -167,7 +167,6 @@ std::unique_ptr reduce(InputIterator d_in, * @param op the reduction operator * @param valid_count Number of valid items * @param ddof Delta degrees of freedom used for standard deviation and variance - * @param init Optional initial value of the reduction * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned scalar's device memory * @returns Output scalar in device memory @@ -187,7 +186,7 @@ std::unique_ptr reduce(InputIterator d_in, auto const binary_op = cudf::detail::cast_functor(op.get_binary_op()); auto const initial_value = op.template get_identity(); - rmm::device_scalar intermediate_result{initial_value, stream}; + cudf::detail::device_scalar intermediate_result{initial_value, stream}; // Allocate temporary storage rmm::device_buffer d_temp_storage; diff --git a/cpp/include/cudf_test/nanoarrow_utils.hpp b/cpp/include/cudf_test/nanoarrow_utils.hpp index faeaea9e1d9..4a014450576 100644 --- a/cpp/include/cudf_test/nanoarrow_utils.hpp +++ b/cpp/include/cudf_test/nanoarrow_utils.hpp @@ -160,7 +160,7 @@ std::enable_if_t, void> populate_from_col( ArrowArrayBuffer(arr, 2)->size_bytes = sview.chars_size(cudf::get_default_stream()); ArrowArrayBuffer(arr, 2)->data = const_cast(view.data()); } else { - auto zero = rmm::device_scalar(0, cudf::get_default_stream()); + auto zero = 
cudf::detail::device_scalar(0, cudf::get_default_stream()); uint8_t const* ptr = reinterpret_cast(zero.data()); nanoarrow::BufferInitWrapped(ArrowArrayBuffer(arr, 1), std::move(zero), ptr, 4); } diff --git a/cpp/src/join/sort_merge_join.cu b/cpp/src/join/sort_merge_join.cu index bc81d2577f1..ec015c704b2 100644 --- a/cpp/src/join/sort_merge_join.cu +++ b/cpp/src/join/sort_merge_join.cu @@ -174,8 +174,8 @@ merge::matches_per_row(rmm::cuda_stream_view st // naive: iterate through larger table and binary search on smaller table auto const larger_numrows = larger.num_rows(); - rmm::device_scalar d_lb_type(bound_type::LOWER, stream, temp_mr); - rmm::device_scalar d_ub_type(bound_type::UPPER, stream, temp_mr); + cudf::detail::device_scalar d_lb_type(bound_type::LOWER, stream, temp_mr); + cudf::detail::device_scalar d_ub_type(bound_type::UPPER, stream, temp_mr); auto match_counts = cudf::detail::make_zeroed_device_uvector_async(larger_numrows + 1, stream, temp_mr); diff --git a/cpp/tests/iterator/sizes_to_offsets_iterator_test.cu b/cpp/tests/iterator/sizes_to_offsets_iterator_test.cu index 3b412b76dde..3baf0187b56 100644 --- a/cpp/tests/iterator/sizes_to_offsets_iterator_test.cu +++ b/cpp/tests/iterator/sizes_to_offsets_iterator_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,10 +17,10 @@ #include #include +#include #include #include -#include #include #include @@ -48,7 +48,7 @@ TYPED_TEST(SizesToOffsetsIteratorTestTyped, ExclusiveScan) auto d_col = cudf::test::fixed_width_column_wrapper(sizes.begin(), sizes.end()); auto d_view = cudf::column_view(d_col); - auto last = rmm::device_scalar(0, stream); + auto last = cudf::detail::device_scalar(0, stream); auto result = rmm::device_uvector(d_view.size(), stream); auto output_itr = cudf::detail::make_sizes_to_offsets_iterator(result.begin(), result.end(), last.data()); @@ -80,7 +80,7 @@ TEST_F(SizesToOffsetsIteratorTest, ScanWithOverflow) auto d_col = cudf::test::fixed_width_column_wrapper(values.begin(), values.end()); auto d_view = cudf::column_view(d_col); - auto last = rmm::device_scalar(0, stream); + auto last = cudf::detail::device_scalar(0, stream); auto result = rmm::device_uvector(d_view.size(), stream); auto output_itr = cudf::detail::make_sizes_to_offsets_iterator(result.begin(), result.end(), last.data()); diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index d64a8f4418c..8f55cd6274f 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -59,7 +59,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value) auto scalar_device_view = cudf::get_scalar_device_view(s); auto scalar_device_view1 = cudf::get_scalar_device_view(s1); - rmm::device_scalar result{cudf::get_default_stream()}; + cudf::detail::device_scalar result{cudf::get_default_stream()}; test_set_value<<<1, 1, 0, cudf::get_default_stream().value()>>>(scalar_device_view, scalar_device_view1); @@ -86,7 +86,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull) TypeParam value = cudf::test::make_type_param_scalar(5); cudf::scalar_type_t s(value, false); auto scalar_device_view = cudf::get_scalar_device_view(s); - rmm::device_scalar result{cudf::get_default_stream()}; + cudf::detail::device_scalar result{cudf::get_default_stream()}; test_null<<<1, 1, 0, 
cudf::get_default_stream().value()>>>(scalar_device_view,
                                                             result.data());
   CUDF_CHECK_CUDA(0);
@@ -130,7 +130,7 @@ TEST_F(StringScalarDeviceViewTest, Value)
   cudf::string_scalar s(value);
   auto scalar_device_view = cudf::get_scalar_device_view(s);
-  rmm::device_scalar result{cudf::get_default_stream()};
+  cudf::detail::device_scalar result{cudf::get_default_stream()};

   auto value_v = cudf::detail::make_device_uvector(
     value, cudf::get_default_stream(), cudf::get_current_device_resource_ref());

From fa49ec5fd1c17520734952e91c0b5306b285c9d6 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 12 Aug 2025 21:24:46 -0400
Subject: [PATCH 115/366] [BUG] Set `query_set` arg when validating/running
 cudf-polars PDS-DS benchmarks (#19674)

Follows up https://github.com/rapidsai/cudf/pull/19631. I didn't set the
`query_set` arg (i.e. the benchmark name, `pdsh` or `pdsds`) when running the
PDS-DS benchmarks in validation mode (i.e. `--engine validate`) or the DuckDB
benchmarks (i.e. `--engine duckdb`). Therefore we'd get this error:

```
$ python python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py --engine duckdb 12 --root tpcds_parquet --scale 1.0
Traceback (most recent call last):
  File "/home/coder/cudf/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py", line 218, in <module>
    run_duckdb(PDSDSDuckDBQueries, extra_args)
    ~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/coder/cudf/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py", line 109, in run_duckdb
    run_config = RunConfig.from_args(args)
  File "/home/coder/cudf/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py", line 216, in from_args
    name = args.query_set
           ^^^^^^^^^^^^^^
AttributeError: 'Namespace' object has no attribute 'query_set'
```

This PR fixes this error.
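
The fix works because an `argparse.Namespace` stores its attributes in its `__dict__`, so a missing attribute can be injected after parsing via `vars()`. A minimal standalone illustration of the pattern (simplified; not the benchmark code itself):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--engine", default="polars")
args = parser.parse_args(["--engine", "duckdb"])

# The parser never defined `query_set`, so inject it after parsing,
# the same way the fix does before handing `args` to RunConfig.from_args.
vars(args).update({"query_set": "pdsds"})
assert args.query_set == "pdsds"
```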
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19674 --- .../cudf_polars/experimental/benchmarks/pdsds.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py index b9daef51ec4..75b56991a55 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: from collections.abc import Sequence from types import ModuleType + from typing import Any # Without this setting, the first IO task to run # on each worker takes ~15 sec extra @@ -101,9 +102,10 @@ def execute_duckdb_query(query: str, dataset_path: Path) -> pl.DataFrame: return conn.execute("\n".join(statements)).pl() -def run_duckdb(options: Sequence[str] | None = None) -> None: +def run_duckdb(benchmark: Any, options: Sequence[str] | None = None) -> None: """Run the benchmark with DuckDB.""" args = parse_args(options, num_queries=99) + vars(args).update({"query_set": benchmark.name}) run_config = RunConfig.from_args(args) records: defaultdict[int, list[Record]] = defaultdict(list) @@ -130,11 +132,12 @@ def run_duckdb(options: Sequence[str] | None = None) -> None: records[q_id].append(record) -def run_validate(options: Sequence[str] | None = None) -> None: +def run_validate(benchmark: Any, options: Sequence[str] | None = None) -> None: """Validate Polars CPU vs DuckDB or Polars GPU.""" from polars.testing import assert_frame_equal args = parse_args(options, num_queries=99) + vars(args).update({"query_set": benchmark.name}) run_config = RunConfig.from_args(args) baseline = args.baseline @@ -212,6 +215,6 @@ def run_validate(options: Sequence[str] | None = None) -> None: if args.engine == "polars": run_polars(PDSDSPolarsQueries, extra_args, num_queries=99) elif args.engine == "duckdb": - run_duckdb(extra_args) + run_duckdb(PDSDSDuckDBQueries, extra_args) elif args.engine == "validate": - run_validate(extra_args) + run_validate(PDSDSQueries, extra_args) From b76565b7d01d07961f27d6408493c2cdcd1ade5e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 12 Aug 2025 20:34:17 -0500 Subject: [PATCH 116/366] Update to numba-cuda>=0.18.0,<0.19.0 (#19604) Updates to `numba-cuda >=0.18.0,<0.19.0`. Drops dependency on `pynvjitlink`. Updates cuda-python pinning to `>=12.9.1,<13.0.0a0` to get cuda-bindings support and fix for a segfault with Python 3.13. 
Authors: - Bradley Dice (https://github.com/bdice) - https://github.com/brandon-b-miller Approvers: - https://github.com/brandon-b-miller - Graham Markall (https://github.com/gmarkall) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19604 --- .../all_cuda-129_arch-aarch64.yaml | 7 ++-- .../all_cuda-129_arch-x86_64.yaml | 7 ++-- conda/recipes/cudf/recipe.yaml | 14 ++----- conda/recipes/pylibcudf/recipe.yaml | 2 +- dependencies.yaml | 39 +++++++++---------- python/cudf/cudf/core/udf/strings_lowering.py | 4 +- python/cudf/pyproject.toml | 9 ++--- python/cudf/udf_cpp/shim.cu | 8 ++++ python/pylibcudf/pyproject.toml | 2 +- 9 files changed, 46 insertions(+), 46 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 0e95832dddd..b53ce1f224e 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -20,7 +20,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.1,<13.0a0 - cuda-sanitizer-api - cuda-version=12.9 - cupy>=12.0.0 @@ -54,8 +54,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.14.0,<0.15.0a0 -- numba>=0.59.1,<0.62.0a0 +- numba-cuda>=0.18.0,<0.19.0a0 +- numba>=0.60.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py @@ -69,7 +69,6 @@ dependencies: - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 -- pynvjitlink>=0.0.0a0 - pynvml>=12.0.0,<13.0.0a0 - pytest - pytest-benchmark diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index e96b8d81953..4d1af2746ac 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -20,7 +20,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.1,<13.0a0 - cuda-sanitizer-api - cuda-version=12.9 - cupy>=12.0.0 @@ -55,8 +55,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.14.0,<0.15.0a0 -- numba>=0.59.1,<0.62.0a0 +- numba-cuda>=0.18.0,<0.19.0a0 +- numba>=0.60.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py @@ -70,7 +70,6 @@ dependencies: - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 -- pynvjitlink>=0.0.0a0 - pynvml>=12.0.0,<13.0.0a0 - pytest - pytest-benchmark diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index 38e32b5c1f2..3f69e8dcd2e 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -55,7 +55,7 @@ requirements: - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - - numba-cuda >=0.14.0,<0.15.0a0 + - numba-cuda >=0.18.0,<0.19.0a0 - libcudf =${{ version }} - pylibcudf =${{ version }} - rmm =${{ minor_version }} @@ -70,8 +70,8 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.4.0dev0 - cupy >=12.0.0 - - numba-cuda >=0.14.0,<0.15.0a0 - - numba >=0.59.1,<0.62.0a0 + - numba-cuda >=0.18.0,<0.19.0a0 + - numba >=0.60.0,<0.62.0a0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<20.0.0a0 - libcudf =${{ version }} @@ -79,13 +79,7 @@ requirements: - ${{ pin_compatible("rmm", upper_bound="x.x") }} - fsspec >=0.6.0 - cuda-cudart - # Needed by Numba for CUDA support - - cuda-nvcc-impl - # TODO: Add nvjitlink here - # xref: https://github.com/rapidsai/cudf/issues/12822 - - cuda-nvrtc - - cuda-python >=12.6.2,<13.0a0 - - pynvjitlink + - 
cuda-python >=12.9.1,<13.0a0 - if: linux and x86_64 then: - libcufile diff --git a/conda/recipes/pylibcudf/recipe.yaml b/conda/recipes/pylibcudf/recipe.yaml index 2d2cf0a630f..6ba6e189d0f 100644 --- a/conda/recipes/pylibcudf/recipe.yaml +++ b/conda/recipes/pylibcudf/recipe.yaml @@ -72,7 +72,7 @@ requirements: - libcudf =${{ version }} - ${{ pin_compatible("rmm", upper_bound="x.x") }} - fsspec >=0.6.0 - - cuda-python >=12.6.2,<13.0a0 + - cuda-python >=12.9.1,<13.0a0 - nvtx >=0.2.1 - packaging ignore_run_exports: diff --git a/dependencies.yaml b/dependencies.yaml index 7b240f5bc84..504a2b81f96 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -486,9 +486,19 @@ dependencies: - cython>=3.0.3 build_python_cudf: common: - - output_types: [conda, requirements, pyproject] + - output_types: [conda] packages: - - &numba_cuda numba-cuda>=0.14.0,<0.15.0a0 + - &numba_cuda numba-cuda>=0.18.0,<0.19.0a0 + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + packages: + - &numba_cuda_cu12 numba-cuda[cu12]>=0.18.0,<0.19.0a0 + - matrix: # Fallback for no matrix + packages: + - *numba_cuda_cu12 pyarrow_run: common: - output_types: [conda] @@ -646,14 +656,14 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: &run_pylibcudf_packages_all_cu12 - - cuda-python>=12.6.2,<13.0a0 + - cuda-python>=12.9.1,<13.0a0 - {matrix: null, packages: *run_pylibcudf_packages_all_cu12} run_cudf: common: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba numba>=0.59.1,<0.62.0a0 + - &numba numba>=0.60.0,<0.62.0a0 - nvtx>=0.2.1 - packaging - rich @@ -671,18 +681,13 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: &run_cudf_packages_all_cu12 - - cuda-python>=12.6.2,<13.0a0 + - cuda-python>=12.9.1,<13.0a0 - {matrix: null, packages: *run_cudf_packages_all_cu12} - - output_types: conda - matrices: - - matrix: {cuda: "12.*"} - packages: - - &pynvjitlink_unsuffixed pynvjitlink>=0.0.0a0 - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.*"} packages: - - &numba_cuda_cu12 numba-cuda[cu12]>=0.14.0,<0.15.0a0 + - *numba_cuda_cu12 - matrix: # Fallback for no matrix packages: - *numba_cuda_cu12 @@ -692,15 +697,9 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pynvjitlink-cu12>=0.0.0a0 - nvidia-cuda-nvcc-cu12 - nvidia-cuda-nvrtc-cu12 - - matrix: - cuda: "12.*" - cuda_suffixed: "false" - packages: &run_cudf_cu12_unsuffixed - - *pynvjitlink_unsuffixed - - {matrix: null, packages: *run_cudf_cu12_unsuffixed} + - {matrix: null, packages: []} run_cudf_polars: common: - output_types: [conda, requirements, pyproject] @@ -782,9 +781,9 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numba==0.59.1 - - numba-cuda==0.14.0 + - numba==0.60.0 - pandas==2.0.* + - numba-cuda==0.18.0 - matrix: {dependencies: "latest"} packages: - pandas==2.3.1 diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index 4cb755785e8..61f69cb8c71 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -267,7 +267,9 @@ def decref_managed_udf_string(context, builder, sig, args): context, builder, value=managed_ptr ) fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.IntType(8))]) - fn = cgutils.get_or_insert_function(builder.module, fnty, "NRT_decref") + fn = cgutils.get_or_insert_function( + builder.module, fnty, "NRT_decref_managed_string" + ) builder.call(fn, (managed.meminfo,)) return diff --git 
a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 8e770112a67..6cb02397aed 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -19,12 +19,12 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cachetools", - "cuda-python>=12.6.2,<13.0a0", + "cuda-python>=12.9.1,<13.0a0", "cupy-cuda12x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.10.*,>=0.0.0a0", - "numba-cuda[cu12]>=0.14.0,<0.15.0a0", - "numba>=0.59.1,<0.62.0a0", + "numba-cuda[cu12]>=0.18.0,<0.19.0a0", + "numba>=0.60.0,<0.62.0a0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", @@ -32,7 +32,6 @@ dependencies = [ "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", "pylibcudf==25.10.*,>=0.0.0a0", - "pynvjitlink>=0.0.0a0", "rich", "rmm==25.10.*,>=0.0.0a0", "typing_extensions>=4.0.0", @@ -126,7 +125,7 @@ requires = [ "libcudf==25.10.*,>=0.0.0a0", "librmm==25.10.*,>=0.0.0a0", "ninja", - "numba-cuda>=0.14.0,<0.15.0a0", + "numba-cuda[cu12]>=0.18.0,<0.19.0a0", "pylibcudf==25.10.*,>=0.0.0a0", "rmm==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf/udf_cpp/shim.cu b/python/cudf/udf_cpp/shim.cu index 535358bfc9a..d6f84a23299 100644 --- a/python/cudf/udf_cpp/shim.cu +++ b/python/cudf/udf_cpp/shim.cu @@ -75,6 +75,14 @@ __device__ NRT_MemInfo* make_meminfo_for_new_udf_string(udf_string* udf_str) } } +// Special decref called only by python after transferring ownership of output strings +// Must reset dtor with one that is part of the current module +extern "C" __device__ void NRT_decref_managed_string(NRT_MemInfo* mi) +{ + mi->dtor = udf_str_dtor; + NRT_decref(mi); +} + extern "C" __device__ int len(int* nb_retval, void const* str) { auto sv = reinterpret_cast(str); diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 85d8693b1c3..0c7f89111e3 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=12.6.2,<13.0a0", + "cuda-python>=12.9.1,<13.0a0", "libcudf==25.10.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", From 05c2bca3f82527e2145a95366523310613c1b876 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 12 Aug 2025 18:40:20 -0700 Subject: [PATCH 117/366] Add streams to sorting APIs (#19671) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19671 --- .../pylibcudf/pylibcudf/libcudf/sorting.pxd | 34 ++++-- python/pylibcudf/pylibcudf/sorting.pxd | 27 +++-- python/pylibcudf/pylibcudf/sorting.pyi | 22 +++- python/pylibcudf/pylibcudf/sorting.pyx | 102 +++++++++++++++--- 4 files changed, 150 insertions(+), 35 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd index a5fc13dc90e..0fcda9320dc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/sorting.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd @@ -16,19 +16,22 @@ from pylibcudf.libcudf.types cimport ( null_order, size_type ) +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: cdef unique_ptr[column] sorted_order( table_view source_table, vector[order] column_order, - 
vector[null_order] null_precedence + vector[null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] stable_sorted_order( table_view source_table, vector[order] column_order, - vector[null_order] null_precedence + vector[null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] rank( @@ -37,12 +40,15 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: order column_order, null_policy null_handling, null_order null_precedence, - bool percentage) except +libcudf_exception_handler + bool percentage, + cuda_stream_view stream + ) except +libcudf_exception_handler cdef bool is_sorted( const table_view& table, vector[order] column_order, - vector[null_order] null_precedence + vector[null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] segmented_sort_by_key( @@ -50,7 +56,8 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const table_view& keys, const column_view& segment_offsets, vector[order] column_order, - vector[null_order] null_precedence + vector[null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] stable_segmented_sort_by_key( @@ -58,43 +65,50 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const table_view& keys, const column_view& segment_offsets, vector[order] column_order, - vector[null_order] null_precedence + vector[null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] sort_by_key( const table_view& values, const table_view& keys, vector[order] column_order, - vector[null_order] null_precedence + vector[null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] stable_sort_by_key( const table_view& values, const table_view& keys, vector[order] column_order, - vector[null_order] null_precedence + vector[null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] sort( table_view source_table, vector[order] column_order, - vector[null_order] null_precedence + vector[null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] stable_sort( table_view source_table, vector[order] column_order, - vector[null_order] null_precedence + vector[null_order] null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] top_k( const column_view& col, size_type k, order sort_order, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] top_k_order( const column_view& col, size_type k, order sort_order, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/sorting.pxd b/python/pylibcudf/pylibcudf/sorting.pxd index 91f8354f965..2262238c966 100644 --- a/python/pylibcudf/pylibcudf/sorting.pxd +++ b/python/pylibcudf/pylibcudf/sorting.pxd @@ -3,17 +3,21 @@ from libcpp cimport bool from pylibcudf.libcudf.aggregation cimport rank_method from pylibcudf.libcudf.types cimport null_order, null_policy, order, size_type +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table -cpdef Column sorted_order(Table source_table, list column_order, list null_precedence) +cpdef Column sorted_order( + Table source_table, list column_order, list null_precedence, Stream stream=* 
+) cpdef Column stable_sorted_order( Table source_table, list column_order, list null_precedence, + Stream stream=* ) cpdef Column rank( @@ -23,9 +27,12 @@ cpdef Column rank( null_policy null_handling, null_order null_precedence, bool percentage, + Stream stream=* ) -cpdef bool is_sorted(Table table, list column_order, list null_precedence) +cpdef bool is_sorted( + Table table, list column_order, list null_precedence, Stream stream=* +) cpdef Table segmented_sort_by_key( Table values, @@ -33,6 +40,7 @@ cpdef Table segmented_sort_by_key( Column segment_offsets, list column_order, list null_precedence, + Stream stream=* ) cpdef Table stable_segmented_sort_by_key( @@ -41,6 +49,7 @@ cpdef Table stable_segmented_sort_by_key( Column segment_offsets, list column_order, list null_precedence, + Stream stream=* ) cpdef Table sort_by_key( @@ -48,6 +57,7 @@ cpdef Table sort_by_key( Table keys, list column_order, list null_precedence, + Stream stream=* ) cpdef Table stable_sort_by_key( @@ -55,12 +65,17 @@ cpdef Table stable_sort_by_key( Table keys, list column_order, list null_precedence, + Stream stream=* ) -cpdef Table sort(Table source_table, list column_order, list null_precedence) +cpdef Table sort( + Table source_table, list column_order, list null_precedence, Stream stream=* +) -cpdef Table stable_sort(Table source_table, list column_order, list null_precedence) +cpdef Table stable_sort( + Table source_table, list column_order, list null_precedence, Stream stream=* +) -cpdef Column top_k(Column col, size_type k, order sort_order = *) +cpdef Column top_k(Column col, size_type k, order sort_order=*, Stream stream=*) -cpdef Column top_k_order(Column col, size_type k, order sort_order = *) +cpdef Column top_k_order(Column col, size_type k, order sort_order=*, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/sorting.pyi b/python/pylibcudf/pylibcudf/sorting.pyi index 07ad962d0ce..4ff529631e3 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyi +++ b/python/pylibcudf/pylibcudf/sorting.pyi @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm.stream import Stream + from pylibcudf.aggregation import RankMethod from pylibcudf.column import Column from pylibcudf.table import Table @@ -9,11 +11,13 @@ def sorted_order( source_table: Table, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Column: ... def stable_sorted_order( source_table: Table, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Column: ... def rank( input_view: Column, @@ -22,9 +26,13 @@ def rank( null_handling: NullPolicy, null_precedence: NullOrder, percentage: bool, + stream: Stream | None = None, ) -> Column: ... def is_sorted( - tbl: Table, column_order: list[Order], null_precedence: list[NullOrder] + tbl: Table, + column_order: list[Order], + null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> bool: ... def segmented_sort_by_key( values: Table, @@ -32,6 +40,7 @@ def segmented_sort_by_key( segment_offsets: Column, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Table: ... def stable_segmented_sort_by_key( values: Table, @@ -39,34 +48,43 @@ def stable_segmented_sort_by_key( segment_offsets: Column, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Table: ... 
def sort_by_key( values: Table, keys: Table, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Table: ... def stable_sort_by_key( values: Table, keys: Table, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Table: ... def sort( source_table: Table, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Table: ... def stable_sort( source_table: Table, column_order: list[Order], null_precedence: list[NullOrder], + stream: Stream | None = None, ) -> Table: ... def top_k( col: Column, k: int, sort_order: Order = Order.DESCENDING, + stream: Stream | None = None, ) -> Column: ... def top_k_order( - col: Column, k: int, sort_order: Order = Order.DESCENDING + col: Column, + k: int, + sort_order: Order = Order.DESCENDING, + stream: Stream | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index 31efc018d6d..a1f28b7762b 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -8,9 +8,11 @@ from pylibcudf.libcudf.aggregation cimport rank_method from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport null_order, null_policy, order, size_type +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table +from .utils cimport _get_stream __all__ = [ "is_sorted", @@ -25,7 +27,9 @@ __all__ = [ "stable_sorted_order", ] -cpdef Column sorted_order(Table source_table, list column_order, list null_precedence): +cpdef Column sorted_order( + Table source_table, list column_order, list null_precedence, Stream stream=None +): """Computes the row indices required to sort the table. For details, see :cpp:func:`sorted_order`. @@ -47,19 +51,24 @@ cpdef Column sorted_order(Table source_table, list column_order, list null_prece cdef unique_ptr[column] c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.sorted_order( source_table.view(), c_orders, c_null_precedence, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column stable_sorted_order( Table source_table, list column_order, list null_precedence, + Stream stream=None ): """Computes the row indices required to sort the table, preserving order of equal elements. @@ -83,13 +92,17 @@ cpdef Column stable_sorted_order( cdef unique_ptr[column] c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.stable_sorted_order( source_table.view(), c_orders, c_null_precedence, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column rank( @@ -99,6 +112,7 @@ cpdef Column rank( null_policy null_handling, null_order null_precedence, bool percentage, + Stream stream=None ): """Computes the rank of each element in the column. @@ -125,6 +139,9 @@ cpdef Column rank( The rank of each element in the column. 
""" cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.rank( input_view.view(), @@ -133,11 +150,14 @@ cpdef Column rank( null_handling, null_precedence, percentage, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef bool is_sorted(Table tbl, list column_order, list null_precedence): +cpdef bool is_sorted( + Table tbl, list column_order, list null_precedence, Stream stream=None +): """Checks if the table is sorted. For details, see :cpp:func:`is_sorted`. @@ -159,11 +179,15 @@ cpdef bool is_sorted(Table tbl, list column_order, list null_precedence): cdef bool c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.is_sorted( tbl.view(), c_orders, c_null_precedence, + stream.view() ) return c_result @@ -174,6 +198,7 @@ cpdef Table segmented_sort_by_key( Column segment_offsets, list column_order, list null_precedence, + Stream stream=None ): """Sorts the table by key, within segments. @@ -200,6 +225,9 @@ cpdef Table segmented_sort_by_key( cdef unique_ptr[table] c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.segmented_sort_by_key( values.view(), @@ -207,8 +235,9 @@ cpdef Table segmented_sort_by_key( segment_offsets.view(), c_orders, c_null_precedence, + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef Table stable_segmented_sort_by_key( @@ -217,6 +246,7 @@ cpdef Table stable_segmented_sort_by_key( Column segment_offsets, list column_order, list null_precedence, + Stream stream=None ): """Sorts the table by key preserving order of equal elements, within segments. @@ -244,6 +274,9 @@ cpdef Table stable_segmented_sort_by_key( cdef unique_ptr[table] c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.stable_segmented_sort_by_key( values.view(), @@ -251,8 +284,9 @@ cpdef Table stable_segmented_sort_by_key( segment_offsets.view(), c_orders, c_null_precedence, + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef Table sort_by_key( @@ -260,6 +294,7 @@ cpdef Table sort_by_key( Table keys, list column_order, list null_precedence, + Stream stream=None ): """Sorts the table by key. @@ -284,14 +319,18 @@ cpdef Table sort_by_key( cdef unique_ptr[table] c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.sort_by_key( values.view(), keys.view(), c_orders, c_null_precedence, + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef Table stable_sort_by_key( @@ -299,6 +338,7 @@ cpdef Table stable_sort_by_key( Table keys, list column_order, list null_precedence, + Stream stream=None ): """Sorts the table by key preserving order of equal elements. 
@@ -323,17 +363,23 @@ cpdef Table stable_sort_by_key( cdef unique_ptr[table] c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.stable_sort_by_key( values.view(), keys.view(), c_orders, c_null_precedence, + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) -cpdef Table sort(Table source_table, list column_order, list null_precedence): +cpdef Table sort( + Table source_table, list column_order, list null_precedence, Stream stream=None +): """Sorts the table. For details, see :cpp:func:`sort`. @@ -355,16 +401,22 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cdef unique_ptr[table] c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.sort( source_table.view(), c_orders, c_null_precedence, + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) -cpdef Table stable_sort(Table source_table, list column_order, list null_precedence): +cpdef Table stable_sort( + Table source_table, list column_order, list null_precedence, Stream stream=None +): """Sorts the table preserving order of equal elements. For details, see :cpp:func:`stable_sort`. @@ -386,16 +438,22 @@ cpdef Table stable_sort(Table source_table, list column_order, list null_precede cdef unique_ptr[table] c_result cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.stable_sort( source_table.view(), c_orders, c_null_precedence, + stream.view() ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) -cpdef Column top_k(Column col, size_type k, order sort_order = order.DESCENDING): +cpdef Column top_k( + Column col, size_type k, order sort_order = order.DESCENDING, Stream stream=None +): """ Computes the top-k values of a column. @@ -417,16 +475,22 @@ cpdef Column top_k(Column col, size_type k, order sort_order = order.DESCENDING) A column of the top ``k`` elements from the input. """ cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.top_k( col.view(), k, sort_order, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column top_k_order(Column col, size_type k, order sort_order = order.DESCENDING): +cpdef Column top_k_order( + Column col, size_type k, order sort_order = order.DESCENDING, Stream stream=None +): """ Computes the indices of the top-k values of a column. @@ -451,10 +515,14 @@ cpdef Column top_k_order(Column col, size_type k, order sort_order = order.DESCE A column of the indices of the top ``k`` elements. 
""" cdef unique_ptr[column] c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_sorting.top_k_order( col.view(), k, sort_order, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) From 97a3014bf4f7569fd8df373040a7cb2e09d2bd65 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 12 Aug 2025 18:43:56 -0700 Subject: [PATCH 118/366] Add streams to pylibcudf join APIs (#19672) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19672 --- python/pylibcudf/pylibcudf/join.pxd | 40 ++-- python/pylibcudf/pylibcudf/join.pyi | 61 +++++- python/pylibcudf/pylibcudf/join.pyx | 210 +++++++++++++++----- python/pylibcudf/pylibcudf/libcudf/join.pxd | 56 ++++-- 4 files changed, 278 insertions(+), 89 deletions(-) diff --git a/python/pylibcudf/pylibcudf/join.pxd b/python/pylibcudf/pylibcudf/join.pxd index bb9162b466a..e5aa8be2261 100644 --- a/python/pylibcudf/pylibcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/join.pxd @@ -1,6 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.libcudf.types cimport null_equality +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .expressions cimport Expression @@ -10,63 +11,73 @@ from .table cimport Table cpdef tuple inner_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) cpdef tuple left_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) cpdef tuple full_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) cpdef Column left_semi_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) cpdef Column left_anti_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) -cpdef Table cross_join(Table left, Table right) +cpdef Table cross_join(Table left, Table right, Stream stream=*) cpdef tuple conditional_inner_join( Table left, Table right, Expression binary_predicate, + Stream stream=* ) cpdef tuple conditional_left_join( Table left, Table right, Expression binary_predicate, + Stream stream=* ) cpdef tuple conditional_full_join( Table left, Table right, Expression binary_predicate, + Stream stream=* ) cpdef Column conditional_left_semi_join( Table left, Table right, Expression binary_predicate, + Stream stream=* ) cpdef Column conditional_left_anti_join( Table left, Table right, Expression binary_predicate, + Stream stream=* ) cpdef tuple mixed_inner_join( @@ -75,7 +86,8 @@ cpdef tuple mixed_inner_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) cpdef tuple mixed_left_join( @@ -84,7 +96,8 @@ cpdef tuple mixed_left_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) cpdef tuple mixed_full_join( @@ -93,7 +106,8 @@ cpdef tuple mixed_full_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) cpdef Column mixed_left_semi_join( @@ -102,7 +116,8 @@ cpdef 
Column mixed_left_semi_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) cpdef Column mixed_left_anti_join( @@ -111,5 +126,6 @@ cpdef Column mixed_left_anti_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=* ) diff --git a/python/pylibcudf/pylibcudf/join.pyi b/python/pylibcudf/pylibcudf/join.pyi index f34357baa67..5008bbd2a94 100644 --- a/python/pylibcudf/pylibcudf/join.pyi +++ b/python/pylibcudf/pylibcudf/join.pyi @@ -1,40 +1,74 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from rmm.pylibrmm import Stream + from pylibcudf.column import Column from pylibcudf.expressions import Expression from pylibcudf.table import Table from pylibcudf.types import NullEquality def inner_join( - left_keys: Table, right_keys: Table, nulls_equal: NullEquality + left_keys: Table, + right_keys: Table, + nulls_equal: NullEquality, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... def left_join( - left_keys: Table, right_keys: Table, nulls_equal: NullEquality + left_keys: Table, + right_keys: Table, + nulls_equal: NullEquality, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... def full_join( - left_keys: Table, right_keys: Table, nulls_equal: NullEquality + left_keys: Table, + right_keys: Table, + nulls_equal: NullEquality, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... def left_semi_join( - left_keys: Table, right_keys: Table, nulls_equal: NullEquality + left_keys: Table, + right_keys: Table, + nulls_equal: NullEquality, + stream: Stream | None = None, ) -> Column: ... def left_anti_join( - left_keys: Table, right_keys: Table, nulls_equal: NullEquality + left_keys: Table, + right_keys: Table, + nulls_equal: NullEquality, + stream: Stream | None = None, ) -> Column: ... -def cross_join(left: Table, right: Table) -> Table: ... +def cross_join( + left: Table, right: Table, stream: Stream | None = None +) -> Table: ... def conditional_inner_join( - left: Table, right: Table, binary_predicate: Expression + left: Table, + right: Table, + binary_predicate: Expression, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... def conditional_left_join( - left: Table, right: Table, binary_predicate: Expression + left: Table, + right: Table, + binary_predicate: Expression, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... def conditional_full_join( - left: Table, right: Table, binary_predicate: Expression + left: Table, + right: Table, + binary_predicate: Expression, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... def conditional_left_semi_join( - left: Table, right: Table, binary_predicate: Expression + left: Table, + right: Table, + binary_predicate: Expression, + stream: Stream | None = None, ) -> Column: ... def conditional_left_anti_join( - left: Table, right: Table, binary_predicate: Expression + left: Table, + right: Table, + binary_predicate: Expression, + stream: Stream | None = None, ) -> Column: ... def mixed_inner_join( left_keys: Table, @@ -43,6 +77,7 @@ def mixed_inner_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... 
def mixed_left_join( left_keys: Table, @@ -51,6 +86,7 @@ def mixed_left_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... def mixed_full_join( left_keys: Table, @@ -59,6 +95,7 @@ def mixed_full_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, + stream: Stream | None = None, ) -> tuple[Column, Column]: ... def mixed_left_semi_join( left_keys: Table, @@ -67,6 +104,7 @@ def mixed_left_semi_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, + stream: Stream | None = None, ) -> Column: ... def mixed_left_anti_join( left_keys: Table, @@ -75,4 +113,5 @@ def mixed_left_anti_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, + stream: Stream | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index c2efe05ffc4..a9261345db5 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -1,8 +1,10 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from cython.operator import dereference +from libc.stddef cimport size_t from libcpp.memory cimport make_unique, unique_ptr +from libcpp.optional cimport optional from libcpp.utility cimport move from pylibcudf.libcudf cimport join as cpp_join from pylibcudf.libcudf.column.column cimport column @@ -10,10 +12,12 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport null_equality from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .expressions cimport Expression from .table cimport Table +from .utils cimport _get_stream __all__ = [ "conditional_full_join", @@ -34,7 +38,7 @@ __all__ = [ "mixed_left_semi_join", ] -cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): +cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map, Stream stream): # helper to convert a gather map to a Column return Column.from_libcudf( move( @@ -43,14 +47,16 @@ cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): device_buffer(), 0 ) - ) + ), + stream ) cpdef tuple inner_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform an inner join between two tables. @@ -72,18 +78,24 @@ cpdef tuple inner_join( join. """ cdef cpp_join.gather_map_pair_type c_result + + stream = _get_stream(stream) + with nogil: - c_result = cpp_join.inner_join(left_keys.view(), right_keys.view(), nulls_equal) + c_result = cpp_join.inner_join( + left_keys.view(), right_keys.view(), nulls_equal, stream.view() + ) return ( - _column_from_gather_map(move(c_result.first)), - _column_from_gather_map(move(c_result.second)), + _column_from_gather_map(move(c_result.first), stream), + _column_from_gather_map(move(c_result.second), stream), ) cpdef tuple left_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform a left join between two tables. @@ -105,18 +117,24 @@ cpdef tuple left_join( join. 
""" cdef cpp_join.gather_map_pair_type c_result + + stream = _get_stream(stream) + with nogil: - c_result = cpp_join.left_join(left_keys.view(), right_keys.view(), nulls_equal) + c_result = cpp_join.left_join( + left_keys.view(), right_keys.view(), nulls_equal, stream.view() + ) return ( - _column_from_gather_map(move(c_result.first)), - _column_from_gather_map(move(c_result.second)), + _column_from_gather_map(move(c_result.first), stream), + _column_from_gather_map(move(c_result.second), stream), ) cpdef tuple full_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform a full join between two tables. @@ -138,18 +156,24 @@ cpdef tuple full_join( join. """ cdef cpp_join.gather_map_pair_type c_result + + stream = _get_stream(stream) + with nogil: - c_result = cpp_join.full_join(left_keys.view(), right_keys.view(), nulls_equal) + c_result = cpp_join.full_join( + left_keys.view(), right_keys.view(), nulls_equal, stream.view() + ) return ( - _column_from_gather_map(move(c_result.first)), - _column_from_gather_map(move(c_result.second)), + _column_from_gather_map(move(c_result.first), stream), + _column_from_gather_map(move(c_result.second), stream), ) cpdef Column left_semi_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform a left semi join between two tables. @@ -170,19 +194,24 @@ cpdef Column left_semi_join( A column containing the row indices from the left table after the join. """ cdef cpp_join.gather_map_type c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.left_semi_join( left_keys.view(), right_keys.view(), - nulls_equal + nulls_equal, + stream.view() ) - return _column_from_gather_map(move(c_result)) + return _column_from_gather_map(move(c_result), stream) cpdef Column left_anti_join( Table left_keys, Table right_keys, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform a left anti join between two tables. @@ -203,16 +232,20 @@ cpdef Column left_anti_join( A column containing the row indices from the left table after the join. """ cdef cpp_join.gather_map_type c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.left_anti_join( left_keys.view(), right_keys.view(), - nulls_equal + nulls_equal, + stream.view() ) - return _column_from_gather_map(move(c_result)) + return _column_from_gather_map(move(c_result), stream) -cpdef Table cross_join(Table left, Table right): +cpdef Table cross_join(Table left, Table right, Stream stream=None): """Perform a cross join on two tables. For details see :cpp:func:`cross_join`. @@ -230,15 +263,19 @@ cpdef Table cross_join(Table left, Table right): The result of cross joining the two inputs. """ cdef unique_ptr[table] result + + stream = _get_stream(stream) + with nogil: - result = cpp_join.cross_join(left.view(), right.view()) - return Table.from_libcudf(move(result)) + result = cpp_join.cross_join(left.view(), right.view(), stream.view()) + return Table.from_libcudf(move(result), stream) cpdef tuple conditional_inner_join( Table left, Table right, Expression binary_predicate, + Stream stream=None ): """Perform a conditional inner join between two tables. @@ -260,13 +297,21 @@ cpdef tuple conditional_inner_join( join. 
""" cdef cpp_join.gather_map_pair_type c_result + cdef optional[size_t] output_size + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.conditional_inner_join( - left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + left.view(), + right.view(), + dereference(binary_predicate.c_obj.get()), + output_size, + stream.view() ) return ( - _column_from_gather_map(move(c_result.first)), - _column_from_gather_map(move(c_result.second)), + _column_from_gather_map(move(c_result.first), stream), + _column_from_gather_map(move(c_result.second), stream), ) @@ -274,6 +319,7 @@ cpdef tuple conditional_left_join( Table left, Table right, Expression binary_predicate, + Stream stream=None ): """Perform a conditional left join between two tables. @@ -295,13 +341,21 @@ cpdef tuple conditional_left_join( join. """ cdef cpp_join.gather_map_pair_type c_result + cdef optional[size_t] output_size + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.conditional_left_join( - left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + left.view(), + right.view(), + dereference(binary_predicate.c_obj.get()), + output_size, + stream.view() ) return ( - _column_from_gather_map(move(c_result.first)), - _column_from_gather_map(move(c_result.second)), + _column_from_gather_map(move(c_result.first), stream), + _column_from_gather_map(move(c_result.second), stream), ) @@ -309,6 +363,7 @@ cpdef tuple conditional_full_join( Table left, Table right, Expression binary_predicate, + Stream stream=None ): """Perform a conditional full join between two tables. @@ -330,13 +385,19 @@ cpdef tuple conditional_full_join( join. """ cdef cpp_join.gather_map_pair_type c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.conditional_full_join( - left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + left.view(), + right.view(), + dereference(binary_predicate.c_obj.get()), + stream.view() ) return ( - _column_from_gather_map(move(c_result.first)), - _column_from_gather_map(move(c_result.second)), + _column_from_gather_map(move(c_result.first), stream), + _column_from_gather_map(move(c_result.second), stream), ) @@ -344,6 +405,7 @@ cpdef Column conditional_left_semi_join( Table left, Table right, Expression binary_predicate, + Stream stream=None ): """Perform a conditional left semi join between two tables. @@ -364,17 +426,26 @@ cpdef Column conditional_left_semi_join( A column containing the row indices from the left table after the join. """ cdef cpp_join.gather_map_type c_result + cdef optional[size_t] output_size + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.conditional_left_semi_join( - left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + left.view(), + right.view(), + dereference(binary_predicate.c_obj.get()), + output_size, + stream.view() ) - return _column_from_gather_map(move(c_result)) + return _column_from_gather_map(move(c_result), stream) cpdef Column conditional_left_anti_join( Table left, Table right, Expression binary_predicate, + Stream stream=None ): """Perform a conditional left anti join between two tables. @@ -395,11 +466,19 @@ cpdef Column conditional_left_anti_join( A column containing the row indices from the left table after the join. 
""" cdef cpp_join.gather_map_type c_result + cdef optional[size_t] output_size + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.conditional_left_anti_join( - left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + left.view(), + right.view(), + dereference(binary_predicate.c_obj.get()), + output_size, + stream.view() ) - return _column_from_gather_map(move(c_result)) + return _column_from_gather_map(move(c_result), stream) cpdef tuple mixed_inner_join( @@ -408,7 +487,8 @@ cpdef tuple mixed_inner_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform a mixed inner join between two tables. @@ -436,6 +516,10 @@ cpdef tuple mixed_inner_join( join. """ cdef cpp_join.gather_map_pair_type c_result + cdef cpp_join.output_size_data_type empty_optional + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.mixed_inner_join( left_keys.view(), @@ -444,10 +528,12 @@ cpdef tuple mixed_inner_join( right_conditional.view(), dereference(binary_predicate.c_obj.get()), nulls_equal, + empty_optional, + stream.view() ) return ( - _column_from_gather_map(move(c_result.first)), - _column_from_gather_map(move(c_result.second)), + _column_from_gather_map(move(c_result.first), stream), + _column_from_gather_map(move(c_result.second), stream), ) @@ -457,7 +543,8 @@ cpdef tuple mixed_left_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform a mixed left join between two tables. @@ -485,6 +572,10 @@ cpdef tuple mixed_left_join( join. """ cdef cpp_join.gather_map_pair_type c_result + cdef cpp_join.output_size_data_type empty_optional + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.mixed_left_join( left_keys.view(), @@ -493,10 +584,12 @@ cpdef tuple mixed_left_join( right_conditional.view(), dereference(binary_predicate.c_obj.get()), nulls_equal, + empty_optional, + stream.view() ) return ( - _column_from_gather_map(move(c_result.first)), - _column_from_gather_map(move(c_result.second)), + _column_from_gather_map(move(c_result.first), stream), + _column_from_gather_map(move(c_result.second), stream), ) @@ -506,7 +599,8 @@ cpdef tuple mixed_full_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform a mixed full join between two tables. @@ -534,6 +628,10 @@ cpdef tuple mixed_full_join( join. """ cdef cpp_join.gather_map_pair_type c_result + cdef cpp_join.output_size_data_type empty_optional + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.mixed_full_join( left_keys.view(), @@ -542,10 +640,12 @@ cpdef tuple mixed_full_join( right_conditional.view(), dereference(binary_predicate.c_obj.get()), nulls_equal, + empty_optional, + stream.view() ) return ( - _column_from_gather_map(move(c_result.first)), - _column_from_gather_map(move(c_result.second)), + _column_from_gather_map(move(c_result.first), stream), + _column_from_gather_map(move(c_result.second), stream), ) @@ -555,7 +655,8 @@ cpdef Column mixed_left_semi_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform a mixed left semi join between two tables. 
@@ -582,6 +683,9 @@ cpdef Column mixed_left_semi_join( A column containing the row indices from the left table after the join. """ cdef cpp_join.gather_map_type c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.mixed_left_semi_join( left_keys.view(), @@ -590,8 +694,9 @@ cpdef Column mixed_left_semi_join( right_conditional.view(), dereference(binary_predicate.c_obj.get()), nulls_equal, + stream.view() ) - return _column_from_gather_map(move(c_result)) + return _column_from_gather_map(move(c_result), stream) cpdef Column mixed_left_anti_join( @@ -600,7 +705,8 @@ cpdef Column mixed_left_anti_join( Table left_conditional, Table right_conditional, Expression binary_predicate, - null_equality nulls_equal + null_equality nulls_equal, + Stream stream=None ): """Perform a mixed left anti join between two tables. @@ -627,6 +733,9 @@ cpdef Column mixed_left_anti_join( A column containing the row indices from the left table after the join. """ cdef cpp_join.gather_map_type c_result + + stream = _get_stream(stream) + with nogil: c_result = cpp_join.mixed_left_anti_join( left_keys.view(), @@ -635,5 +744,6 @@ cpdef Column mixed_left_anti_join( right_conditional.view(), dereference(binary_predicate.c_obj.get()), nulls_equal, + stream.view() ) - return _column_from_gather_map(move(c_result)) + return _column_from_gather_map(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 3810bdb6798..111576ea1d9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -12,71 +12,85 @@ from pylibcudf.libcudf.expressions cimport expression from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_equality, size_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view from rmm.librmm.device_uvector cimport device_uvector +from pylibcudf.libcudf.utilities.span cimport device_span ctypedef unique_ptr[device_uvector[size_type]] gather_map_type ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type +ctypedef optional[pair[size_t, device_span[const size_type]]] output_size_data_type cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: cdef gather_map_pair_type inner_join( const table_view left_keys, const table_view right_keys, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type left_join( const table_view left_keys, const table_view right_keys, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type full_join( const table_view left_keys, const table_view right_keys, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type left_semi_join( const table_view left_keys, const table_view right_keys, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type left_anti_join( const table_view left_keys, const table_view right_keys, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type inner_join( const table_view left_keys, const table_view right_keys, null_equality nulls_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type left_join( const table_view left_keys, const table_view right_keys, null_equality nulls_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type full_join( const table_view left_keys, const table_view 
right_keys, null_equality nulls_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type left_semi_join( const table_view left_keys, const table_view right_keys, null_equality nulls_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type left_anti_join( const table_view left_keys, const table_view right_keys, null_equality nulls_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] cross_join( const table_view left, const table_view right, + cuda_stream_view stream ) except + cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: @@ -84,65 +98,67 @@ cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view left, const table_view right, const expression binary_predicate, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type conditional_inner_join( const table_view left, const table_view right, const expression binary_predicate, - optional[size_t] output_size + optional[size_t] output_size, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type conditional_left_join( const table_view left, const table_view right, const expression binary_predicate, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type conditional_left_join( const table_view left, const table_view right, const expression binary_predicate, - optional[size_t] output_size + optional[size_t] output_size, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type conditional_full_join( const table_view left, const table_view right, const expression binary_predicate, - ) except +libcudf_exception_handler - - cdef gather_map_pair_type conditional_full_join( - const table_view left, - const table_view right, - const expression binary_predicate, - optional[size_t] output_size + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type conditional_left_semi_join( const table_view left, const table_view right, const expression binary_predicate, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type conditional_left_semi_join( const table_view left, const table_view right, const expression binary_predicate, - optional[size_t] output_size + optional[size_t] output_size, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type conditional_left_anti_join( const table_view left, const table_view right, const expression binary_predicate, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type conditional_left_anti_join( const table_view left, const table_view right, const expression binary_predicate, - optional[size_t] output_size + optional[size_t] output_size, + cuda_stream_view stream ) except +libcudf_exception_handler cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: @@ -152,7 +168,9 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view left_conditional, const table_view right_conditional, const expression binary_predicate, - null_equality compare_nulls + null_equality compare_nulls, + output_size_data_type output_size_data, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type mixed_left_join( @@ -161,7 +179,9 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view left_conditional, const table_view right_conditional, const expression 
binary_predicate, - null_equality compare_nulls + null_equality compare_nulls, + output_size_data_type output_size_data, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_pair_type mixed_full_join( @@ -170,7 +190,9 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view left_conditional, const table_view right_conditional, const expression binary_predicate, - null_equality compare_nulls + null_equality compare_nulls, + output_size_data_type output_size_data, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type mixed_left_semi_join( @@ -179,7 +201,8 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view left_conditional, const table_view right_conditional, const expression binary_predicate, - null_equality compare_nulls + null_equality compare_nulls, + cuda_stream_view stream ) except +libcudf_exception_handler cdef gather_map_type mixed_left_anti_join( @@ -188,5 +211,6 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view left_conditional, const table_view right_conditional, const expression binary_predicate, - null_equality compare_nulls + null_equality compare_nulls, + cuda_stream_view stream ) except +libcudf_exception_handler From 5bc06674e2455bdc745a8322608417da0d870327 Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Wed, 13 Aug 2025 13:57:10 +0100 Subject: [PATCH 119/366] [FEA] Refactor AST `operator_functor`s for use in JIT-compiled CUDA (#19541) This pull request restructures the AST headers so that they can be used in online-compiled (JIT-compiled) CUDA, and adds an operator-to-string utility needed for CUDA code generation. It also fixes the `std::` namespace references in the floating_conversion header to use `cuda::std::`. Precedes https://github.com/rapidsai/cudf/pull/19467 Authors: - Basit Ayantunde (https://github.com/lamarrr) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/19541 --- cpp/include/cudf/ast/ast_operator.hpp | 101 +++ .../cudf/ast/detail/operator_functor.cuh | 760 ++++++++++++++++++ cpp/include/cudf/ast/detail/operators.cuh | 752 +---------------- cpp/include/cudf/ast/detail/operators.hpp | 2 + cpp/include/cudf/ast/detail/possibly_null.cuh | 45 ++ cpp/include/cudf/ast/expressions.hpp | 75 +- cpp/include/cudf/fixed_point/conv.hpp | 113 +++ .../detail/floating_conversion.hpp | 22 +- cpp/include/cudf/unary.hpp | 79 -- cpp/src/ast/operators.cpp | 57 ++ cpp/src/binaryop/compiled/binary_ops.cuh | 1 + cpp/src/quantiles/quantiles_util.hpp | 1 + .../quantiles/tdigest/tdigest_aggregation.cu | 1 + cpp/src/unary/cast_ops.cu | 1 + cpp/tests/fixed_point/fixed_point_tests.cpp | 3 +- 15 files changed, 1106 insertions(+), 907 deletions(-) create mode 100644 cpp/include/cudf/ast/ast_operator.hpp create mode 100644 cpp/include/cudf/ast/detail/operator_functor.cuh create mode 100644 cpp/include/cudf/ast/detail/possibly_null.cuh create mode 100644 cpp/include/cudf/fixed_point/conv.hpp diff --git a/cpp/include/cudf/ast/ast_operator.hpp b/cpp/include/cudf/ast/ast_operator.hpp new file mode 100644 index 00000000000..397a3550143 --- /dev/null +++ b/cpp/include/cudf/ast/ast_operator.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace CUDF_EXPORT cudf { + +namespace ast { +/** + * @addtogroup expressions + * @{ + * @file + */ + +/** + * @brief Enum of supported operators. + */ +enum class ast_operator : int32_t { + // Binary operators + ADD, ///< operator + + SUB, ///< operator - + MUL, ///< operator * + DIV, ///< operator / using common type of lhs and rhs + TRUE_DIV, ///< operator / after promoting type to floating point + FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then + ///< flooring the result + MOD, ///< operator % + PYMOD, ///< operator % using Python's sign rules for negatives + POW, ///< lhs ^ rhs + EQUAL, ///< operator == + NULL_EQUAL, ///< operator == with Spark rules: NULL_EQUAL(null, null) is true, NULL_EQUAL(null, + ///< valid) is false, and + ///< NULL_EQUAL(valid, valid) == EQUAL(valid, valid) + NOT_EQUAL, ///< operator != + LESS, ///< operator < + GREATER, ///< operator > + LESS_EQUAL, ///< operator <= + GREATER_EQUAL, ///< operator >= + BITWISE_AND, ///< operator & + BITWISE_OR, ///< operator | + BITWISE_XOR, ///< operator ^ + LOGICAL_AND, ///< operator && + NULL_LOGICAL_AND, ///< operator && with Spark rules: NULL_LOGICAL_AND(null, null) is null, + ///< NULL_LOGICAL_AND(null, true) is + ///< null, NULL_LOGICAL_AND(null, false) is false, and NULL_LOGICAL_AND(valid, + ///< valid) == LOGICAL_AND(valid, valid) + LOGICAL_OR, ///< operator || + NULL_LOGICAL_OR, ///< operator || with Spark rules: NULL_LOGICAL_OR(null, null) is null, + ///< NULL_LOGICAL_OR(null, true) is true, + ///< NULL_LOGICAL_OR(null, false) is null, and NULL_LOGICAL_OR(valid, valid) == + ///< LOGICAL_OR(valid, valid) + // Unary operators + IDENTITY, ///< Identity function + IS_NULL, ///< Check if operand is null + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT, ///< Logical Not (!) 
+ CAST_TO_INT64, ///< Cast value to int64_t + CAST_TO_UINT64, ///< Cast value to uint64_t + CAST_TO_FLOAT64 ///< Cast value to double +}; + +/** @} */ // end of group +} // namespace ast + +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/operator_functor.cuh b/cpp/include/cudf/ast/detail/operator_functor.cuh new file mode 100644 index 00000000000..640c7548c80 --- /dev/null +++ b/cpp/include/cudf/ast/detail/operator_functor.cuh @@ -0,0 +1,760 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace CUDF_EXPORT cudf { +namespace ast::detail { + +/** + * @brief Operator functor. + * + * This functor is templated on an `ast_operator`, with each template specialization defining a + * callable `operator()` that executes the operation. The functor specialization also has a member + * `arity` defining the number of operands that are accepted by the call to `operator()`. The + * `operator()` is templated on the types of its inputs (e.g. `typename LHS` and `typename RHS` for + * a binary operator). Trailing return types are defined as `decltype(result)` where `result` is + * the returned value. The trailing return types allow SFINAE to only consider template + * instantiations for valid combinations of types. This, in turn, allows the operator functors to be + * used with traits like `is_valid_binary_op` that rely on `std::is_invocable` and related features. + * + * @tparam op AST operator. 
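+ *
+ * For example (an illustrative sketch, not a definition from this header):
+ *
+ * @code
+ * using add_op = operator_functor<ast_operator::ADD, false>;
+ * static_assert(add_op::arity == 2);
+ * // The trailing return type decltype(lhs + rhs) lets invalid type mixes
+ * // fail SFINAE instead of producing a hard error:
+ * static_assert(cuda::std::is_invocable_v<add_op, int, int>);
+ * static_assert(!cuda::std::is_invocable_v<add_op, int, void*>);
+ * @endcode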
+ */ +template +struct operator_functor {}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs + rhs) + { + return lhs + rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs - rhs) + { + return lhs - rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs * rhs) + { + return lhs * rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs / rhs) + { + return lhs / rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(static_cast(lhs) / static_cast(rhs)) + { + return static_cast(lhs) / static_cast(rhs); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(floor(static_cast(lhs) / static_cast(rhs))) + { + return floor(static_cast(lhs) / static_cast(rhs)); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(static_cast(lhs) % static_cast(rhs)) + requires(cuda::std::is_integral_v) + { + return static_cast(lhs) % static_cast(rhs); + } + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(fmodf(static_cast(lhs), static_cast(rhs))) + requires(cuda::std::is_same_v) + { + return fmodf(static_cast(lhs), static_cast(rhs)); + } + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(fmod(static_cast(lhs), static_cast(rhs))) + requires(cuda::std::is_same_v) + { + return fmod(static_cast(lhs), static_cast(rhs)); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(((static_cast(lhs) % static_cast(rhs)) + + static_cast(rhs)) % + static_cast(rhs)) + requires(cuda::std::is_integral_v) + { + return ((static_cast(lhs) % static_cast(rhs)) + + static_cast(rhs)) % + static_cast(rhs); + } + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + + static_cast(rhs), + static_cast(rhs))) + requires(cuda::std::is_same_v) + { + return fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + + static_cast(rhs), + static_cast(rhs)); + } + + template > + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(fmod(fmod(static_cast(lhs), static_cast(rhs)) + + static_cast(rhs), + static_cast(rhs))) + requires(cuda::std::is_same_v) + { + return fmod(fmod(static_cast(lhs), static_cast(rhs)) + + static_cast(rhs), + static_cast(rhs)); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept + -> decltype(cuda::std::pow(lhs, rhs)) + { + return cuda::std::pow(lhs, rhs); + } +}; + +template <> +struct operator_functor 
{ + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs == rhs) + { + return lhs == rhs; + } +}; + +// Alias NULL_EQUAL = EQUAL in the non-nullable case. +template <> +struct operator_functor + : public operator_functor {}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs != rhs) + { + return lhs != rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs < rhs) + { + return lhs < rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs > rhs) + { + return lhs > rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs <= rhs) + { + return lhs <= rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs >= rhs) + { + return lhs >= rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs & rhs) + { + return lhs & rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs | rhs) + { + return lhs | rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs ^ rhs) + { + return lhs ^ rhs; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs && rhs) + { + return lhs && rhs; + } +}; + +// Alias NULL_LOGICAL_AND = LOGICAL_AND in the non-nullable case. +template <> +struct operator_functor + : public operator_functor {}; + +template <> +struct operator_functor { + static constexpr auto arity{2}; + + template + __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs || rhs) + { + return lhs || rhs; + } +}; + +// Alias NULL_LOGICAL_OR = LOGICAL_OR in the non-nullable case. 
+template <> +struct operator_functor + : public operator_functor {}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(input) + { + return input; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> bool + { + return false; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sin(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::sin(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cos(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::cos(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::tan(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::tan(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::asin(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::asin(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::acos(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::acos(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::atan(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::atan(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sinh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::sinh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cosh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::cosh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::tanh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::tanh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept + -> decltype(cuda::std::asinh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::asinh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept + -> decltype(cuda::std::acosh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::acosh(input); + } +}; + +template <> +struct 
operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept + -> decltype(cuda::std::atanh(input)) + requires(cuda::std::is_floating_point_v) + { + return cuda::std::atanh(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::exp(input)) + { + return cuda::std::exp(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::log(input)) + { + return cuda::std::log(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sqrt(input)) + { + return cuda::std::sqrt(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cbrt(input)) + { + return cuda::std::cbrt(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::ceil(input)) + { + return cuda::std::ceil(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept + -> decltype(cuda::std::floor(input)) + { + return cuda::std::floor(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + // Only accept signed or unsigned types (both require is_arithmetic to be true) + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::abs(input)) + requires(cuda::std::is_signed_v) + { + return cuda::std::abs(input); + } + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(input) + requires(cuda::std::is_unsigned_v) + { + return input; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::rint(input)) + { + return cuda::std::rint(input); + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(~input) + { + return ~input; + } +}; + +template <> +struct operator_functor { + static constexpr auto arity{1}; + + template + __device__ inline auto operator()(InputT input) const noexcept -> decltype(!input) + { + return !input; + } +}; + +template +struct cast { + static constexpr auto arity{1}; + template + __device__ inline auto operator()(From f) const noexcept -> To + requires(is_fixed_point()) + { + if constexpr (cuda::std::is_floating_point_v) { + return convert_fixed_to_floating(f); + } else { + return static_cast(f); + } + } + + template + __device__ inline auto operator()(From f) const noexcept -> decltype(static_cast(f)) + requires(!is_fixed_point()) + { + return static_cast(f); + } +}; + +template <> +struct operator_functor : cast {}; +template <> +struct operator_functor : cast {}; +template <> +struct operator_functor : cast {}; + +/* + * The default specialization of nullable operators is to fall back to the non-nullable + * implementation + 
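 *
 * (Illustrative example: operator_functor<ast_operator::ADD, true> applied to
 * two cuda::std::optional<int> inputs returns an engaged optional holding the
 * sum when both inputs are engaged, and an empty optional otherwise.)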
*/ +template +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept + -> possibly_null_value_t + requires(arity_placeholder == 2) + { + using Out = possibly_null_value_t; + return (lhs.has_value() && rhs.has_value()) ? Out{NonNullOperator{}(*lhs, *rhs)} : Out{}; + } + + template + __device__ inline auto operator()(Input const input) const noexcept + -> possibly_null_value_t + requires(arity_placeholder == 1) + { + using Out = possibly_null_value_t; + return input.has_value() ? Out{NonNullOperator{}(*input)} : Out{}; + } +}; + +// IS_NULL(null) is true, IS_NULL(valid) is false +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs) const noexcept -> bool + { + return !lhs.has_value(); + } +}; + +// NULL_EQUAL(null, null) is true, NULL_EQUAL(null, valid) is false, and NULL_EQUAL(valid, valid) == +// EQUAL(valid, valid) +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept + -> possibly_null_value_t + { + // Case 1: Neither is null, so the output is given by the operation. + if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } + // Case 2: Two nulls compare equal. + if (!lhs.has_value() && !rhs.has_value()) { return {true}; } + // Case 3: One value is null, while the other is not, so we return false. + return {false}; + } +}; + +///< NULL_LOGICAL_AND(null, null) is null, NULL_LOGICAL_AND(null, true) is null, +///< NULL_LOGICAL_AND(null, false) is false, and NULL_LOGICAL_AND(valid, valid) == +///< LOGICAL_AND(valid, valid) +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept + -> possibly_null_value_t + { + // Case 1: Neither is null, so the output is given by the operation. + if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } + // Case 2: Two nulls return null. + if (!lhs.has_value() && !rhs.has_value()) { return {}; } + // Case 3: One value is null, while the other is not. If it's true we return null, otherwise we + // return false. + auto const& valid_element = lhs.has_value() ? lhs : rhs; + if (*valid_element) { return {}; } + return {false}; + } +}; + +///< NULL_LOGICAL_OR(null, null) is null, NULL_LOGICAL_OR(null, true) is true, NULL_LOGICAL_OR(null, +///< false) is null, and NULL_LOGICAL_OR(valid, valid) == LOGICAL_OR(valid, valid) +template <> +struct operator_functor { + using NonNullOperator = operator_functor; + static constexpr auto arity = NonNullOperator::arity; + + template + __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept + -> possibly_null_value_t + { + // Case 1: Neither is null, so the output is given by the operation. + if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } + // Case 2: Two nulls return null. + if (!lhs.has_value() && !rhs.has_value()) { return {}; } + // Case 3: One value is null, while the other is not. If it's true we return true, otherwise we + // return null. 
+ auto const& valid_element = lhs.has_value() ? lhs : rhs; + if (*valid_element) { return {true}; } + return {}; + } +}; + +} // namespace ast::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/operators.cuh b/cpp/include/cudf/ast/detail/operators.cuh index 1a7f7357f7e..333b8be2de3 100644 --- a/cpp/include/cudf/ast/detail/operators.cuh +++ b/cpp/include/cudf/ast/detail/operators.cuh @@ -15,37 +15,20 @@ */ #pragma once +#include +#include +#include #include #include #include #include #include -#include -#include #include namespace CUDF_EXPORT cudf { namespace ast::detail { -// Type trait for wrapping nullable types in a cuda::std::optional. Non-nullable -// types are returned as is. -template -struct possibly_null_value; - -template -struct possibly_null_value { - using type = cuda::std::optional; -}; - -template -struct possibly_null_value { - using type = T; -}; - -template -using possibly_null_value_t = typename possibly_null_value::type; - // Traits for valid operator / type combinations template constexpr bool is_valid_binary_op = cuda::std::is_invocable_v; @@ -177,734 +160,5 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) ast_operator_dispatcher(ast_ope } } -/** - * @brief Operator functor. - * - * This functor is templated on an `ast_operator`, with each template specialization defining a - * callable `operator()` that executes the operation. The functor specialization also has a member - * `arity` defining the number of operands that are accepted by the call to `operator()`. The - * `operator()` is templated on the types of its inputs (e.g. `typename LHS` and `typename RHS` for - * a binary operator). Trailing return types are defined as `decltype(result)` where `result` is - * the returned value. The trailing return types allow SFINAE to only consider template - * instantiations for valid combinations of types. This, in turn, allows the operator functors to be - * used with traits like `is_valid_binary_op` that rely on `std::is_invocable` and related features. - * - * @tparam op AST operator. 
- */ -template -struct operator_functor {}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs + rhs) - { - return lhs + rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs - rhs) - { - return lhs - rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs * rhs) - { - return lhs * rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs / rhs) - { - return lhs / rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(static_cast(lhs) / static_cast(rhs)) - { - return static_cast(lhs) / static_cast(rhs); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(floor(static_cast(lhs) / static_cast(rhs))) - { - return floor(static_cast(lhs) / static_cast(rhs)); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(static_cast(lhs) % static_cast(rhs)) - requires(cuda::std::is_integral_v) - { - return static_cast(lhs) % static_cast(rhs); - } - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(fmodf(static_cast(lhs), static_cast(rhs))) - requires(cuda::std::is_same_v) - { - return fmodf(static_cast(lhs), static_cast(rhs)); - } - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(fmod(static_cast(lhs), static_cast(rhs))) - requires(cuda::std::is_same_v) - { - return fmod(static_cast(lhs), static_cast(rhs)); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(((static_cast(lhs) % static_cast(rhs)) + - static_cast(rhs)) % - static_cast(rhs)) - requires(cuda::std::is_integral_v) - { - return ((static_cast(lhs) % static_cast(rhs)) + - static_cast(rhs)) % - static_cast(rhs); - } - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + - static_cast(rhs), - static_cast(rhs))) - requires(cuda::std::is_same_v) - { - return fmodf(fmodf(static_cast(lhs), static_cast(rhs)) + - static_cast(rhs), - static_cast(rhs)); - } - - template > - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(fmod(fmod(static_cast(lhs), static_cast(rhs)) + - static_cast(rhs), - static_cast(rhs))) - requires(cuda::std::is_same_v) - { - return fmod(fmod(static_cast(lhs), static_cast(rhs)) + - static_cast(rhs), - static_cast(rhs)); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept - -> decltype(cuda::std::pow(lhs, rhs)) - { - return cuda::std::pow(lhs, rhs); - } -}; - -template <> -struct operator_functor 
{ - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs == rhs) - { - return lhs == rhs; - } -}; - -// Alias NULL_EQUAL = EQUAL in the non-nullable case. -template <> -struct operator_functor - : public operator_functor {}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs != rhs) - { - return lhs != rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs < rhs) - { - return lhs < rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs > rhs) - { - return lhs > rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs <= rhs) - { - return lhs <= rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs >= rhs) - { - return lhs >= rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs & rhs) - { - return lhs & rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs | rhs) - { - return lhs | rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs ^ rhs) - { - return lhs ^ rhs; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs && rhs) - { - return lhs && rhs; - } -}; - -// Alias NULL_LOGICAL_AND = LOGICAL_AND in the non-nullable case. -template <> -struct operator_functor - : public operator_functor {}; - -template <> -struct operator_functor { - static constexpr auto arity{2}; - - template - __device__ inline auto operator()(LHS lhs, RHS rhs) const noexcept -> decltype(lhs || rhs) - { - return lhs || rhs; - } -}; - -// Alias NULL_LOGICAL_OR = LOGICAL_OR in the non-nullable case. 
-template <> -struct operator_functor - : public operator_functor {}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(input) - { - return input; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> bool - { - return false; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sin(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::sin(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cos(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::cos(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::tan(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::tan(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::asin(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::asin(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::acos(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::acos(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::atan(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::atan(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sinh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::sinh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cosh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::cosh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::tanh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::tanh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept - -> decltype(cuda::std::asinh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::asinh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept - -> decltype(cuda::std::acosh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::acosh(input); - } -}; - -template <> -struct 
operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept - -> decltype(cuda::std::atanh(input)) - requires(cuda::std::is_floating_point_v) - { - return cuda::std::atanh(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::exp(input)) - { - return cuda::std::exp(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::log(input)) - { - return cuda::std::log(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::sqrt(input)) - { - return cuda::std::sqrt(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::cbrt(input)) - { - return cuda::std::cbrt(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::ceil(input)) - { - return cuda::std::ceil(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept - -> decltype(cuda::std::floor(input)) - { - return cuda::std::floor(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - // Only accept signed or unsigned types (both require is_arithmetic to be true) - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::abs(input)) - requires(cuda::std::is_signed_v) - { - return cuda::std::abs(input); - } - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(input) - requires(cuda::std::is_unsigned_v) - { - return input; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(cuda::std::rint(input)) - { - return cuda::std::rint(input); - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(~input) - { - return ~input; - } -}; - -template <> -struct operator_functor { - static constexpr auto arity{1}; - - template - __device__ inline auto operator()(InputT input) const noexcept -> decltype(!input) - { - return !input; - } -}; - -template -struct cast { - static constexpr auto arity{1}; - template - __device__ inline auto operator()(From f) const noexcept -> To - requires(is_fixed_point()) - { - if constexpr (cuda::std::is_floating_point_v) { - return convert_fixed_to_floating(f); - } else { - return static_cast(f); - } - } - - template - __device__ inline auto operator()(From f) const noexcept -> decltype(static_cast(f)) - requires(!is_fixed_point()) - { - return static_cast(f); - } -}; - -template <> -struct operator_functor : cast {}; -template <> -struct operator_functor : cast {}; -template <> -struct operator_functor : cast {}; - -/* - * The default specialization of nullable operators is to fall back to the non-nullable - * implementation - 
*/ -template -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept - -> possibly_null_value_t - requires(arity_placeholder == 2) - { - using Out = possibly_null_value_t; - return (lhs.has_value() && rhs.has_value()) ? Out{NonNullOperator{}(*lhs, *rhs)} : Out{}; - } - - template - __device__ inline auto operator()(Input const input) const noexcept - -> possibly_null_value_t - requires(arity_placeholder == 1) - { - using Out = possibly_null_value_t; - return input.has_value() ? Out{NonNullOperator{}(*input)} : Out{}; - } -}; - -// IS_NULL(null) is true, IS_NULL(valid) is false -template <> -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs) const noexcept -> bool - { - return !lhs.has_value(); - } -}; - -// NULL_EQUAL(null, null) is true, NULL_EQUAL(null, valid) is false, and NULL_EQUAL(valid, valid) == -// EQUAL(valid, valid) -template <> -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept - -> possibly_null_value_t - { - // Case 1: Neither is null, so the output is given by the operation. - if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } - // Case 2: Two nulls compare equal. - if (!lhs.has_value() && !rhs.has_value()) { return {true}; } - // Case 3: One value is null, while the other is not, so we return false. - return {false}; - } -}; - -///< NULL_LOGICAL_AND(null, null) is null, NULL_LOGICAL_AND(null, true) is null, -///< NULL_LOGICAL_AND(null, false) is false, and NULL_LOGICAL_AND(valid, valid) == -///< LOGICAL_AND(valid, valid) -template <> -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept - -> possibly_null_value_t - { - // Case 1: Neither is null, so the output is given by the operation. - if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } - // Case 2: Two nulls return null. - if (!lhs.has_value() && !rhs.has_value()) { return {}; } - // Case 3: One value is null, while the other is not. If it's true we return null, otherwise we - // return false. - auto const& valid_element = lhs.has_value() ? lhs : rhs; - if (*valid_element) { return {}; } - return {false}; - } -}; - -///< NULL_LOGICAL_OR(null, null) is null, NULL_LOGICAL_OR(null, true) is true, NULL_LOGICAL_OR(null, -///< false) is null, and NULL_LOGICAL_OR(valid, valid) == LOGICAL_OR(valid, valid) -template <> -struct operator_functor { - using NonNullOperator = operator_functor; - static constexpr auto arity = NonNullOperator::arity; - - template - __device__ inline auto operator()(LHS const lhs, RHS const rhs) const noexcept - -> possibly_null_value_t - { - // Case 1: Neither is null, so the output is given by the operation. - if (lhs.has_value() && rhs.has_value()) { return {NonNullOperator{}(*lhs, *rhs)}; } - // Case 2: Two nulls return null. - if (!lhs.has_value() && !rhs.has_value()) { return {}; } - // Case 3: One value is null, while the other is not. If it's true we return true, otherwise we - // return null. 
- auto const& valid_element = lhs.has_value() ? lhs : rhs; - if (*valid_element) { return {true}; } - return {}; - } -}; - } // namespace ast::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 61bee5b479b..f12148c9ef8 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -47,6 +47,8 @@ cudf::data_type ast_operator_return_type(ast_operator op, */ cudf::size_type ast_operator_arity(ast_operator op); +std::string_view ast_operator_string(ast_operator op); + } // namespace ast::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/possibly_null.cuh b/cpp/include/cudf/ast/detail/possibly_null.cuh new file mode 100644 index 00000000000..d30b914666f --- /dev/null +++ b/cpp/include/cudf/ast/detail/possibly_null.cuh @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace CUDF_EXPORT cudf { + +namespace ast::detail { + +// Type trait for wrapping nullable types in a cuda::std::optional. Non-nullable +// types are returned as is. +template +struct possibly_null_value; + +template +struct possibly_null_value { + using type = cuda::std::optional; +}; + +template +struct possibly_null_value { + using type = T; +}; + +template +using possibly_null_value_t = typename possibly_null_value::type; + +} // namespace ast::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 4c5601be856..6e084f9fb2d 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -90,73 +91,6 @@ struct expression { virtual ~expression() {} }; -/** - * @brief Enum of supported operators. 
- */ -enum class ast_operator : int32_t { - // Binary operators - ADD, ///< operator + - SUB, ///< operator - - MUL, ///< operator * - DIV, ///< operator / using common type of lhs and rhs - TRUE_DIV, ///< operator / after promoting type to floating point - FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then - ///< flooring the result - MOD, ///< operator % - PYMOD, ///< operator % using Python's sign rules for negatives - POW, ///< lhs ^ rhs - EQUAL, ///< operator == - NULL_EQUAL, ///< operator == with Spark rules: NULL_EQUAL(null, null) is true, NULL_EQUAL(null, - ///< valid) is false, and - ///< NULL_EQUAL(valid, valid) == EQUAL(valid, valid) - NOT_EQUAL, ///< operator != - LESS, ///< operator < - GREATER, ///< operator > - LESS_EQUAL, ///< operator <= - GREATER_EQUAL, ///< operator >= - BITWISE_AND, ///< operator & - BITWISE_OR, ///< operator | - BITWISE_XOR, ///< operator ^ - LOGICAL_AND, ///< operator && - NULL_LOGICAL_AND, ///< operator && with Spark rules: NULL_LOGICAL_AND(null, null) is null, - ///< NULL_LOGICAL_AND(null, true) is - ///< null, NULL_LOGICAL_AND(null, false) is false, and NULL_LOGICAL_AND(valid, - ///< valid) == LOGICAL_AND(valid, valid) - LOGICAL_OR, ///< operator || - NULL_LOGICAL_OR, ///< operator || with Spark rules: NULL_LOGICAL_OR(null, null) is null, - ///< NULL_LOGICAL_OR(null, true) is true, - ///< NULL_LOGICAL_OR(null, false) is null, and NULL_LOGICAL_OR(valid, valid) == - ///< LOGICAL_OR(valid, valid) - // Unary operators - IDENTITY, ///< Identity function - IS_NULL, ///< Check if operand is null - SIN, ///< Trigonometric sine - COS, ///< Trigonometric cosine - TAN, ///< Trigonometric tangent - ARCSIN, ///< Trigonometric sine inverse - ARCCOS, ///< Trigonometric cosine inverse - ARCTAN, ///< Trigonometric tangent inverse - SINH, ///< Hyperbolic sine - COSH, ///< Hyperbolic cosine - TANH, ///< Hyperbolic tangent - ARCSINH, ///< Hyperbolic sine inverse - ARCCOSH, ///< Hyperbolic cosine inverse - ARCTANH, ///< Hyperbolic tangent inverse - EXP, ///< Exponential (base e, Euler number) - LOG, ///< Natural Logarithm (base e) - SQRT, ///< Square-root (x^0.5) - CBRT, ///< Cube-root (x^(1.0/3)) - CEIL, ///< Smallest integer value not less than arg - FLOOR, ///< largest integer value not greater than arg - ABS, ///< Absolute value - RINT, ///< Rounds the floating-point argument arg to an integer value - BIT_INVERT, ///< Bitwise Not (~) - NOT, ///< Logical Not (!) - CAST_TO_INT64, ///< Cast value to int64_t - CAST_TO_UINT64, ///< Cast value to uint64_t - CAST_TO_FLOAT64 ///< Cast value to double -}; - /** * @brief Enum of table references. * @@ -317,6 +251,13 @@ class literal : public expression { */ [[nodiscard]] generic_scalar_device_view get_value() const { return value; } + /** + * @brief Get the scalar. + * + * @return The scalar object + */ + [[nodiscard]] cudf::scalar const& get_scalar() const { return scalar; } + /** * @copydoc expression::accept */ diff --git a/cpp/include/cudf/fixed_point/conv.hpp b/cpp/include/cudf/fixed_point/conv.hpp new file mode 100644 index 00000000000..3ccb876d009 --- /dev/null +++ b/cpp/include/cudf/fixed_point/conv.hpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace CUDF_EXPORT cudf { +/** + * @addtogroup fixed_point_classes + * @{ + * @file + * @brief Conversion functions for fixed-point numbers + */ + +/** + * @brief Convert a floating-point value to fixed point + * + * @note This conversion was moved from fixed-point member functions to free functions. + * This is so that the complex conversion code is not included into many parts of the + * code base that don't need it, and so that it's more obvious to pinpoint where these + * conversions are occurring. + * + * @tparam Fixed The fixed-point type to convert to + * @tparam Floating The floating-point type to convert from + * @param floating The floating-point value to convert + * @param scale The desired scale of the fixed-point value + * @return The converted fixed-point value + */ +template && is_fixed_point())> +CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale) +{ + using Rep = typename Fixed::rep; + auto const value = [&]() { + if constexpr (Fixed::rad == numeric::Radix::BASE_10) { + return numeric::detail::convert_floating_to_integral(floating, scale); + } else { + return static_cast(numeric::detail::shift(floating, scale)); + } + }(); + + return Fixed(numeric::scaled_integer{value, scale}); +} + +/** + * @brief Convert a fixed-point value to floating point + * + * @note This conversion was moved from fixed-point member functions to free functions. + * This is so that the complex conversion code is not included into many parts of the + * code base that don't need it, and so that it's more obvious to pinpoint where these + * conversions are occurring. 
+ * + * @tparam Floating The floating-point type to convert to + * @tparam Fixed The fixed-point type to convert from + * @param fixed The fixed-point value to convert + * @return The converted floating-point value + */ +template && is_fixed_point())> +CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) +{ + using Rep = typename Fixed::rep; + if constexpr (Fixed::rad == numeric::Radix::BASE_10) { + return numeric::detail::convert_integral_to_floating(fixed.value(), fixed.scale()); + } else { + auto const casted = static_cast(fixed.value()); + auto const scale = numeric::scale_type{-fixed.scale()}; + return numeric::detail::shift(casted, scale); + } +} + +/** + * @brief Convert a value to floating point + * + * @tparam Floating The floating-point type to convert to + * @tparam Input The input type to convert from + * @param input The input value to convert + * @return The converted floating-point value + */ +template )> +CUDF_HOST_DEVICE Floating convert_to_floating(Input input) +{ + if constexpr (is_fixed_point()) { + return convert_fixed_to_floating(input); + } else { + return static_cast(input); + } +} + +/** @} */ // end of group +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp index 21d2a9a7d3a..282880e4023 100644 --- a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp @@ -38,27 +38,27 @@ namespace detail { * @return The number of significant bits: the # of bits - # of leading zeroes */ template || std::is_same_v || - std::is_same_v)> + CUDF_ENABLE_IF(cuda::std::is_same_v || cuda::std::is_same_v || + cuda::std::is_same_v)> CUDF_HOST_DEVICE inline constexpr int count_significant_bits(T value) { #ifdef __CUDA_ARCH__ - if constexpr (std::is_same_v) { + if constexpr (cuda::std::is_same_v) { return 64 - __clzll(static_cast(value)); - } else if constexpr (std::is_same_v) { + } else if constexpr (cuda::std::is_same_v) { return 32 - __clz(static_cast(value)); - } else if constexpr (std::is_same_v) { + } else if constexpr (cuda::std::is_same_v) { // 128 bit type, must break up into high and low components auto const high_bits = static_cast(value >> 64); auto const low_bits = static_cast(value); return 128 - (__clzll(high_bits) + static_cast(high_bits == 0) * __clzll(low_bits)); } #else - if constexpr (std::is_same_v) { + if constexpr (cuda::std::is_same_v) { return 64 - cuda::std::countl_zero(value); - } else if constexpr (std::is_same_v) { + } else if constexpr (cuda::std::is_same_v) { return 32 - cuda::std::countl_zero(value); - } else if constexpr (std::is_same_v) { + } else if constexpr (cuda::std::is_same_v) { // 128 bit type, must break up into high and low components auto const high_bits = static_cast(value >> 64); if (high_bits == 0) { @@ -263,7 +263,7 @@ struct floating_converter { auto integer_rep = bit_cast_to_integer(floating); // Extract the currently stored (biased) exponent - using SignedType = std::make_signed_t; + using SignedType = cuda::std::make_signed_t; auto exponent_bits = integer_rep & exponent_mask; auto stored_pow2 = static_cast(exponent_bits >> num_stored_mantissa_bits); @@ -550,13 +550,13 @@ struct shifting_constants { static constexpr bool is_double = cuda::std::is_same_v; /// Integer type that can hold the value of the significand - using IntegerRep = std::conditional_t; + using IntegerRep = cuda::std::conditional_t; /// Num bits needed to hold the significand 
static constexpr auto num_significand_bits = cuda::std::numeric_limits::digits; /// Shift data back and forth in space of a type with 2x the starting bits, to give us enough room - using ShiftingRep = std::conditional_t; + using ShiftingRep = cuda::std::conditional_t; // The significand of a float / double is 24 / 53 bits // However, to uniquely represent each double / float as different #'s in decimal diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index ea9457296f6..77cc808a8ed 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -34,85 +34,6 @@ namespace CUDF_EXPORT cudf { * @brief Column APIs for unary ops */ -/** - * @brief Convert a floating-point value to fixed point - * - * @note This conversion was moved from fixed-point member functions to free functions. - * This is so that the complex conversion code is not included into many parts of the - * code base that don't need it, and so that it's more obvious to pinpoint where these - * conversions are occurring. - * - * @tparam Fixed The fixed-point type to convert to - * @tparam Floating The floating-point type to convert from - * @param floating The floating-point value to convert - * @param scale The desired scale of the fixed-point value - * @return The converted fixed-point value - */ -template && is_fixed_point())> -CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale) -{ - using Rep = typename Fixed::rep; - auto const value = [&]() { - if constexpr (Fixed::rad == numeric::Radix::BASE_10) { - return numeric::detail::convert_floating_to_integral(floating, scale); - } else { - return static_cast(numeric::detail::shift(floating, scale)); - } - }(); - - return Fixed(numeric::scaled_integer{value, scale}); -} - -/** - * @brief Convert a fixed-point value to floating point - * - * @note This conversion was moved from fixed-point member functions to free functions. - * This is so that the complex conversion code is not included into many parts of the - * code base that don't need it, and so that it's more obvious to pinpoint where these - * conversions are occurring. - * - * @tparam Floating The floating-point type to convert to - * @tparam Fixed The fixed-point type to convert from - * @param fixed The fixed-point value to convert - * @return The converted floating-point value - */ -template && is_fixed_point())> -CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) -{ - using Rep = typename Fixed::rep; - if constexpr (Fixed::rad == numeric::Radix::BASE_10) { - return numeric::detail::convert_integral_to_floating(fixed.value(), fixed.scale()); - } else { - auto const casted = static_cast(fixed.value()); - auto const scale = numeric::scale_type{-fixed.scale()}; - return numeric::detail::shift(casted, scale); - } -} - -/** - * @brief Convert a value to floating point - * - * @tparam Floating The floating-point type to convert to - * @tparam Input The input type to convert from - * @param input The input value to convert - * @return The converted floating-point value - */ -template )> -CUDF_HOST_DEVICE Floating convert_to_floating(Input input) -{ - if constexpr (is_fixed_point()) { - return convert_fixed_to_floating(input); - } else { - return static_cast(input); - } -} - /** * @brief Types of unary operations that can be performed on data. 
*/ diff --git a/cpp/src/ast/operators.cpp b/cpp/src/ast/operators.cpp index 0fa548f4d90..655d011b81c 100644 --- a/cpp/src/ast/operators.cpp +++ b/cpp/src/ast/operators.cpp @@ -272,6 +272,63 @@ cudf::size_type ast_operator_arity(ast_operator op) return result; } +std::string_view ast_operator_string(ast_operator op) +{ + switch (op) { + case ast_operator::ADD: return "ADD"; + case ast_operator::SUB: return "SUB"; + case ast_operator::MUL: return "MUL"; + case ast_operator::DIV: return "DIV"; + case ast_operator::TRUE_DIV: return "TRUE_DIV"; + case ast_operator::FLOOR_DIV: return "FLOOR_DIV"; + case ast_operator::MOD: return "MOD"; + case ast_operator::PYMOD: return "PYMOD"; + case ast_operator::POW: return "POW"; + case ast_operator::EQUAL: return "EQUAL"; + case ast_operator::NULL_EQUAL: return "NULL_EQUAL"; + case ast_operator::NOT_EQUAL: return "NOT_EQUAL"; + case ast_operator::LESS: return "LESS"; + case ast_operator::GREATER: return "GREATER"; + case ast_operator::LESS_EQUAL: return "LESS_EQUAL"; + case ast_operator::GREATER_EQUAL: return "GREATER_EQUAL"; + case ast_operator::BITWISE_AND: return "BITWISE_AND"; + case ast_operator::BITWISE_OR: return "BITWISE_OR"; + case ast_operator::BITWISE_XOR: return "BITWISE_XOR"; + case ast_operator::LOGICAL_AND: return "LOGICAL_AND"; + case ast_operator::NULL_LOGICAL_AND: return "NULL_LOGICAL_AND"; + case ast_operator::LOGICAL_OR: return "LOGICAL_OR"; + case ast_operator::NULL_LOGICAL_OR: return "NULL_LOGICAL_OR"; + case ast_operator::IDENTITY: return "IDENTITY"; + case ast_operator::IS_NULL: return "IS_NULL"; + case ast_operator::SIN: return "SIN"; + case ast_operator::COS: return "COS"; + case ast_operator::TAN: return "TAN"; + case ast_operator::ARCSIN: return "ARCSIN"; + case ast_operator::ARCCOS: return "ARCCOS"; + case ast_operator::ARCTAN: return "ARCTAN"; + case ast_operator::SINH: return "SINH"; + case ast_operator::COSH: return "COSH"; + case ast_operator::TANH: return "TANH"; + case ast_operator::ARCSINH: return "ARCSINH"; + case ast_operator::ARCCOSH: return "ARCCOSH"; + case ast_operator::ARCTANH: return "ARCTANH"; + case ast_operator::EXP: return "EXP"; + case ast_operator::LOG: return "LOG"; + case ast_operator::SQRT: return "SQRT"; + case ast_operator::CBRT: return "CBRT"; + case ast_operator::CEIL: return "CEIL"; + case ast_operator::FLOOR: return "FLOOR"; + case ast_operator::ABS: return "ABS"; + case ast_operator::RINT: return "RINT"; + case ast_operator::BIT_INVERT: return "BIT_INVERT"; + case ast_operator::NOT: return "NOT"; + case ast_operator::CAST_TO_INT64: return "CAST_TO_INT64"; + case ast_operator::CAST_TO_UINT64: return "CAST_TO_UINT64"; + case ast_operator::CAST_TO_FLOAT64: return "CAST_TO_FLOAT64"; + default: CUDF_FAIL("Unrecognized operator type."); + } +} + } // namespace detail } // namespace ast diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 2f255e7a07c..1b1ecf9a19c 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp index 52fe5356361..18ff0774600 100644 --- a/cpp/src/quantiles/quantiles_util.hpp +++ b/cpp/src/quantiles/quantiles_util.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 
d28c04f423a..617c33c7502 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -30,6 +30,7 @@
 #include
 #include
 #include
+#include <cudf/fixed_point/conv.hpp>
 #include
 #include
 #include
diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index d8ea857e4af..7327f0d8614 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <cudf/fixed_point/conv.hpp>
 #include
 #include
 #include
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index f8f8d525043..f1724eb9195 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include
 #include
+#include <cudf/fixed_point/conv.hpp>
 #include
 #include

From 954599faf1acdde45e85500fe35fcb7354df5ea1 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Wed, 13 Aug 2025 10:20:25 -0700
Subject: [PATCH 120/366] Support hash-based workflow for `M2` groupby aggregation (#19569)

This implements the hash-based workflow for `M2` groupby aggregation. A
benchmark is also added, showing that the hash-based approach can improve
performance over the sort-based approach by up to 40 percent.

Since the output is generated using a hash table, the order of keys/values
is undefined and thus may not be the same as in the input.

---
Benchmark:
```
## [0] Quadro RTX 6000

| T | value_key_ratio | num_rows | null_probability | Ref Time | Ref Noise | Cmp Time | Cmp Noise | Diff | %Diff | Status |
|-----|-------------------|------------|--------------------|------------|-------------|------------|-------------|--------------|---------|----------|
| I32 | 10 | 10000 | 0 | 215.940 us | 9.98% | 156.946 us | 8.62% | -58.993 us | -27.32% | FAST |
| I32 | 30 | 10000 | 0 | 198.239 us | 1.32% | 149.199 us | 5.59% | -49.039 us | -24.74% | FAST |
| I32 | 100 | 10000 | 0 | 196.804 us | 3.81% | 145.584 us | 4.77% | -51.220 us | -26.03% | FAST |
| I32 | 10 | 1000000 | 0 | 626.309 us | 1.10% | 635.264 us | 1.85% | 8.955 us | 1.43% | SLOW |
| I32 | 30 | 1000000 | 0 | 617.688 us | 1.15% | 477.760 us | 2.14% | -139.929 us | -22.65% | FAST |
| I32 | 100 | 1000000 | 0 | 612.476 us | 1.23% | 381.148 us | 1.72% | -231.328 us | -37.77% | FAST |
| I32 | 10 | 10000000 | 0 | 7.335 ms | 0.78% | 8.007 ms | 0.64% | 671.021 us | 9.15% | SLOW |
| I32 | 30 | 10000000 | 0 | 7.140 ms | 0.78% | 7.845 ms | 0.59% | 704.838 us | 9.87% | SLOW |
| I32 | 100 | 10000000 | 0 | 6.975 ms | 0.78% | 6.285 ms | 0.77% | -690.236 us | -9.90% | FAST |
| I32 | 10 | 10000 | 0.1 | 288.683 us | 1.84% | 179.761 us | 4.03% | -108.923 us | -37.73% | FAST |
| I32 | 30 | 10000 | 0.1 | 287.724 us | 2.13% | 177.806 us | 2.02% | -109.918 us | -38.20% | FAST |
| I32 | 100 | 10000 | 0.1 | 285.990 us | 2.53% | 174.958 us | 3.17% | -111.033 us | -38.82% | FAST |
| I32 | 10 | 1000000 | 0.1 | 768.166 us | 1.03% | 666.330 us | 1.69% | -101.836 us | -13.26% | FAST |
| I32 | 30 | 1000000 | 0.1 | 753.631 us | 1.69% | 514.416 us | 2.36% | -239.214 us | -31.74% | FAST |
| I32 | 100 | 1000000 | 0.1 | 747.192 us | 1.01% | 417.477 us | 1.86% | -329.715 us | -44.13% | FAST |
| I32 | 10 | 10000000 | 0.1 | 7.948 ms | 0.80% | 7.958 ms | 0.55% | 9.830 us | 0.12% | SAME |
| I32 | 30 | 10000000 | 0.1 | 7.704 ms | 0.74% | 7.616 ms | 0.51% | -87.225 us | -1.13% | FAST |
| I32 | 100 | 10000000 | 0.1 | 7.538 ms | 0.80% | 6.068 ms | 0.78% | -1469.687 us | -19.50% | FAST |
| I32 | 10 | 10000 | 0.9 | 289.933 us | 3.40% | 172.943 us | 2.03% | -116.989 us | -40.35% | FAST |
| I32 | 30 | 10000 | 0.9 | 289.697 us | 3.85% | 171.887 us | 2.65% | -117.811 us | -40.67% | FAST |
| I32 | 100 | 10000 | 0.9 | 286.871 us | 2.91% | 168.887 us | 1.81% | -117.984 us | -41.13% | FAST |
| I32 | 10 | 1000000 | 0.9 | 759.703 us | 1.84% | 469.550 us | 2.02% | -290.153 us | -38.19% | FAST |
| I32 | 30 | 1000000 | 0.9 | 749.905 us | 4.00% | 405.214 us | 1.81% | -344.691 us | -45.96% | FAST |
| I32 | 100 | 1000000 | 0.9 | 743.373 us | 2.09% | 370.548 us | 1.76% | -372.825 us | -50.15% | FAST |
| I32 | 10 | 10000000 | 0.9 | 7.901 ms | 0.69% | 4.686 ms | 1.03% | -3214.663 us | -40.69% | FAST |
| I32 | 30 | 10000000 | 0.9 | 7.663 ms | 0.79% | 4.038 ms | 1.30% | -3624.460 us | -47.30% | FAST |
| I32 | 100 | 10000000 | 0.9 | 7.483 ms | 0.82% | 3.183 ms | 1.46% | -4299.710 us | -57.46% | FAST |
| F64 | 10 | 10000 | 0 | 206.186 us | 1.98% | 152.118 us | 2.07% | -54.068 us | -26.22% | FAST |
| F64 | 30 | 10000 | 0 | 206.396 us | 3.77% | 151.780 us | 3.92% | -54.616 us | -26.46% | FAST |
| F64 | 100 | 10000 | 0 | 202.879 us | 1.54% | 148.957 us | 3.89% | -53.922 us | -26.58% | FAST |
| F64 | 10 | 1000000 | 0 | 678.915 us | 1.96% | 662.225 us | 1.68% | -16.690 us | -2.46% | FAST |
| F64 | 30 | 1000000 | 0 | 665.775 us | 1.27% | 515.827 us | 1.72% | -149.948 us | -22.52% | FAST |
| F64 | 100 | 1000000 | 0 | 660.599 us | 1.73% | 424.674 us | 1.79% | -235.925 us | -35.71% | FAST |
| F64 | 10 | 10000000 | 0 | 7.677 ms | 0.76% | 8.056 ms | 0.59% | 378.792 us | 4.93% | SLOW |
| F64 | 30 | 10000000 | 0 | 7.466 ms | 1.05% | 7.851 ms | 0.50% | 385.055 us | 5.16% | SLOW |
| F64 | 100 | 10000000 | 0 | 7.285 ms | 0.92% | 6.317 ms | 0.71% | -968.127 us | -13.29% | FAST |
| F64 | 10 | 10000 | 0.1 | 299.562 us | 3.55% | 183.848 us | 5.22% | -115.714 us | -38.63% | FAST |
| F64 | 30 | 10000 | 0.1 | 299.092 us | 3.72% | 182.238 us | 2.39% | -116.854 us | -39.07% | FAST |
| F64 | 100 | 10000 | 0.1 | 296.364 us | 3.12% | 179.645 us | 2.52% | -116.719 us | -39.38% | FAST |
| F64 | 10 | 1000000 | 0.1 | 811.874 us | 1.19% | 698.105 us | 1.64% | -113.769 us | -14.01% | FAST |
| F64 | 30 | 1000000 | 0.1 | 795.446 us | 1.93% | 553.580 us | 2.28% | -241.867 us | -30.41% | FAST |
| F64 | 100 | 1000000 | 0.1 | 801.054 us | 4.61% | 456.706 us | 1.58% | -344.348 us | -42.99% | FAST |
| F64 | 10 | 10000000 | 0.1 | 8.174 ms | 0.88% | 8.025 ms | 0.50% | -149.418 us | -1.83% | FAST |
| F64 | 30 | 10000000 | 0.1 | 7.929 ms | 0.80% | 7.693 ms | 0.58% | -235.359 us | -2.97% | FAST |
| F64 | 100 | 10000000 | 0.1 | 7.736 ms | 0.80% | 6.178 ms | 0.73% | -1557.934 us | -20.14% | FAST |
| F64 | 10 | 10000 | 0.9 | 294.503 us | 1.49% | 176.414 us | 9.54% | -118.089 us | -40.10% | FAST |
| F64 | 30 | 10000 | 0.9 | 298.363 us | 6.18% | 175.130 us | 2.80% | -123.233 us | -41.30% | FAST |
| F64 | 100 | 10000 | 0.9 | 292.613 us | 1.48% | 171.707 us | 2.15% | -120.906 us | -41.32% | FAST |
| F64 | 10 | 1000000 | 0.9 | 792.239 us | 1.35% | 482.531 us | 1.84% | -309.708 us | -39.09% | FAST |
| F64 | 30 | 1000000 | 0.9 | 776.355 us | 1.14% | 422.448 us | 1.78% | -353.907 us | -45.59% | FAST |
| F64 | 100 | 1000000 | 0.9 | 771.947 us | 1.58% | 391.754 us | 2.64% | -380.193 us | -49.25% | FAST |
| F64 | 10 | 10000000 | 0.9 | 8.051 ms | 0.76% | 4.735 ms | 1.10% | -3315.632 us | -41.18% | FAST |
| F64 | 30 | 10000000 | 0.9 | 7.822 ms | 0.75% | 4.110 ms | 1.41% | -3711.202 us | -47.45% | FAST |
| F64 | 100 | 10000000 | 0.9 | 7.645 ms | 0.80% | 3.254 ms | 1.47% | -4390.196 us | -57.43% | FAST |
```

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)
  - Yunsong Wang (https://github.com/PointKernel)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/19569
---
 cpp/benchmarks/CMakeLists.txt                 |  1 +
 cpp/benchmarks/groupby/group_m2.cpp           | 83 +++++++++++++++++
 .../cudf/detail/aggregation/aggregation.cuh   |  4 +
 cpp/src/aggregation/aggregation.cu            |  2 +-
 .../groupby/hash/flatten_single_pass_aggs.cpp | 13 ++-
 cpp/src/groupby/hash/groupby.cu               | 60 +++++------
 .../hash/hash_compound_agg_finalizer.cu       | 35 ++++++-
 .../hash/hash_compound_agg_finalizer.hpp      |  4 +-
 ...ar_hash_functor.cuh => m2_var_functor.cuh} | 91 ++++++++++++++---
 cpp/tests/groupby/m2_tests.cpp                | 42 +++++++--
 .../test/java/ai/rapids/cudf/TableTest.java   | 12 ++-
 11 files changed, 282 insertions(+), 65 deletions(-)
 create mode 100644 cpp/benchmarks/groupby/group_m2.cpp
 rename cpp/src/groupby/hash/{var_hash_functor.cuh => m2_var_functor.cuh} (58%)
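For context, `M2` is the sum of squared deviations from the group mean, M2 = Σ(x − mean)², the running quantity maintained by Welford-style variance updates. The hash-based workflow decomposes it into single-pass partials (SUM and COUNT_VALID per group) plus a second pass that accumulates squared differences, which is what the new `m2_hash_functor` does on the device. The sketch below is a plain host-side C++ illustration of that two-pass decomposition, not the CUDA kernel; the `partials` struct and `group_m2` helper are hypothetical names used only for this example.

```cpp
// Illustrative host-side sketch of the two-pass M2 computation used by the
// hash-based groupby path. The real implementation updates device columns
// atomically; this version uses an ordinary hash map.
#include <cstddef>
#include <unordered_map>
#include <vector>

struct partials {
  double sum{0.0};       // SUM partial
  std::size_t count{0};  // COUNT_VALID partial
  double m2{0.0};        // M2 result
};

// keys/values are hypothetical flat inputs standing in for grouped columns.
std::unordered_map<int, partials> group_m2(std::vector<int> const& keys,
                                           std::vector<double> const& values)
{
  std::unordered_map<int, partials> groups;
  // Pass 1: single-pass aggregations (SUM and COUNT_VALID per group).
  for (std::size_t i = 0; i < keys.size(); ++i) {
    auto& g = groups[keys[i]];
    g.sum += values[i];
    ++g.count;
  }
  // Pass 2: accumulate squared deviations from each group's mean, mirroring
  // m2_hash_functor's per-row update M2 += (x - mean)^2.
  for (std::size_t i = 0; i < keys.size(); ++i) {
    auto& g = groups[keys[i]];
    auto const mean = g.sum / static_cast<double>(g.count);
    auto const diff = values[i] - mean;
    g.m2 += diff * diff;
  }
  return groups;
}
```

This decomposition is also why the finalizer below visits SUM and COUNT_VALID first: once those partials exist in the hash table, M2 needs only one more pass over the input rows rather than a sort.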
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 21f56e1331c..e111b6395e9 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -262,6 +262,7 @@ ConfigureBench(
 ConfigureNVBench(
   GROUPBY_NVBENCH
   groupby/group_histogram.cpp
+  groupby/group_m2.cpp
   groupby/group_max.cpp
   groupby/group_max_multithreaded.cpp
   groupby/group_nunique.cpp
diff --git a/cpp/benchmarks/groupby/group_m2.cpp b/cpp/benchmarks/groupby/group_m2.cpp
new file mode 100644
index 00000000000..be907e9e343
--- /dev/null
+++ b/cpp/benchmarks/groupby/group_m2.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/groupby.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void groupby_m2_helper(nvbench::state& state,
+                       cudf::size_type num_rows,
+                       cudf::size_type value_key_ratio,
+                       double null_probability)
+{
+  auto const keys = [&] {
+    data_profile const profile =
+      data_profile_builder()
+        .cardinality(num_rows / value_key_ratio)
+        .no_validity()
+        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
+    return create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
+  }();
+
+  auto const values = [&] {
+    auto builder = data_profile_builder().cardinality(0).distribution(
+      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
+    if (null_probability > 0) {
+      builder.null_probability(null_probability);
+    } else {
+      builder.no_validity();
+    }
+    return create_random_column(
+      cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
+  }();
+
+  // Vector of 1 request
+  std::vector<cudf::groupby::aggregation_request> requests(1);
+  requests.back().values = values->view();
+  requests.back().aggregations.push_back(
+    cudf::make_m2_aggregation<cudf::groupby_aggregation>());
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    auto gb_obj       = cudf::groupby::groupby(cudf::table_view({keys->view()}));
+    auto const result = gb_obj.aggregate(requests);
+  });
+
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time, "rows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+template <typename Type>
+void bench_groupby_m2(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const value_key_ratio = static_cast<cudf::size_type>(state.get_int64("value_key_ratio"));
+  auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const null_probability = state.get_float64("null_probability");
+
+  groupby_m2_helper<Type>(state, num_rows, value_key_ratio, null_probability);
+}
+
+NVBENCH_BENCH_TYPES(bench_groupby_m2, NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, double>))
+  .set_name("groupby_m2")
+  .add_int64_axis("value_key_ratio", {10, 30, 100})
+  .add_int64_axis("num_rows", {10'000, 1'000'000, 10'000'000})
+  .add_float64_axis("null_probability", {0, 0.1, 0.9});
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 2124a131c19..92f1aeea572 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -90,6 +90,10 @@ struct corresponding_operator {
   using type = DeviceSum;
 };
 template <>
+struct corresponding_operator<aggregation::M2> {
+  using type = DeviceSum;
+};
+template <>
 struct corresponding_operator {
   using type = DeviceSum;
 };
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index c58d1f7af7c..89c96c877e4 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -53,7 +53,7 @@ struct identity_initializer {
       (k == aggregation::SUM or k == aggregation::MIN or k == aggregation::MAX or
        k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL or
        k == aggregation::ARGMAX or k == aggregation::ARGMIN or
-       k == aggregation::SUM_OF_SQUARES or k == aggregation::STD or
+       k == aggregation::SUM_OF_SQUARES or k == aggregation::M2 or k == aggregation::STD or
        k == aggregation::VARIANCE or
        (k == aggregation::PRODUCT and is_product_supported()))) or
      (k ==
aggregation::SUM_WITH_OVERFLOW and std::is_same_v); diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp index a533f7a6448..8e64560d246 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,6 +68,17 @@ class groupby_simple_aggregations_collector final return aggs; } + std::vector> visit(data_type, + cudf::detail::m2_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + std::vector> visit(data_type, cudf::detail::var_aggregation const&) override { diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index b9f08c2c505..155c2fee9ce 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -33,44 +32,38 @@ #include -#include #include +#include #include #include namespace cudf::groupby::detail::hash { namespace { /** - * @brief List of aggregation operations that can be computed with a hash-based - * implementation. + * @brief List of aggregation operations that can be computed with a hash-based implementation. + * + * For single pass aggregations, the supported operations are the ones that can be atomically + * updated: SUM, SUM_WITH_OVERFLOW, SUM_OF_SQUARES, PRODUCT, MIN, MAX, COUNT_VALID, COUNT_ALL. + * For compound aggregations, the supported operations are the ones that depends on the single pass + * aggregations above: ARGMIN(MIN), ARGMAX(MAX), MEAN(SUM, COUNT_VALID), M2/STD/VARIANCE(M2, + * COUNT_VALID). */ -constexpr std::array hash_aggregations{aggregation::SUM, - aggregation::SUM_WITH_OVERFLOW, - aggregation::PRODUCT, - aggregation::MIN, - aggregation::MAX, - aggregation::COUNT_VALID, - aggregation::COUNT_ALL, - aggregation::ARGMIN, - aggregation::ARGMAX, - aggregation::SUM_OF_SQUARES, - aggregation::MEAN, - aggregation::STD, - aggregation::VARIANCE}; - -// Could be hash: SUM, PRODUCT, MIN, MAX, COUNT_VALID, COUNT_ALL, ANY, ALL, -// Compound: MEAN(SUM, COUNT_VALID), VARIANCE, STD(MEAN (SUM, COUNT_VALID), COUNT_VALID), -// ARGMAX, ARGMIN - -// TODO replace with std::find in C++20 onwards. 
-template -constexpr bool array_contains(std::array const& haystack, T needle) -{ - for (auto const& val : haystack) { - if (val == needle) return true; - } - return false; -} +const auto hash_aggregations = std::unordered_set{// Single pass aggregations: + aggregation::SUM, + aggregation::SUM_WITH_OVERFLOW, + aggregation::SUM_OF_SQUARES, + aggregation::PRODUCT, + aggregation::MIN, + aggregation::MAX, + aggregation::COUNT_VALID, + aggregation::COUNT_ALL, + // Compound aggregations: + aggregation::ARGMIN, + aggregation::ARGMAX, + aggregation::MEAN, + aggregation::M2, + aggregation::STD, + aggregation::VARIANCE}; /** * @brief Indicates whether the specified aggregation operation can be computed @@ -80,10 +73,7 @@ constexpr bool array_contains(std::array const& haystack, T needle) * @return true `t` is valid for a hash based groupby * @return false `t` is invalid for a hash based groupby */ -bool constexpr is_hash_aggregation(aggregation::Kind t) -{ - return array_contains(hash_aggregations, t); -} +bool is_hash_aggregation(aggregation::Kind t) { return hash_aggregations.contains(t); } std::unique_ptr
dispatch_groupby(table_view const& keys, host_span requests, diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index bdd05c2e01b..8b61254ce38 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -16,7 +16,7 @@ #include "hash_compound_agg_finalizer.hpp" #include "helpers.cuh" -#include "var_hash_functor.cuh" +#include "m2_var_functor.cuh" #include #include @@ -29,7 +29,6 @@ #include #include -#include #include @@ -150,6 +149,38 @@ void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation dense_results->add_result(col, agg, std::move(result)); } +template +void hash_compound_agg_finalizer::visit(cudf::detail::m2_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) { return; } + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + auto const sum_result = sparse_results->get_result(col, *sum_agg); + auto const count_result = sparse_results->get_result(col, *count_agg); + + auto const d_values_ptr = column_device_view::create(col, stream); + auto const d_sum_ptr = column_device_view::create(sum_result, stream).release(); + auto const d_count_ptr = column_device_view::create(count_result, stream).release(); + + auto output = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto output_view = mutable_column_device_view::create(output->mutable_view(), stream); + auto output_tview = mutable_table_view{{output->mutable_view()}}; + cudf::detail::initialize_with_identity( + output_tview, host_span(&agg.kind, 1), stream); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + col.size(), + m2_hash_functor{set, row_bitmask, *output_view, *d_values_ptr, *d_sum_ptr, *d_count_ptr}); + sparse_results->add_result(col, agg, std::move(output)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + template void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation const& agg) { diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp index 8bee1a92c40..63e08a19177 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,6 +62,8 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final void visit(cudf::detail::mean_aggregation const& agg) override; + void visit(cudf::detail::m2_aggregation const& agg) override; + void visit(cudf::detail::var_aggregation const& agg) override; void visit(cudf::detail::std_aggregation const& agg) override; diff --git a/cpp/src/groupby/hash/var_hash_functor.cuh b/cpp/src/groupby/hash/m2_var_functor.cuh similarity index 58% rename from cpp/src/groupby/hash/var_hash_functor.cuh rename to cpp/src/groupby/hash/m2_var_functor.cuh index 51a4ce22dbe..c8f72fc4fbc 100644 --- a/cpp/src/groupby/hash/var_hash_functor.cuh +++ b/cpp/src/groupby/hash/m2_var_functor.cuh @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #pragma once #include @@ -22,11 +23,82 @@ #include #include -#include #include -#include namespace cudf::groupby::detail::hash { + +template +__device__ constexpr static bool is_m2_var_supported() +{ + return is_numeric() && !is_fixed_point(); +} + +template +struct m2_hash_functor { + SetType set; + bitmask_type const* __restrict__ row_bitmask; + mutable_column_device_view target; + column_device_view source; + column_device_view sum; + column_device_view count; + m2_hash_functor(SetType set, + bitmask_type const* row_bitmask, + mutable_column_device_view target, + column_device_view source, + column_device_view sum, + column_device_view count) + : set{set}, row_bitmask{row_bitmask}, target{target}, source{source}, sum{sum}, count{count} + { + } + + template + __device__ void operator()(column_device_view const&, size_type, size_type) noexcept + requires(!is_m2_var_supported()) + { + CUDF_UNREACHABLE("Invalid source type for M2 aggregation."); + } + + template + __device__ void operator()(column_device_view const& source, + size_type source_index, + size_type target_index) noexcept + requires(is_m2_var_supported()) + { + using Target = cudf::detail::target_type_t; + using SumType = cudf::detail::target_type_t; + using CountType = cudf::detail::target_type_t; + + if (source.is_null(source_index)) { return; } + auto const group_size = count.element(target_index); + if (group_size == 0) { return; } + + auto const x = static_cast(source.element(source_index)); + auto const mean = static_cast(sum.element(target_index)) / group_size; + auto const diff = x - mean; + auto const result = diff * diff; + cuda::atomic_ref ref{target.element(target_index)}; + ref.fetch_add(result, cuda::std::memory_order_relaxed); + if (target.is_null(target_index)) { target.set_valid(target_index); } + } + + __device__ inline void operator()(size_type source_index) + { + if (row_bitmask == nullptr or bit_is_set(row_bitmask, source_index)) { + auto const target_index = *set.find(source_index); + + auto col = source; + auto source_type = source.type(); + if (source_type.id() == type_id::DICTIONARY32) { + col = source.child(cudf::dictionary_column_view::keys_column_index); + source_type = col.type(); + source_index = static_cast(source.element(source_index)); + } + + type_dispatcher(source_type, *this, col, source_index, target_index); + } + } +}; + template struct var_hash_functor { SetType set; @@ -54,16 +126,8 @@ struct var_hash_functor { } template - constexpr static bool is_supported() - { - return is_numeric() && !is_fixed_point(); - } - - template - __device__ void operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept - requires(!is_supported()) + __device__ void operator()(column_device_view const&, size_type, size_type) noexcept + requires(!is_m2_var_supported()) { CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination."); } @@ -72,7 +136,7 @@ struct var_hash_functor { __device__ void operator()(column_device_view const& source, size_type source_index, size_type target_index) noexcept - requires(is_supported()) + requires(is_m2_var_supported()) { using Target = cudf::detail::target_type_t; using SumType = cudf::detail::target_type_t; @@ -109,4 +173,5 @@ struct var_hash_functor { } } }; + } // namespace cudf::groupby::detail::hash diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp index 4359c154cf6..085d06fcdd7 100644 --- a/cpp/tests/groupby/m2_tests.cpp +++ b/cpp/tests/groupby/m2_tests.cpp @@ -1,5 +1,5 @@ /* - 
* Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include using namespace cudf::test::iterators; @@ -41,14 +42,39 @@ using M2s_col = cudf::test::fixed_width_column_wrapper; auto compute_M2(cudf::column_view const& keys, cudf::column_view const& values) { - std::vector requests; - requests.emplace_back(); - requests[0].values = values; - requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); - auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); - auto result = gb_obj.aggregate(requests); - return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); + + auto [hash_gb_keys, hash_gb_vals] = [&] { + std::vector requests; + requests.emplace_back(); + requests[0].values = values; + requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); + auto const result = gb_obj.aggregate(requests); + auto const sort_order = cudf::sorted_order(result.first->view(), {}, {}); + auto const sorted_keys = cudf::gather(result.first->view(), *sort_order); + auto const sorted_vals = + cudf::gather(cudf::table_view({result.second[0].results[0]->view()}), *sort_order); + return std::pair(std::move(sorted_keys->release()[0]), std::move(sorted_vals->release()[0])); + }(); + + auto const [sort_gb_keys, sort_gb_vals] = [&] { + // Create a fresh aggregation request for sort-based aggregation instead of reusing. + // This is to avoid wrong output when the previous groupby aggregation has not been executed + // while the requests vector is modified. + std::vector requests; + requests.emplace_back(); + requests[0].values = values; + requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); + requests[0].aggregations.emplace_back( + cudf::make_nth_element_aggregation(0)); + auto result = gb_obj.aggregate(requests); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); + }(); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*hash_gb_keys, *sort_gb_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*hash_gb_vals, *sort_gb_vals, verbosity); + + return std::pair(std::move(hash_gb_keys), std::move(hash_gb_vals)); } } // namespace diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 12d7c15791b..1289f468002 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7799,10 +7799,11 @@ void testGroupByM2() { .build(); Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); + Table resultsSorted = results.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder().column(1, 2, 3) .column(42.0, 122.75, 114.0) .build()) { - assertTablesAreEqual(expected, results); + assertTablesAreEqual(expected, resultsSorted); } // Test with values have nulls (the values associated with key=2 has both nulls and non-nulls, @@ -7812,10 +7813,11 @@ void testGroupByM2() { .build(); Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); + Table resultsSorted = results.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder().column(1, 2, 3, 4, 5) .column(0.0, 2.0, 8.0, 0.0, null) .build()) { - assertTablesAreEqual(expected, results); + assertTablesAreEqual(expected, resultsSorted); } // Test with floating-point 
values having NaN:
@@ -7824,10 +7826,11 @@ void testGroupByM2() {
         .build();
          Table results = input.groupBy(0).aggregate(GroupByAggregation.M2()
              .onColumn(1));
+         Table resultsSorted = results.orderBy(OrderByArg.asc(0));
          Table expected = new Table.TestBuilder().column(1, 2, 3, 4, null)
              .column(18.0, Double.NaN, 18.0, Double.NaN, 0.0)
              .build()) {
-      assertTablesAreEqual(expected, results);
+      assertTablesAreEqual(expected, resultsSorted);
     }

     // Test with floating-point values having NaN and +/- Inf
@@ -7857,10 +7860,11 @@ void testGroupByM2() {
         .build();
          Table results = input.groupBy(0).aggregate(GroupByAggregation.M2()
              .onColumn(1));
+         Table resultsSorted = results.orderBy(OrderByArg.asc(0));
          Table expected = new Table.TestBuilder().column(1, 2, 3, 4, 5)
              .column(Double.NaN, Double.NaN, Double.NaN, Double.NaN, 12.5)
              .build()) {
-      assertTablesAreEqual(expected, results);
+      assertTablesAreEqual(expected, resultsSorted);
     }
   }
 }

From 1cf49c9043c3c997a64d6b05dbfd773700ec2c44 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Wed, 13 Aug 2025 15:16:57 -0700
Subject: [PATCH 121/366] Add streams support to all list APIs (#19683)

Contributes to #15163

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/19683
---
 .../pylibcudf/libcudf/lists/combine.pxd       |  11 +-
 .../pylibcudf/libcudf/lists/contains.pxd      |   8 +-
 .../libcudf/lists/count_elements.pxd          |   6 +-
 .../pylibcudf/libcudf/lists/explode.pxd       |   4 +-
 .../pylibcudf/libcudf/lists/extract.pxd       |   9 +-
 .../pylibcudf/libcudf/lists/filling.pxd       |   5 +-
 .../pylibcudf/libcudf/lists/gather.pxd        |   4 +-
 .../pylibcudf/libcudf/lists/reverse.pxd       |   4 +-
 .../libcudf/lists/set_operations.pxd          |  15 +-
 .../pylibcudf/libcudf/lists/sorting.pxd       |   9 +-
 .../libcudf/lists/stream_compaction.pxd       |   9 +-
 python/pylibcudf/pylibcudf/lists.pxd          |  63 +++++--
 python/pylibcudf/pylibcudf/lists.pyi          |  47 ++++--
 python/pylibcudf/pylibcudf/lists.pyx          | 159 +++++++++++++-----
 14 files changed, 263 insertions(+), 90 deletions(-)
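Each of the declarations touched below gains an explicit `cuda_stream_view` parameter, mirroring the stream-accepting overloads in the libcudf `cudf/lists/*.hpp` headers. As a minimal C++ illustration of what this exposes, the sketch below runs `cudf::lists::count_elements` on a caller-owned stream; the stream-taking signature is the one declared in the `count_elements.pxd` hunk further down, while the `count_on_stream` wrapper name is hypothetical.

```cpp
// Minimal sketch: run a list reduction on a caller-provided CUDA stream
// instead of the library default. Error handling omitted for brevity.
#include <cudf/column/column.hpp>
#include <cudf/lists/count_elements.hpp>
#include <cudf/lists/lists_column_view.hpp>

#include <rmm/cuda_stream.hpp>

#include <memory>

std::unique_ptr<cudf::column> count_on_stream(cudf::lists_column_view const& lists)
{
  rmm::cuda_stream stream;  // owns a freshly created CUDA stream
  // count_elements launches its kernels on the given stream.
  auto counts = cudf::lists::count_elements(lists, stream.view());
  stream.synchronize();  // make the result safe to consume on other streams
  return counts;
}
```

The modified `lists.pxd`, `lists.pyi`, and `lists.pyx` in the diffstat plumb the same parameter through the pylibcudf wrappers.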
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd
index 3e4c88d62b0..f01318e0e4e 100644
--- a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.

 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
@@ -6,6 +6,7 @@
 from pylibcudf.exception_handler cimport libcudf_exception_handler
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.table.table_view cimport table_view
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view


 cdef extern from "cudf/lists/combine.hpp" namespace \
@@ -16,14 +17,18 @@ cdef extern from "cudf/lists/combine.hpp" namespace \
         NULLIFY_OUTPUT_ROW

     cdef unique_ptr[column] concatenate_rows(
-        const table_view input_table
+        const table_view input_table,
+        concatenate_null_policy null_policy,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler

     cdef unique_ptr[column] concatenate_list_elements(
         const table_view input_table,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler

     cdef unique_ptr[column] concatenate_list_elements(
         const column_view input_table,
-        concatenate_null_policy null_policy
+        concatenate_null_policy null_policy,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd
index 13a32d46c7a..23b39f78e1f 100644
--- a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from pylibcudf.exception_handler cimport libcudf_exception_handler
@@ -6,6 +6,7 @@
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 from pylibcudf.libcudf.scalar.scalar cimport scalar
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view


 cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:
@@ -17,25 +18,30 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:
     cdef unique_ptr[column] contains(
         const lists_column_view& lists,
         const scalar& search_key,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler

     cdef unique_ptr[column] contains(
         const lists_column_view& lists,
         const column_view& search_keys,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler

     cdef unique_ptr[column] contains_nulls(
         const lists_column_view& lists,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler

     cdef unique_ptr[column] index_of(
         const lists_column_view& lists,
         const scalar& search_key,
         duplicate_find_option find_option,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler

     cdef unique_ptr[column] index_of(
         const lists_column_view& lists,
         const column_view& search_keys,
         duplicate_find_option find_option,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd
index 64c75ccabd3..64fbb4abbbc 100644
--- a/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd
@@ -1,11 +1,13 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from pylibcudf.exception_handler cimport libcudf_exception_handler
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view


 cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil:
     cdef unique_ptr[column] count_elements(
-        const lists_column_view&
+        const lists_column_view&,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd
index adec02caad1..bf57dbc353a 100644
--- a/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd
@@ -1,13 +1,15 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from pylibcudf.exception_handler cimport libcudf_exception_handler
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.table.table_view cimport table_view
 from pylibcudf.libcudf.types cimport size_type
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view


 cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil:
     cdef unique_ptr[table] explode_outer(
         const table_view,
         size_type explode_column_idx,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd
index 046bb51c68e..b837f6ca409 100644
--- a/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd
@@ -1,17 +1,20 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from pylibcudf.exception_handler cimport libcudf_exception_handler
 from pylibcudf.libcudf.column.column cimport column, column_view
 from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 from pylibcudf.libcudf.types cimport size_type
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view


 cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil:
     cdef unique_ptr[column] extract_list_element(
         const lists_column_view&,
-        size_type
+        size_type,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler

     cdef unique_ptr[column] extract_list_element(
         const lists_column_view&,
-        const column_view&
+        const column_view&,
+        cuda_stream_view stream
     ) except +libcudf_exception_handler
diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd
index 35e2559d902..5c6ce69f648 100644
--- a/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd
@@ -1,18 +1,21 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] sequences( const column_view& starts, const column_view& sizes, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] sequences( const column_view& starts, const column_view& steps, const column_view& sizes, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd index 4fde535b306..c4c189b0148 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd @@ -4,10 +4,12 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.copying cimport out_of_bounds_policy from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] segmented_gather( const lists_column_view& source_column, const lists_column_view& gather_map_list, - out_of_bounds_policy bounds_policy + out_of_bounds_policy bounds_policy, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd index 1ae3b4409ef..68d4d91aea9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd @@ -1,11 +1,13 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] reverse( const lists_column_view& lists_column, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd index 1f4855bdbf3..076c765b3c3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd @@ -1,9 +1,10 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.types cimport nan_equality, null_equality +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: @@ -11,26 +12,30 @@ cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: const lists_column_view& lhs, const lists_column_view& rhs, null_equality nulls_equal, - nan_equality nans_equal + nan_equality nans_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] have_overlap( const lists_column_view& lhs, const lists_column_view& rhs, null_equality nulls_equal, - nan_equality nans_equal + nan_equality nans_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] intersect_distinct( const lists_column_view& lhs, const lists_column_view& rhs, null_equality nulls_equal, - nan_equality nans_equal + nan_equality nans_equal, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] union_distinct( const lists_column_view& lhs, const lists_column_view& rhs, null_equality nulls_equal, - nan_equality nans_equal + nan_equality nans_equal, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd index 344b55b402f..80b46727650 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd @@ -1,20 +1,23 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.types cimport null_order, order +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] sort_lists( const lists_column_view source_column, order column_order, - null_order null_precedence + null_order null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] stable_sort_lists( const lists_column_view source_column, order column_order, - null_order null_precedence + null_order null_precedence, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd index 8341ac69bf5..0766d36e724 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd @@ -1,9 +1,11 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from pylibcudf.libcudf.stream_compaction cimport duplicate_keep_option from pylibcudf.libcudf.types cimport nan_equality, null_equality +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/lists/stream_compaction.hpp" \ @@ -11,10 +13,13 @@ cdef extern from "cudf/lists/stream_compaction.hpp" \ cdef unique_ptr[column] apply_boolean_mask( const lists_column_view& lists_column, const lists_column_view& boolean_mask, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] distinct( const lists_column_view& lists_column, null_equality nulls_equal, - nan_equality nans_equal + nan_equality nans_equal, + duplicate_keep_option keep_option, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd index ee0c1390791..42484910d02 100644 --- a/python/pylibcudf/pylibcudf/lists.pxd +++ b/python/pylibcudf/pylibcudf/lists.pxd @@ -7,6 +7,7 @@ from pylibcudf.libcudf.types cimport ( from pylibcudf.libcudf.copying cimport out_of_bounds_policy from pylibcudf.libcudf.lists.combine cimport concatenate_null_policy from pylibcudf.libcudf.lists.contains cimport duplicate_find_option +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .scalar cimport Scalar @@ -20,46 +21,72 @@ ctypedef fused ColumnOrSizeType: Column size_type -cpdef Table explode_outer(Table, size_type explode_column_idx) +cpdef Table explode_outer(Table, size_type explode_column_idx, Stream stream=*) -cpdef Column concatenate_rows(Table) +cpdef Column concatenate_rows(Table, Stream stream=*) -cpdef Column concatenate_list_elements(Column, concatenate_null_policy null_policy) +cpdef Column concatenate_list_elements( + Column, concatenate_null_policy null_policy, Stream stream=* +) -cpdef Column contains(Column, ColumnOrScalar) +cpdef Column contains(Column, ColumnOrScalar, Stream stream=*) -cpdef Column contains_nulls(Column) +cpdef Column contains_nulls(Column, Stream stream=*) -cpdef Column index_of(Column, ColumnOrScalar, duplicate_find_option) +cpdef Column index_of( + Column, ColumnOrScalar, duplicate_find_option, Stream stream=* +) -cpdef Column reverse(Column) +cpdef Column reverse(Column, Stream stream=*) -cpdef Column segmented_gather(Column, Column, out_of_bounds_policy bounds_policy=*) +cpdef Column segmented_gather( + Column, Column, out_of_bounds_policy bounds_policy=*, Stream stream=* +) -cpdef Column extract_list_element(Column, ColumnOrSizeType) +cpdef Column extract_list_element(Column, ColumnOrSizeType, Stream stream=*) -cpdef Column count_elements(Column) +cpdef Column count_elements(Column, Stream stream=*) -cpdef Column sequences(Column, Column, Column steps = *) +cpdef Column sequences( + Column, Column, Column steps = *, Stream stream=* +) -cpdef Column sort_lists(Column, order, null_order, bool stable = *) +cpdef Column sort_lists( + Column, order, null_order, bool stable = *, Stream stream=* +) cpdef Column difference_distinct( - Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* + Column, + Column, + null_equality nulls_equal=*, + nan_equality nans_equal=*, + Stream stream=* ) cpdef Column have_overlap( - Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* + Column, + Column, + 
null_equality nulls_equal=*, nan_equality nans_equal=* + Column, + Column, +
null_equality nulls_equal=*, + nan_equality nans_equal=*, + Stream stream=* ) cpdef Column intersect_distinct( - Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* + Column, + Column, + null_equality nulls_equal=*, + nan_equality nans_equal=*, + Stream stream=* ) cpdef Column union_distinct( - Column, Column, null_equality nulls_equal=*, nan_equality nans_equal=* + Column, + Column, + null_equality nulls_equal=*, + nan_equality nans_equal=*, + Stream stream=* ) -cpdef Column apply_boolean_mask(Column, Column) +cpdef Column apply_boolean_mask(Column, Column, Stream stream=*) -cpdef Column distinct(Column, null_equality, nan_equality) +cpdef Column distinct(Column, null_equality, nan_equality, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/lists.pyi b/python/pylibcudf/pylibcudf/lists.pyi index e08d3df1036..0fe4afa08ac 100644 --- a/python/pylibcudf/pylibcudf/lists.pyi +++ b/python/pylibcudf/pylibcudf/lists.pyi @@ -2,6 +2,8 @@ from enum import IntEnum +from rmm import Stream + from pylibcudf.column import Column from pylibcudf.copying import OutOfBoundsPolicy from pylibcudf.scalar import Scalar @@ -16,60 +18,83 @@ class DuplicateFindOption(IntEnum): FIND_FIRST = ... FIND_LAST = ... -def explode_outer(input: Table, explode_column_idx: int) -> Table: ... -def concatenate_rows(input: Table) -> Column: ... +def explode_outer( + input: Table, explode_column_idx: int, stream: Stream | None = None +) -> Table: ... +def concatenate_rows(input: Table, stream: Stream | None = None) -> Column: ... def concatenate_list_elements( - input: Column, null_policy: ConcatenateNullPolicy + input: Column, + null_policy: ConcatenateNullPolicy, + stream: Stream | None = None, +) -> Column: ... +def contains( + input: Column, search_key: Column | Scalar, stream: Stream | None = None ) -> Column: ... -def contains(input: Column, search_key: Column | Scalar) -> Column: ... -def contains_nulls(input: Column) -> Column: ... +def contains_nulls(input: Column, stream: Stream | None = None) -> Column: ... def index_of( input: Column, search_key: Column | Scalar, find_option: DuplicateFindOption, + stream: Stream | None = None, ) -> Column: ... -def reverse(input: Column) -> Column: ... +def reverse(input: Column, stream: Stream | None = None) -> Column: ... def segmented_gather( input: Column, gather_map_list: Column, bounds_policy: OutOfBoundsPolicy = OutOfBoundsPolicy.DONT_CHECK, + stream: Stream | None = None, ) -> Column: ... -def extract_list_element(input: Column, index: Column | int) -> Column: ... -def count_elements(input: Column) -> Column: ... +def extract_list_element( + input: Column, index: Column | int, stream: Stream | None = None +) -> Column: ... +def count_elements(input: Column, stream: Stream | None = None) -> Column: ... def sequences( - starts: Column, sizes: Column, steps: Column | None = None + starts: Column, + sizes: Column, + steps: Column | None = None, + stream: Stream | None = None, ) -> Column: ... def sort_lists( input: Column, sort_order: Order, na_position: NullOrder, stable: bool = False, + stream: Stream | None = None, ) -> Column: ... def difference_distinct( lhs: Column, rhs: Column, nulls_equal: NullEquality = NullEquality.EQUAL, nans_equal: NanEquality = NanEquality.ALL_EQUAL, + stream: Stream | None = None, ) -> Column: ... def have_overlap( lhs: Column, rhs: Column, nulls_equal: NullEquality = NullEquality.EQUAL, nans_equal: NanEquality = NanEquality.ALL_EQUAL, + stream: Stream | None = None, ) -> Column: ... 
nulls_equal: NullEquality = NullEquality.EQUAL, nans_equal: NanEquality = NanEquality.ALL_EQUAL, + stream: Stream | None = None, ) -> Column: ...
def intersect_distinct( lhs: Column, rhs: Column, nulls_equal: NullEquality = NullEquality.EQUAL, nans_equal: NanEquality = NanEquality.ALL_EQUAL, + stream: Stream | None = None, ) -> Column: ... def union_distinct( lhs: Column, rhs: Column, nulls_equal: NullEquality = NullEquality.EQUAL, nans_equal: NanEquality = NanEquality.ALL_EQUAL, + stream: Stream | None = None, +) -> Column: ... +def apply_boolean_mask( + input: Column, boolean_mask: Column, stream: Stream | None = None ) -> Column: ... -def apply_boolean_mask(input: Column, boolean_mask: Column) -> Column: ... def distinct( - input: Column, nulls_equal: NullEquality, nans_equal: NanEquality + input: Column, + nulls_equal: NullEquality, + nans_equal: NanEquality, + stream: Stream | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index eee5a43f6a8..5d6744f41c6 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -33,6 +33,7 @@ from pylibcudf.libcudf.lists.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, distinct as cpp_distinct, ) +from pylibcudf.libcudf.stream_compaction cimport duplicate_keep_option from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport ( nan_equality, @@ -46,9 +47,12 @@ from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType from pylibcudf.libcudf.lists.combine import concatenate_null_policy as ConcatenateNullPolicy # no-cython-lint from pylibcudf.libcudf.lists.contains import duplicate_find_option as DuplicateFindOption # no-cython-lint +from rmm.pylibrmm.stream cimport Stream + from .column cimport Column, ListColumnView from .scalar cimport Scalar from .table cimport Table +from .utils cimport _get_stream __all__ = [ "ConcatenateNullPolicy", @@ -73,7 +77,9 @@ __all__ = [ "union_distinct", ] -cpdef Table explode_outer(Table input, size_type explode_column_idx): +cpdef Table explode_outer( + Table input, size_type explode_column_idx, Stream stream=None +): """Explode a column of lists into rows. All other columns will be duplicated for each element in the list. @@ -94,13 +100,17 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): """ cdef unique_ptr[table] c_result + stream = _get_stream(stream) + with nogil: - c_result = cpp_explode.explode_outer(input.view(), explode_column_idx) + c_result = cpp_explode.explode_outer( + input.view(), explode_column_idx, stream.view() + ) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) -cpdef Column concatenate_rows(Table input): +cpdef Column concatenate_rows(Table input, Stream stream=None): """Concatenate multiple lists columns into a single lists column row-wise. For details, see :cpp:func:`concatenate_list_elements`. @@ -117,14 +127,18 @@ cpdef Column concatenate_rows(Table input): """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) + with nogil: - c_result = cpp_concatenate_rows(input.view()) + c_result = cpp_concatenate_rows( + input.view(), concatenate_null_policy.IGNORE, stream.view() + ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column concatenate_list_elements( - Column input, concatenate_null_policy null_policy + Column input, concatenate_null_policy null_policy, Stream stream=None ): """Concatenate multiple lists on the same row into a single list. 
@@ -144,13 +158,17 @@ cpdef Column concatenate_list_elements( """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) + with nogil: - c_result = cpp_concatenate_list_elements(input.view(), null_policy) + c_result = cpp_concatenate_list_elements( + input.view(), null_policy, stream.view() + ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column contains(Column input, ColumnOrScalar search_key): +cpdef Column contains(Column input, ColumnOrScalar search_key, Stream stream=None): """Create a column of bool values indicating whether the search_key is contained in the input. @@ -176,6 +194,8 @@ cpdef Column contains(Column input, ColumnOrScalar search_key): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() + stream = _get_stream(stream) + if not isinstance(search_key, (Column, Scalar)): raise TypeError("Must pass a Column or Scalar") @@ -185,11 +205,12 @@ cpdef Column contains(Column input, ColumnOrScalar search_key): search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column contains_nulls(Column input): +cpdef Column contains_nulls(Column input, Stream stream=None): """Create a column of bool values indicating whether each row in the lists column contains a null value. @@ -208,13 +229,19 @@ cpdef Column contains_nulls(Column input): """ cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() + + stream = _get_stream(stream) + with nogil: - c_result = cpp_contains.contains_nulls(list_view.view()) - return Column.from_libcudf(move(c_result)) + c_result = cpp_contains.contains_nulls(list_view.view(), stream.view()) + return Column.from_libcudf(move(c_result), stream) cpdef Column index_of( - Column input, ColumnOrScalar search_key, duplicate_find_option find_option + Column input, + ColumnOrScalar search_key, + duplicate_find_option find_option, + Stream stream=None ): """Create a column of index values indicating the position of a search key row within the corresponding list row in the lists column. @@ -243,6 +270,9 @@ cpdef Column index_of( """ cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() + + stream = _get_stream(stream) + with nogil: c_result = cpp_contains.index_of( list_view.view(), @@ -250,11 +280,12 @@ cpdef Column index_of( search_key.get() ), find_option, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column reverse(Column input): +cpdef Column reverse(Column input, Stream stream=None): """Reverse the element order within each list of the input column. For details, see :cpp:func:`reverse`. 
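A minimal caller-side sketch of the new stream keyword, not part of the patch itself. It assumes the usual pyarrow round-trip via pylibcudf.interop and that DEFAULT_STREAM is importable from rmm.pylibrmm.stream; exact import paths may differ between releases.

import pyarrow as pa
import pylibcudf as plc
from rmm.pylibrmm.stream import DEFAULT_STREAM

# Build a small lists column from Arrow (each row is itself a list).
lists_col = plc.interop.from_arrow(pa.array([[3, 1, 2], [9, 7]]))

# Pass a stream explicitly; stream=None (the default) resolves to the
# library default via _get_stream, exactly as in the wrappers above.
reversed_col = plc.lists.reverse(lists_col, stream=DEFAULT_STREAM)
print(plc.interop.to_arrow(reversed_col))  # [[2, 1, 3], [7, 9]]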
@@ -272,15 +303,18 @@ cpdef Column reverse(Column input): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() + stream = _get_stream(stream) + with nogil: - c_result = cpp_reverse.reverse(list_view.view()) - return Column.from_libcudf(move(c_result)) + c_result = cpp_reverse.reverse(list_view.view(), stream.view()) + return Column.from_libcudf(move(c_result), stream) cpdef Column segmented_gather( Column input, Column gather_map_list, out_of_bounds_policy bounds_policy=out_of_bounds_policy.DONT_CHECK, + Stream stream=None, ): """Create a column with elements gathered based on the indices in gather_map_list @@ -314,16 +348,21 @@ cpdef Column segmented_gather( cdef ListColumnView list_view1 = input.list_view() cdef ListColumnView list_view2 = gather_map_list.list_view() + stream = _get_stream(stream) + with nogil: c_result = cpp_gather.segmented_gather( list_view1.view(), list_view2.view(), bounds_policy, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column extract_list_element(Column input, ColumnOrSizeType index): +cpdef Column extract_list_element( + Column input, ColumnOrSizeType index, Stream stream=None +): """Create a column of extracted list elements. For details, see :cpp:func:`extract_list_element`. @@ -343,15 +382,18 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() + stream = _get_stream(stream) + with nogil: c_result = cpp_extract_list_element( list_view.view(), index.view() if ColumnOrSizeType is Column else index, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column count_elements(Column input): +cpdef Column count_elements(Column input, Stream stream=None): """Count the number of rows in each list element in the given lists column. For details, see :cpp:func:`count_elements`. @@ -371,13 +413,17 @@ cpdef Column count_elements(Column input): cdef ListColumnView list_view = input.list_view() cdef unique_ptr[column] c_result + stream = _get_stream(stream) + with nogil: - c_result = cpp_count_elements(list_view.view()) + c_result = cpp_count_elements(list_view.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column sequences(Column starts, Column sizes, Column steps = None): +cpdef Column sequences( + Column starts, Column sizes, Column steps = None, Stream stream=None +): """Create a lists column in which each row contains a sequence of values specified by a tuple of (start, step, size) parameters. @@ -399,23 +445,27 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None): """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) + if steps is not None: with nogil: c_result = cpp_filling.sequences( starts.view(), steps.view(), sizes.view(), + stream.view(), ) else: with nogil: - c_result = cpp_filling.sequences(starts.view(), sizes.view()) - return Column.from_libcudf(move(c_result)) + c_result = cpp_filling.sequences(starts.view(), sizes.view(), stream.view()) + return Column.from_libcudf(move(c_result), stream) cpdef Column sort_lists( Column input, order sort_order, null_order na_position, - bool stable = False + bool stable = False, + Stream stream=None ): """Sort the elements within a list in each row of a list column. 
@@ -442,20 +492,24 @@ cpdef Column sort_lists( cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() + stream = _get_stream(stream) + with nogil: if stable: c_result = cpp_stable_sort_lists( list_view.view(), sort_order, na_position, + stream.view(), ) else: c_result = cpp_sort_lists( list_view.view(), sort_order, na_position, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column difference_distinct( @@ -463,6 +517,7 @@ cpdef Column difference_distinct( Column rhs, null_equality nulls_equal=null_equality.EQUAL, nan_equality nans_equal=nan_equality.ALL_EQUAL, + Stream stream=None, ): """Create a column of index values indicating the position of a search key row within the corresponding list row in the lists column. @@ -489,14 +544,17 @@ cpdef Column difference_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() + stream = _get_stream(stream) + with nogil: c_result = cpp_set_operations.difference_distinct( lhs_view.view(), rhs_view.view(), nulls_equal, nans_equal, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column have_overlap( @@ -504,6 +562,7 @@ cpdef Column have_overlap( Column rhs, null_equality nulls_equal=null_equality.EQUAL, nan_equality nans_equal=nan_equality.ALL_EQUAL, + Stream stream=None, ): """Check if lists at each row of the given lists columns overlap. @@ -529,14 +588,17 @@ cpdef Column have_overlap( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() + stream = _get_stream(stream) + with nogil: c_result = cpp_set_operations.have_overlap( lhs_view.view(), rhs_view.view(), nulls_equal, nans_equal, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column intersect_distinct( @@ -544,6 +606,7 @@ cpdef Column intersect_distinct( Column rhs, null_equality nulls_equal=null_equality.EQUAL, nan_equality nans_equal=nan_equality.ALL_EQUAL, + Stream stream=None, ): """Create a lists column of distinct elements common to two input lists columns. @@ -569,14 +632,17 @@ cpdef Column intersect_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() + stream = _get_stream(stream) + with nogil: c_result = cpp_set_operations.intersect_distinct( lhs_view.view(), rhs_view.view(), nulls_equal, nans_equal, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column union_distinct( @@ -584,6 +650,7 @@ cpdef Column union_distinct( Column rhs, null_equality nulls_equal=null_equality.EQUAL, nan_equality nans_equal=nan_equality.ALL_EQUAL, + Stream stream=None, ): """Create a lists column of distinct elements found in either of two input lists columns. 
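The set operations accept the same optional stream alongside their null/NaN equality options. A hedged sketch under the same interop assumptions as the example above; element order within each result list is unspecified.

import pyarrow as pa
import pylibcudf as plc

lhs = plc.interop.from_arrow(pa.array([[1, 2, 2], [3]]))
rhs = plc.interop.from_arrow(pa.array([[2, 4], [5]]))

# With stream omitted, _get_stream(None) supplies the default stream.
out = plc.lists.union_distinct(lhs, rhs)
print(plc.interop.to_arrow(out))  # e.g. [[1, 2, 4], [3, 5]]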
@@ -610,17 +677,20 @@ cpdef Column union_distinct( cdef ListColumnView lhs_view = lhs.list_view() cdef ListColumnView rhs_view = rhs.list_view() + stream = _get_stream(stream) + with nogil: c_result = cpp_set_operations.union_distinct( lhs_view.view(), rhs_view.view(), nulls_equal, nans_equal, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column apply_boolean_mask(Column input, Column boolean_mask): +cpdef Column apply_boolean_mask(Column input, Column boolean_mask, Stream stream=None): """Filters elements in each row of the input lists column using a boolean mask For details, see :cpp:func:`apply_boolean_mask`. @@ -640,15 +710,24 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() cdef ListColumnView mask_view = boolean_mask.list_view() + + stream = _get_stream(stream) + with nogil: c_result = cpp_apply_boolean_mask( list_view.view(), mask_view.view(), + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column distinct(Column input, null_equality nulls_equal, nan_equality nans_equal): +cpdef Column distinct( + Column input, + null_equality nulls_equal, + nan_equality nans_equal, + Stream stream=None +): """Create a new list column without duplicate elements in each list. For details, see :cpp:func:`distinct`. @@ -670,13 +749,17 @@ cpdef Column distinct(Column input, null_equality nulls_equal, nan_equality nans cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() + stream = _get_stream(stream) + with nogil: c_result = cpp_distinct( list_view.view(), nulls_equal, nans_equal, + duplicate_keep_option.KEEP_ANY, + stream.view(), ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) ConcatenateNullPolicy.__str__ = ConcatenateNullPolicy.__repr__ DuplicateFindOption.__str__ = DuplicateFindOption.__repr__ From 86e5d5522aea9347d64c33af55e34f2d2902b30b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Aug 2025 15:40:03 -0700 Subject: [PATCH 122/366] Move test_array_function/ufunc to new cudf classic test directory structure (#19637) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19637 --- python/cudf/cudf/tests/conftest.py | 44 ++ .../tests/dataframe/test_array_function.py | 44 ++ .../cudf/tests/dataframe/test_np_ufuncs.py | 128 +++++ .../indexes/index/test_array_function.py | 29 ++ .../indexes/multiindex/test_array_function.py | 24 + .../cudf/cudf/tests/indexes/test_np_ufuncs.py | 85 +++ .../cudf/tests/series/test_array_function.py | 36 ++ .../cudf/cudf/tests/series/test_np_ufuncs.py | 210 ++++++++ python/cudf/cudf/tests/test_array_function.py | 133 ----- python/cudf/cudf/tests/test_array_ufunc.py | 486 ------------------ 10 files changed, 600 insertions(+), 619 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/test_array_function.py create mode 100644 python/cudf/cudf/tests/dataframe/test_np_ufuncs.py create mode 100644 python/cudf/cudf/tests/indexes/index/test_array_function.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/test_array_function.py create mode 100644 
python/cudf/cudf/tests/indexes/test_np_ufuncs.py create mode 100644 python/cudf/cudf/tests/series/test_array_function.py create mode 100644 python/cudf/cudf/tests/series/test_np_ufuncs.py delete mode 100644 python/cudf/cudf/tests/test_array_function.py delete mode 100644 python/cudf/cudf/tests/test_array_ufunc.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 7a8a6c3881e..4718ecdc711 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -431,6 +431,50 @@ def all_supported_types_as_str(request): return request.param +# pandas can raise warnings for some inputs to the following ufuncs: +numpy_ufuncs = [] +for name in dir(np): + func = getattr(np, name) + if isinstance(func, np.ufunc) and hasattr(cp, name): + if func in { + np.arccos, + np.arccosh, + np.arcsin, + np.arctanh, + np.fmod, + np.log, + np.log10, + np.log2, + np.reciprocal, + }: + marks = [ + pytest.mark.filterwarnings( + "ignore:invalid value encountered:RuntimeWarning" + ), + pytest.mark.filterwarnings( + "ignore:divide by zero:RuntimeWarning" + ), + ] + numpy_ufuncs.append(pytest.param(func, marks=marks)) + elif func in { + np.bitwise_and, + np.bitwise_or, + np.bitwise_xor, + }: + marks = pytest.mark.filterwarnings( + "ignore:Operation between non boolean Series:FutureWarning" + ) + numpy_ufuncs.append(pytest.param(func, marks=marks)) + else: + numpy_ufuncs.append(func) + + +@pytest.fixture(params=numpy_ufuncs) +def numpy_ufunc(request): + """Numpy ufuncs also supported by cupy.""" + return request.param + + @pytest.fixture(params=[True, False]) def dropna(request): """Param for `dropna` argument""" diff --git a/python/cudf/cudf/tests/dataframe/test_array_function.py b/python/cudf/cudf/tests/dataframe/test_array_function.py new file mode 100644 index 00000000000..49ea9d59918 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/test_array_function.py @@ -0,0 +1,44 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "func", + [ + lambda x: np.mean(x, axis=0), + lambda x: np.sum(x, axis=0), + lambda x: np.var(x, ddof=1, axis=0), + lambda x: np.dot(x, x.transpose()), + np.all, + np.any, + lambda x: np.prod(x, axis=0), + lambda x: np.prod(x, axis=1), + ], +) +def test_array_func_cudf_dataframe(func): + pd_df = pd.DataFrame(np.ones((3, 3))) + cudf_df = cudf.from_pandas(pd_df) + expect = func(pd_df) + got = func(cudf_df) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "func", + [ + lambda x: np.cov(x, x), + np.linalg.norm, + np.linalg.det, + ], +) +def test_array_func_missing_cudf_dataframe(func): + pd_df = pd.DataFrame(np.ones((3, 3))) + cudf_df = cudf.from_pandas(pd_df) + with pytest.raises(TypeError): + func(cudf_df) diff --git a/python/cudf/cudf/tests/dataframe/test_np_ufuncs.py b/python/cudf/cudf/tests/dataframe/test_np_ufuncs.py new file mode 100644 index 00000000000..315a1303806 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/test_np_ufuncs.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import operator +from functools import reduce + +import cupy as cp +import numpy as np +import pytest +from packaging.version import parse + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import expect_warning_if, set_random_null_mask_inplace + + +# Skip matmul since it requires aligned shapes. +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_dataframe(request, numpy_ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. + request.applymarker( + pytest.mark.xfail( + condition=( + indexed + and numpy_ufunc + in { + np.greater, + np.greater_equal, + np.less, + np.less_equal, + np.not_equal, + np.equal, + } + ), + reason="Comparison operators do not support misaligned indexes.", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition=numpy_ufunc in {np.ceil, np.floor, np.trunc} + and not has_nulls + and parse(np.__version__) >= parse("2.1") + and parse(cp.__version__) < parse("14"), + reason="https://github.com/cupy/cupy/issues/9018", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition=numpy_ufunc == np.matmul, + reason=f"{numpy_ufunc} is not supported in cuDF", + ) + ) + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + # TODO: Add tests of mismatched columns etc. + pandas_args = args = [ + cudf.DataFrame( + {"foo": cp.random.randint(low=1, high=10, size=N)}, + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(numpy_ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg["foo"]) + pandas_args = [arg.copy() for arg in args] + for arg in pandas_args: + arg["foo"] = arg["foo"].fillna(0) + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. 
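+ # The combined null mask is recomputed from the aligned inputs so that
+ # rows where any operand was null can be NaN-ed in the pandas expectation
+ # further below, matching cudf's null propagation.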
+ aligned = ( + cudf.core.dataframe._align_indices(*args) + if indexed and numpy_ufunc.nin == 2 + else args + ) + mask = reduce( + operator.or_, (a["foo"].isna() for a in aligned) + ).to_pandas() + + got = numpy_ufunc(*args) + + expect = numpy_ufunc(*(arg.to_pandas() for arg in pandas_args)) + + if numpy_ufunc.nout > 1: + for g, e in zip(got, expect, strict=True): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e, check_exact=False) + else: + if has_nulls: + with expect_warning_if( + numpy_ufunc + in ( + np.isfinite, + np.isinf, + np.isnan, + np.logical_and, + np.logical_not, + np.logical_or, + np.logical_xor, + np.signbit, + np.equal, + np.greater, + np.greater_equal, + np.less, + np.less_equal, + np.not_equal, + ) + ): + expect[mask] = np.nan + assert_eq(got, expect, check_exact=False) diff --git a/python/cudf/cudf/tests/indexes/index/test_array_function.py b/python/cudf/cudf/tests/indexes/index/test_array_function.py new file mode 100644 index 00000000000..4d41d81dc35 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/test_array_function.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_np_unique_cudf_index(): + np_ar = np.array([1, 1, 3]) + cudf_index = cudf.Index(np_ar) + expect = cudf.Index(np.unique(np_ar)) + got = np.unique(cudf_index) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "func", + [ + lambda x: np.cov(x, x), + np.linalg.norm, + np.linalg.det, + ], +) +def test_array_func_missing_cudf_index(func): + cudf_index = cudf.Index([1, 2, 3]) + with pytest.raises(TypeError): + func(cudf_index) diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_array_function.py b/python/cudf/cudf/tests/indexes/multiindex/test_array_function.py new file mode 100644 index 00000000000..1bd2dd132e7 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/test_array_function.py @@ -0,0 +1,24 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf + + +@pytest.mark.parametrize( + "func", + [ + lambda x: np.cov(x, x), + lambda x: np.dot(x, x), + np.linalg.norm, + np.linalg.det, + ], +) +def test_array_func_missing_cudf_multi_index(func): + levels = [["a", "b"], ["c", "d"]] + codes = [[0, 1], [1, 0]] + + cudf_multi_index = cudf.MultiIndex(levels, codes) + with pytest.raises(TypeError): + func(cudf_multi_index) diff --git a/python/cudf/cudf/tests/indexes/test_np_ufuncs.py b/python/cudf/cudf/tests/indexes/test_np_ufuncs.py new file mode 100644 index 00000000000..e91b377abcb --- /dev/null +++ b/python/cudf/cudf/tests/indexes/test_np_ufuncs.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cupy as cp +import numpy as np +import pytest +from packaging.version import parse + +import cudf +from cudf.core._compat import ( + PANDAS_LT_300, +) +from cudf.testing import assert_eq + + +def test_ufunc_index(request, numpy_ufunc): + # Note: This test assumes that all ufuncs are unary or binary. + request.applymarker( + pytest.mark.xfail( + condition=numpy_ufunc == np.matmul and PANDAS_LT_300, + reason="Fixed by https://github.com/pandas-dev/pandas/pull/57079", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition=numpy_ufunc in {np.ceil, np.floor, np.trunc} + and parse(np.__version__) >= parse("2.1") + and parse(cp.__version__) < parse("14"), + reason="https://github.com/cupy/cupy/issues/9018", + ) + ) + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. 
Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + pandas_args = args = [ + cudf.Index( + cp.random.randint(low=1, high=10, size=N), + ) + for _ in range(numpy_ufunc.nin) + ] + + got = numpy_ufunc(*args) + + expect = numpy_ufunc(*(arg.to_pandas() for arg in pandas_args)) + + if numpy_ufunc.nout > 1: + for g, e in zip(got, expect, strict=True): + assert_eq(g, e, check_exact=False) + else: + assert_eq(got, expect, check_exact=False) + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] +) +@pytest.mark.parametrize("reflect", [True, False]) +def test_binary_ufunc_index_array(ufunc, reflect): + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + args = [cudf.Index(cp.random.rand(N)) for _ in range(ufunc.nin)] + + arg1 = args[1].to_cupy() + + if reflect: + got = ufunc(arg1, args[0]) + expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) + else: + got = ufunc(args[0], arg1) + expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) + + if ufunc.nout > 1: + for g, e in zip(got, expect, strict=True): + if reflect: + assert (cp.asnumpy(g) == e).all() + else: + assert_eq(g, e, check_exact=False) + else: + if reflect: + assert (cp.asnumpy(got) == expect).all() + else: + assert_eq(got, expect, check_exact=False) diff --git a/python/cudf/cudf/tests/series/test_array_function.py b/python/cudf/cudf/tests/series/test_array_function.py new file mode 100644 index 00000000000..4715a0a7103 --- /dev/null +++ b/python/cudf/cudf/tests/series/test_array_function.py @@ -0,0 +1,36 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "func", + [ + np.mean, + np.sum, + lambda x: np.var(x, ddof=1), + np.unique, + lambda x: np.dot(x, x), + np.linalg.norm, + ], +) +def test_array_func_cudf_series(func): + np_ar = np.arange(10, dtype=np.float32) + cudf_ser = cudf.Series(np_ar) + expect = func(np_ar) + got = func(cudf_ser) + if np.isscalar(expect): + assert expect == got + else: + assert_eq(cudf.Series(expect), got) + + +@pytest.mark.parametrize("index", [None, [1, 2, 3]]) +def test_list_input_array_func(index): + s = cudf.Series(np.array([1, 2, 3]), index=index) + with pytest.raises(TypeError): + np.concatenate([s, s, s]) diff --git a/python/cudf/cudf/tests/series/test_np_ufuncs.py b/python/cudf/cudf/tests/series/test_np_ufuncs.py new file mode 100644 index 00000000000..c43f73b7e1f --- /dev/null +++ b/python/cudf/cudf/tests/series/test_np_ufuncs.py @@ -0,0 +1,210 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import operator +from functools import reduce + +import cupy as cp +import numpy as np +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import expect_warning_if, set_random_null_mask_inplace + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_series(request, numpy_ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. 
+ request.applymarker( + pytest.mark.xfail( + condition=( + indexed + and numpy_ufunc + in { + np.greater, + np.greater_equal, + np.less, + np.less_equal, + np.not_equal, + np.equal, + } + ), + reason="Comparison operators do not support misaligned indexes.", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition=numpy_ufunc == np.matmul and has_nulls, + reason="Can't call cupy on column with nulls", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition=numpy_ufunc.__name__.startswith("bitwise") + and indexed + and has_nulls, + reason="https://github.com/pandas-dev/pandas/issues/52500", + ) + ) + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + pandas_args = args = [ + cudf.Series( + cp.random.randint(low=1, high=10, size=N), + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(numpy_ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg) + pandas_args = [arg.fillna(0) for arg in args] + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. + aligned = ( + cudf.core.series._align_indices(args, allow_non_unique=True) + if indexed and numpy_ufunc.nin == 2 + else args + ) + mask = reduce(operator.or_, (a.isna() for a in aligned)).to_pandas() + + got = numpy_ufunc(*args) + + expect = numpy_ufunc(*(arg.to_pandas() for arg in pandas_args)) + + if numpy_ufunc.nout > 1: + for g, e in zip(got, expect, strict=True): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e, check_exact=False) + else: + if has_nulls: + with expect_warning_if( + numpy_ufunc + in ( + np.isfinite, + np.isinf, + np.isnan, + np.logical_and, + np.logical_not, + np.logical_or, + np.logical_xor, + np.signbit, + np.equal, + np.greater, + np.greater_equal, + np.less, + np.less_equal, + np.not_equal, + ) + ): + expect[mask] = np.nan + assert_eq(got, expect, check_exact=False) + + +@pytest.mark.parametrize( + "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] +) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +@pytest.mark.parametrize("reflect", [True, False]) +def test_binary_ufunc_series_array( + request, ufunc, has_nulls, indexed, reflect +): + fname = ufunc.__name__ + request.applymarker( + pytest.mark.xfail( + condition=reflect and has_nulls, + reason=( + "When cupy is the left operand there is no way for us to " + "avoid calling its binary operators, which cannot handle " + "cudf objects that contain nulls." + ), + ) + ) + # The way cudf casts nans in arrays to nulls during binops with cudf + # objects is currently incompatible with pandas. + request.applymarker( + pytest.mark.xfail( + condition=( + fname in {"greater", "greater_equal", "logical_and"} + and has_nulls + ), + reason=( + "cudf and pandas incompatible casting nans to nulls in binops" + ), + ) + ) + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. 
We use ints because some + # operations (like bitwise ops) are not defined for floats. + args = [ + cudf.Series( + cp.random.rand(N), + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg) + + # Cupy doesn't support nulls, so we fill with nans before converting. + args[1] = args[1].fillna(cp.nan) + mask = args[0].isna().to_pandas() + + arg1 = args[1].to_cupy() + + if reflect: + got = ufunc(arg1, args[0]) + expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) + else: + got = ufunc(args[0], arg1) + expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) + + if ufunc.nout > 1: + for g, e in zip(got, expect, strict=True): + if has_nulls: + e[mask] = np.nan + if reflect: + assert (cp.asnumpy(g) == e).all() + else: + assert_eq(g, e, check_exact=False) + else: + if has_nulls: + expect[mask] = np.nan + if reflect: + assert (cp.asnumpy(got) == expect).all() + else: + assert_eq(got, expect, check_exact=False) + + +def test_ufunc_cudf_series_error_with_out_kwarg(): + cudf_s1 = cudf.Series(data=[-1, 2, 3, 0]) + cudf_s2 = cudf.Series(data=[-1, 2, 3, 0]) + cudf_s3 = cudf.Series(data=[0, 0, 0, 0]) + with pytest.raises(TypeError): + np.add(x1=cudf_s1, x2=cudf_s2, out=cudf_s3) diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py deleted file mode 100644 index f627087d64c..00000000000 --- a/python/cudf/cudf/tests/test_array_function.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.fixture -def rng(): - return np.random.default_rng(seed=0) - - -@pytest.mark.parametrize( - "func", - [ - lambda x: np.mean(x), - lambda x: np.sum(x), - lambda x: np.var(x, ddof=1), - lambda x: np.unique(x), - lambda x: np.dot(x, x), - lambda x: np.linalg.norm(x), - ], -) -def test_array_func_cudf_series(func, rng): - np_ar = rng.random(100) - cudf_ser = cudf.Series(np_ar) - expect = func(np_ar) - got = func(cudf_ser) - if np.isscalar(expect): - assert_eq(expect, got) - else: - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", - [ - lambda x: np.mean(x, axis=0), - lambda x: np.sum(x, axis=0), - lambda x: np.var(x, ddof=1, axis=0), - lambda x: np.dot(x, x.transpose()), - lambda x: np.all(x), - lambda x: np.any(x), - lambda x: np.prod(x, axis=0), - lambda x: np.prod(x, axis=1), - ], -) -def test_array_func_cudf_dataframe(func, rng): - pd_df = pd.DataFrame(rng.uniform(size=(100, 10))) - cudf_df = cudf.from_pandas(pd_df) - expect = func(pd_df) - got = func(cudf_df) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "func", - [ - lambda x: np.cov(x, x), - lambda x: np.linalg.norm(x), - lambda x: np.linalg.det(x), - ], -) -def test_array_func_missing_cudf_dataframe(func, rng): - pd_df = pd.DataFrame(rng.uniform(size=(100, 10))) - cudf_df = cudf.from_pandas(pd_df) - with pytest.raises(TypeError): - func(cudf_df) - - -@pytest.mark.parametrize( - "func", - [ - lambda x: np.unique(x), - ], -) -def test_array_func_cudf_index(func, rng): - np_ar = rng.random(100) - cudf_index = cudf.Index(cudf.Series(np_ar)) - expect = func(np_ar) - got = func(cudf_index) - if np.isscalar(expect): - assert_eq(expect, got) - else: - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", - [ - lambda x: np.cov(x, x), - lambda x: np.linalg.norm(x), - lambda x: np.linalg.det(x), - ], -) -def test_array_func_missing_cudf_index(func, rng): - np_ar = rng.random(100) - cudf_index = cudf.Index(cudf.Series(np_ar)) - with pytest.raises(TypeError): - func(cudf_index) - - -@pytest.mark.parametrize( - "func", - [ - lambda x: np.cov(x, x), - lambda x: np.dot(x, x), - lambda x: np.linalg.norm(x), - lambda x: np.linalg.det(x), - ], -) -def test_array_func_missing_cudf_multi_index(func): - levels = [["a", "b"], ["c", "d"]] - codes = [[0, 1], [1, 0]] - - cudf_multi_index = cudf.MultiIndex(levels, codes) - with pytest.raises(TypeError): - func(cudf_multi_index) - - -def test_list_input_array_func(): - ar = np.array([1, 2, 3]) - - s = cudf.Series(ar) - with pytest.raises(TypeError): - np.concatenate([s, s, s]) - - s = cudf.Series(ar, index=[1, 2, 3]) - with pytest.raises(TypeError): - np.concatenate([s, s, s]) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py deleted file mode 100644 index abc3c105320..00000000000 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ /dev/null @@ -1,486 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
- -import operator -import warnings -from contextlib import contextmanager -from functools import reduce - -import cupy as cp -import numpy as np -import pytest -from packaging.version import parse - -import cudf -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_LT_300, - PANDAS_VERSION, -) -from cudf.testing import assert_eq -from cudf.testing._utils import expect_warning_if, set_random_null_mask_inplace - - -@pytest.fixture( - params=[ - obj - for obj in (getattr(np, name) for name in dir(np)) - if isinstance(obj, np.ufunc) - ] -) -def ufunc(request): - return request.param - - -@contextmanager -def _hide_ufunc_warnings(ufunc): - # pandas raises warnings for some inputs to the following ufuncs: - name = ufunc.__name__ - if name in { - "arccos", - "arccosh", - "arcsin", - "arctanh", - "fmod", - "log", - "log10", - "log2", - "reciprocal", - }: - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - f"invalid value encountered in {name}", - category=RuntimeWarning, - ) - warnings.filterwarnings( - "ignore", - f"divide by zero encountered in {name}", - category=RuntimeWarning, - ) - yield - elif name in { - "bitwise_and", - "bitwise_or", - "bitwise_xor", - }: - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Operation between non boolean Series with different " - "indexes will no longer return a boolean result in " - "a future version. Cast both Series to object type " - "to maintain the prior behavior.", - category=FutureWarning, - ) - yield - else: - yield - - -def test_ufunc_index(request, ufunc): - # Note: This test assumes that all ufuncs are unary or binary. - fname = ufunc.__name__ - request.applymarker( - pytest.mark.xfail( - condition=not hasattr(cp, fname), - reason=f"cupy has no support for '{fname}'", - ) - ) - request.applymarker( - pytest.mark.xfail( - condition=fname == "matmul" and PANDAS_LT_300, - reason="Fixed by https://github.com/pandas-dev/pandas/pull/57079", - ) - ) - request.applymarker( - pytest.mark.xfail( - condition=fname in {"ceil", "floor", "trunc"} - and parse(np.__version__) >= parse("2.1") - and parse(cp.__version__) < parse("14"), - reason="https://github.com/cupy/cupy/issues/9018", - ) - ) - - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. - pandas_args = args = [ - cudf.Index( - cp.random.randint(low=1, high=10, size=N), - ) - for _ in range(ufunc.nin) - ] - - got = ufunc(*args) - - with _hide_ufunc_warnings(ufunc): - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) - - if ufunc.nout > 1: - for g, e in zip(got, expect, strict=True): - assert_eq(g, e, check_exact=False) - else: - assert_eq(got, expect, check_exact=False) - - -@pytest.mark.parametrize( - "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] -) -@pytest.mark.parametrize("reflect", [True, False]) -def test_binary_ufunc_index_array(ufunc, reflect): - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. 
- args = [cudf.Index(cp.random.rand(N)) for _ in range(ufunc.nin)] - - arg1 = args[1].to_cupy() - - if reflect: - got = ufunc(arg1, args[0]) - expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) - else: - got = ufunc(args[0], arg1) - expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) - - if ufunc.nout > 1: - for g, e in zip(got, expect, strict=True): - if reflect: - assert (cp.asnumpy(g) == e).all() - else: - assert_eq(g, e, check_exact=False) - else: - if reflect: - assert (cp.asnumpy(got) == expect).all() - else: - assert_eq(got, expect, check_exact=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("has_nulls", [True, False]) -@pytest.mark.parametrize("indexed", [True, False]) -def test_ufunc_series(request, ufunc, has_nulls, indexed): - # Note: This test assumes that all ufuncs are unary or binary. - fname = ufunc.__name__ - request.applymarker( - pytest.mark.xfail( - condition=( - indexed - and fname - in { - "greater", - "greater_equal", - "less", - "less_equal", - "not_equal", - "equal", - } - ), - reason="Comparison operators do not support misaligned indexes.", - ) - ) - request.applymarker( - pytest.mark.xfail( - condition=ufunc == np.matmul and has_nulls, - reason="Can't call cupy on column with nulls", - ) - ) - # If we don't have explicit dispatch and cupy doesn't support the operator, - # we expect a failure - request.applymarker( - pytest.mark.xfail( - condition=not hasattr(cp, fname), - reason=f"cupy has no support for '{fname}'", - ) - ) - - request.applymarker( - pytest.mark.xfail( - condition=fname.startswith("bitwise") and indexed and has_nulls, - reason="https://github.com/pandas-dev/pandas/issues/52500", - ) - ) - - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. - pandas_args = args = [ - cudf.Series( - cp.random.randint(low=1, high=10, size=N), - index=cp.random.choice(range(N), N, False) if indexed else None, - ) - for _ in range(ufunc.nin) - ] - - if has_nulls: - # Converting nullable integer cudf.Series to pandas will produce a - # float pd.Series, so instead we replace nulls with an arbitrary - # integer value, precompute the mask, and then reapply it afterwards. - for arg in args: - set_random_null_mask_inplace(arg) - pandas_args = [arg.fillna(0) for arg in args] - - # Note: Different indexes must be aligned before the mask is computed. - # This requires using an internal function (_align_indices), and that - # is unlikely to change for the foreseeable future. 
- aligned = ( - cudf.core.series._align_indices(args, allow_non_unique=True) - if indexed and ufunc.nin == 2 - else args - ) - mask = reduce(operator.or_, (a.isna() for a in aligned)).to_pandas() - - got = ufunc(*args) - - with _hide_ufunc_warnings(ufunc): - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) - - if ufunc.nout > 1: - for g, e in zip(got, expect, strict=True): - if has_nulls: - e[mask] = np.nan - assert_eq(g, e, check_exact=False) - else: - if has_nulls: - with expect_warning_if( - fname - in ( - "isfinite", - "isinf", - "isnan", - "logical_and", - "logical_not", - "logical_or", - "logical_xor", - "signbit", - "equal", - "greater", - "greater_equal", - "less", - "less_equal", - "not_equal", - ) - ): - expect[mask] = np.nan - assert_eq(got, expect, check_exact=False) - - -@pytest.mark.parametrize( - "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] -) -@pytest.mark.parametrize("has_nulls", [True, False]) -@pytest.mark.parametrize("indexed", [True, False]) -@pytest.mark.parametrize("reflect", [True, False]) -def test_binary_ufunc_series_array( - request, ufunc, has_nulls, indexed, reflect -): - fname = ufunc.__name__ - request.applymarker( - pytest.mark.xfail( - condition=reflect and has_nulls, - reason=( - "When cupy is the left operand there is no way for us to " - "avoid calling its binary operators, which cannot handle " - "cudf objects that contain nulls." - ), - ) - ) - # The way cudf casts nans in arrays to nulls during binops with cudf - # objects is currently incompatible with pandas. - request.applymarker( - pytest.mark.xfail( - condition=( - fname in {"greater", "greater_equal", "logical_and"} - and has_nulls - ), - reason=( - "cudf and pandas incompatible casting nans to nulls in binops" - ), - ) - ) - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. - args = [ - cudf.Series( - cp.random.rand(N), - index=cp.random.choice(range(N), N, False) if indexed else None, - ) - for _ in range(ufunc.nin) - ] - - if has_nulls: - # Converting nullable integer cudf.Series to pandas will produce a - # float pd.Series, so instead we replace nulls with an arbitrary - # integer value, precompute the mask, and then reapply it afterwards. - for arg in args: - set_random_null_mask_inplace(arg) - - # Cupy doesn't support nulls, so we fill with nans before converting. 
- args[1] = args[1].fillna(cp.nan) - mask = args[0].isna().to_pandas() - - arg1 = args[1].to_cupy() - - if reflect: - got = ufunc(arg1, args[0]) - expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) - else: - got = ufunc(args[0], arg1) - expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) - - if ufunc.nout > 1: - for g, e in zip(got, expect, strict=True): - if has_nulls: - e[mask] = np.nan - if reflect: - assert (cp.asnumpy(g) == e).all() - else: - assert_eq(g, e, check_exact=False) - else: - if has_nulls: - expect[mask] = np.nan - if reflect: - assert (cp.asnumpy(got) == expect).all() - else: - assert_eq(got, expect, check_exact=False) - - -@pytest.mark.parametrize( - "func", - [np.add], -) -def test_ufunc_cudf_series_error_with_out_kwarg(func): - cudf_s1 = cudf.Series(data=[-1, 2, 3, 0]) - cudf_s2 = cudf.Series(data=[-1, 2, 3, 0]) - cudf_s3 = cudf.Series(data=[0, 0, 0, 0]) - # this throws a value-error because of presence of out kwarg - with pytest.raises(TypeError): - func(x1=cudf_s1, x2=cudf_s2, out=cudf_s3) - - -# Skip matmul since it requires aligned shapes. -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("has_nulls", [True, False]) -@pytest.mark.parametrize("indexed", [True, False]) -def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): - # Note: This test assumes that all ufuncs are unary or binary. - fname = ufunc.__name__ - request.applymarker( - pytest.mark.xfail( - condition=( - indexed - and fname - in { - "greater", - "greater_equal", - "less", - "less_equal", - "not_equal", - "equal", - } - ), - reason="Comparison operators do not support misaligned indexes.", - ) - ) - # If we don't have explicit dispatch and cupy doesn't support the operator, - # we expect a failure - request.applymarker( - pytest.mark.xfail( - condition=not hasattr(cp, fname), - reason=f"cupy has no support for '{fname}'", - ) - ) - request.applymarker( - pytest.mark.xfail( - condition=fname in {"ceil", "floor", "trunc"} - and not has_nulls - and parse(np.__version__) >= parse("2.1") - and parse(cp.__version__) < parse("14"), - reason="https://github.com/cupy/cupy/issues/9018", - ) - ) - request.applymarker( - pytest.mark.xfail( - condition=fname == "matmul", - reason=f"{fname} is not supported in cuDF", - ) - ) - - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. - # TODO: Add tests of mismatched columns etc. - pandas_args = args = [ - cudf.DataFrame( - {"foo": cp.random.randint(low=1, high=10, size=N)}, - index=cp.random.choice(range(N), N, False) if indexed else None, - ) - for _ in range(ufunc.nin) - ] - - if has_nulls: - # Converting nullable integer cudf.Series to pandas will produce a - # float pd.Series, so instead we replace nulls with an arbitrary - # integer value, precompute the mask, and then reapply it afterwards. - for arg in args: - set_random_null_mask_inplace(arg["foo"]) - pandas_args = [arg.copy() for arg in args] - for arg in pandas_args: - arg["foo"] = arg["foo"].fillna(0) - - # Note: Different indexes must be aligned before the mask is computed. - # This requires using an internal function (_align_indices), and that - # is unlikely to change for the foreseeable future. 
- aligned = ( - cudf.core.dataframe._align_indices(*args) - if indexed and ufunc.nin == 2 - else args - ) - mask = reduce( - operator.or_, (a["foo"].isna() for a in aligned) - ).to_pandas() - - got = ufunc(*args) - - with _hide_ufunc_warnings(ufunc): - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) - - if ufunc.nout > 1: - for g, e in zip(got, expect, strict=True): - if has_nulls: - e[mask] = np.nan - assert_eq(g, e, check_exact=False) - else: - if has_nulls: - with expect_warning_if( - fname - in ( - "isfinite", - "isinf", - "isnan", - "logical_and", - "logical_not", - "logical_or", - "logical_xor", - "signbit", - "equal", - "greater", - "greater_equal", - "less", - "less_equal", - "not_equal", - ) - ): - expect[mask] = np.nan - assert_eq(got, expect, check_exact=False) From fa67af0f96e5fced188294b621087a354b90f6ac Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Aug 2025 15:42:38 -0700 Subject: [PATCH 123/366] Move ~half of test_groupby.py to new cudf classic test directory structure (#19640) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19640 --- python/cudf/cudf/testing/__init__.py | 1 + python/cudf/cudf/testing/testing.py | 24 + python/cudf/cudf/tests/groupby/conftest.py | 14 + python/cudf/cudf/tests/groupby/test_agg.py | 575 +++- python/cudf/cudf/tests/groupby/test_apply.py | 141 + .../cudf/tests/groupby/test_attributes.py | 126 + .../cudf/tests/groupby/test_computation.py | 18 - .../cudf/tests/groupby/test_constructors.py | 16 + .../cudf/tests/groupby/test_cummulative.py | 95 + python/cudf/cudf/tests/groupby/test_diff.py | 194 ++ python/cudf/cudf/tests/groupby/test_fillna.py | 191 ++ .../{test_indexing.py => test_get_group.py} | 0 .../cudf/tests/groupby/test_groupby_obj.py | 15 - python/cudf/cudf/tests/groupby/test_nth.py | 25 + .../cudf/cudf/tests/groupby/test_nunique.py | 47 + .../groupby/test_ordering_pandas_compat.py | 28 - python/cudf/cudf/tests/groupby/test_pipe.py | 15 + python/cudf/cudf/tests/groupby/test_rank.py | 59 + .../cudf/tests/groupby/test_reductions.py | 678 +++++ python/cudf/cudf/tests/groupby/test_shift.py | 204 ++ python/cudf/cudf/tests/groupby/test_unique.py | 31 + python/cudf/cudf/tests/test_groupby.py | 2422 +---------------- 22 files changed, 2530 insertions(+), 2389 deletions(-) create mode 100644 python/cudf/cudf/tests/groupby/conftest.py create mode 100644 python/cudf/cudf/tests/groupby/test_apply.py create mode 100644 python/cudf/cudf/tests/groupby/test_attributes.py delete mode 100644 python/cudf/cudf/tests/groupby/test_computation.py create mode 100644 python/cudf/cudf/tests/groupby/test_constructors.py create mode 100644 python/cudf/cudf/tests/groupby/test_cummulative.py create mode 100644 python/cudf/cudf/tests/groupby/test_diff.py create mode 100644 python/cudf/cudf/tests/groupby/test_fillna.py rename python/cudf/cudf/tests/groupby/{test_indexing.py => test_get_group.py} (100%) delete mode 100644 python/cudf/cudf/tests/groupby/test_groupby_obj.py create mode 100644 python/cudf/cudf/tests/groupby/test_nth.py create mode 100644 python/cudf/cudf/tests/groupby/test_nunique.py delete mode 100644 python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py create mode 100644 python/cudf/cudf/tests/groupby/test_pipe.py create mode 100644 
python/cudf/cudf/tests/groupby/test_rank.py
 create mode 100644 python/cudf/cudf/tests/groupby/test_reductions.py
 create mode 100644 python/cudf/cudf/tests/groupby/test_shift.py
 create mode 100644 python/cudf/cudf/tests/groupby/test_unique.py

diff --git a/python/cudf/cudf/testing/__init__.py b/python/cudf/cudf/testing/__init__.py
index a4afa54f754..b03e5bf4375 100644
--- a/python/cudf/cudf/testing/__init__.py
+++ b/python/cudf/cudf/testing/__init__.py
@@ -4,6 +4,7 @@ from cudf.testing.testing import (
     assert_eq,
     assert_frame_equal,
+    assert_groupby_results_equal,
     assert_index_equal,
     assert_neq,
     assert_series_equal,
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index 190011f614e..048a54a76f8 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -809,3 +809,27 @@ def assert_neq(left, right, **kwargs):
         pass
     else:
         raise AssertionError
+
+
+def assert_groupby_results_equal(
+    expect, got, sort=True, as_index=True, by=None, **kwargs
+):
+    # Because we don't sort by index by default in groupby,
+    # sort expect and got by index before comparing.
+    if sort:
+        if as_index:
+            expect = expect.sort_index()
+            got = got.sort_index()
+        else:
+            assert by is not None
+            if isinstance(expect, (pd.DataFrame, cudf.DataFrame)):
+                expect = expect.sort_values(by=by).reset_index(drop=True)
+            else:
+                expect = expect.sort_values().reset_index(drop=True)
+
+            if isinstance(got, cudf.DataFrame):
+                got = got.sort_values(by=by).reset_index(drop=True)
+            else:
+                got = got.sort_values().reset_index(drop=True)
+
+    assert_eq(expect, got, **kwargs)
diff --git a/python/cudf/cudf/tests/groupby/conftest.py b/python/cudf/cudf/tests/groupby/conftest.py
new file mode 100644
index 00000000000..db95a7b3b8e
--- /dev/null
+++ b/python/cudf/cudf/tests/groupby/conftest.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+import pytest
+
+
+@pytest.fixture(params=[True, False])
+def as_index(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=["min", "max", "idxmin", "idxmax", "count", "sum", "prod", "mean"]
+)
+def groupby_reduction_methods(request):
+    return request.param
diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py
index dc20a27177a..42c74f967bb 100644
--- a/python/cudf/cudf/tests/groupby/test_agg.py
+++ b/python/cudf/cudf/tests/groupby/test_agg.py
@@ -1,16 +1,20 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
+import decimal +import itertools + import numpy as np +import pandas as pd import pytest import cudf -from cudf.testing import assert_eq +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_eq, assert_groupby_results_equal -@pytest.mark.parametrize( - "empty", - [True, False], - ids=["empty", "nonempty"], -) +@pytest.mark.parametrize("empty", [True, False]) def test_agg_count_dtype(empty): df = cudf.DataFrame({"a": [1, 2, 1], "c": ["a", "b", "c"]}) if empty: @@ -72,3 +76,560 @@ def test_dataframe_agg_with_invalid_kwarg(): with pytest.raises(TypeError, match="Invalid keyword argument"): df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) df.groupby("a").agg(foo=set()) + + +@pytest.mark.parametrize("with_nulls", [False, True]) +def test_groupby_agg_maintain_order_random(with_nulls): + nrows = 20 + nkeys = 3 + rng = np.random.default_rng(seed=0) + key_names = [f"key{key}" for key in range(nkeys)] + key_values = [rng.integers(100, size=nrows) for _ in key_names] + value = rng.integers(-100, 100, size=nrows) + df = cudf.DataFrame( + dict(zip(key_names, key_values, strict=True), value=value) + ) + if with_nulls: + for key in key_names: + df.loc[df[key] == 1, key] = None + with cudf.option_context("mode.pandas_compatible", True): + got = df.groupby(key_names, sort=False).agg({"value": "sum"}) + expect = ( + df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"}) + ) + assert_eq(expect, got, check_index_type=not with_nulls) + + +def test_groupby_agg_mean_min(): + pdf = pd.DataFrame(np.ones((20, 3)), columns=["x", "y", "val"]) + gdf = cudf.DataFrame(pdf) + got_df = gdf.groupby(["x", "y"]).agg(["mean", "min"]) + expect_df = pdf.groupby(["x", "y"]).agg(["mean", "min"]) + assert_groupby_results_equal(got_df, expect_df) + + +def test_groupby_agg_min_max_dictargs(): + pdf = pd.DataFrame(np.ones((20, 5)), columns=["x", "y", "val", "a", "b"]) + gdf = cudf.DataFrame(pdf) + expect_df = pdf.groupby(["x", "y"]).agg({"a": "min", "b": "max"}) + got_df = gdf.groupby(["x", "y"]).agg({"a": "min", "b": "max"}) + assert_groupby_results_equal(expect_df, got_df) + + +def test_groupby_agg_min_max_dictlist(): + pdf = pd.DataFrame(np.ones((20, 5)), columns=["x", "y", "val", "a", "b"]) + gdf = cudf.DataFrame(pdf) + expect_df = pdf.groupby(["x", "y"]).agg( + {"a": ["min", "max"], "b": ["min", "max"]} + ) + got_df = gdf.groupby(["x", "y"]).agg( + {"a": ["min", "max"], "b": ["min", "max"]} + ) + assert_groupby_results_equal(got_df, expect_df) + + +def test_groupby_as_index_single_agg(as_index): + pdf = pd.DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) + gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"}) + pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"}) + assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") + + +def test_groupby_default(): + pdf = pd.DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) + gdf = gdf.groupby("y").agg({"x": "mean"}) + pdf = pdf.groupby("y").agg({"x": "mean"}) + assert_groupby_results_equal(pdf, gdf) + + +def test_groupby_as_index_multiindex(as_index): + pdf = pd.DataFrame( + {"a": [1, 2, 1], "b": [3, 3, 3], "c": [2, 2, 3], "d": [3, 1, 2]} + ) + gdf = cudf.from_pandas(pdf) + + gdf = gdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( + {"c": "mean"} + ) + pdf = pdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( + {"c": "mean"} + ) + + if as_index: + 
assert_eq(pdf, gdf) + else: + # column names don't match - check just the values + for gcol, pcol in zip(gdf, pdf, strict=True): + np.testing.assert_array_equal( + gdf[gcol].to_numpy(), pdf[pcol].values + ) + + +@pytest.mark.parametrize( + "func", + [ + "mean", + "std", + "var", + "min", + "max", + "idxmin", + "idxmax", + "count", + "sum", + "prod", + ], +) +def test_groupby_2keys_agg(func): + # gdf (Note: lack of multiIndex) + nelem = 20 + pdf = pd.DataFrame(np.ones((nelem, 2)), columns=["x", "y"]) + gdf = cudf.DataFrame(pdf) + expect_df = pdf.groupby(["x", "y"]).agg(func) + got_df = gdf.groupby(["x", "y"]).agg(func) + + assert_groupby_results_equal(got_df, expect_df) + + +def test_series_groupby_agg(groupby_reduction_methods): + s = pd.Series([1, 2, 3]) + g = cudf.Series([1, 2, 3]) + sg = s.groupby(s // 2).agg(groupby_reduction_methods) + gg = g.groupby(g // 2).agg(groupby_reduction_methods) + assert_groupby_results_equal(sg, gg) + + +def test_groupby_agg_decimal(groupby_reduction_methods, request): + request.applymarker( + pytest.mark.xfail( + groupby_reduction_methods in ["prod", "mean"], + raises=pd.errors.DataError, + reason=f"{groupby_reduction_methods} not supported with Decimals in pandas", + ) + ) + request.applymarker( + pytest.mark.xfail( + groupby_reduction_methods in ["idxmax", "idxmin"] + and PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason=f"{groupby_reduction_methods} not supported with Decimals in an older version of pandas", + ) + ) + rng = np.random.default_rng(seed=0) + num_groups = 4 + nelem_per_group = 10 + # The number of digits after the decimal to use. + decimal_digits = 2 + # The number of digits before the decimal to use. + whole_digits = 2 + + scale = 10**whole_digits + nelem = num_groups * nelem_per_group + + # The unique is necessary because otherwise if there are duplicates idxmin + # and idxmax may return different results than pandas (see + # https://github.com/rapidsai/cudf/issues/7756). This is not relevant to + # the current version of the test, because idxmin and idxmax simply don't + # work with pandas Series composed of Decimal objects (see + # https://github.com/pandas-dev/pandas/issues/40685). However, if that is + # ever enabled, then this issue will crop up again so we may as well have + # it fixed now. + x = np.unique((rng.random(nelem) * scale).round(decimal_digits)) + y = np.unique((rng.random(nelem) * scale).round(decimal_digits)) + + if x.size < y.size: + total_elements = x.size + y = y[: x.size] + else: + total_elements = y.size + x = x[: y.size] + + # Note that this filtering can lead to one group with fewer elements, but + # that shouldn't be a problem and is probably useful to test. 
+ idx_col = np.tile(np.arange(num_groups), nelem_per_group)[:total_elements] + + decimal_x = pd.Series([decimal.Decimal(str(d)) for d in x]) + decimal_y = pd.Series([decimal.Decimal(str(d)) for d in y]) + + pdf = pd.DataFrame({"idx": idx_col, "x": decimal_x, "y": decimal_y}) + gdf = cudf.DataFrame( + { + "idx": idx_col, + "x": cudf.Series(decimal_x), + "y": cudf.Series(decimal_y), + } + ) + + expect_df = pdf.groupby("idx", sort=True).agg(groupby_reduction_methods) + got_df = gdf.groupby("idx", sort=True).agg(groupby_reduction_methods) + assert_eq(expect_df["x"], got_df["x"], check_dtype=False) + assert_eq(expect_df["y"], got_df["y"], check_dtype=False) + + +def test_groupby_use_agg_column_as_index(): + pdf = pd.DataFrame({"a": [1, 1, 1, 3, 5]}) + gdf = cudf.DataFrame({"a": [1, 1, 1, 3, 5]}) + gdf["a"] = [1, 1, 1, 3, 5] + pdg = pdf.groupby("a").agg({"a": "count"}) + gdg = gdf.groupby("a").agg({"a": "count"}) + assert_groupby_results_equal(pdg, gdg, check_dtype=False) + + +def test_groupby_list_then_string(): + gdf = cudf.DataFrame( + {"a": [0, 1, 0, 1, 2], "b": [11, 2, 15, 12, 2], "c": [6, 7, 6, 7, 6]} + ) + pdf = gdf.to_pandas() + gdg = gdf.groupby("a", as_index=True).agg( + {"b": ["min", "max"], "c": "max"} + ) + pdg = pdf.groupby("a", as_index=True).agg( + {"b": ["min", "max"], "c": "max"} + ) + assert_groupby_results_equal(gdg, pdg) + + +def test_groupby_different_unequal_length_column_aggregations(): + gdf = cudf.DataFrame( + {"a": [0, 1, 0, 1, 2], "b": [11, 2, 15, 12, 2], "c": [6, 7, 6, 7, 6]} + ) + pdf = gdf.to_pandas() + gdg = gdf.groupby("a", as_index=True).agg( + {"b": "min", "c": ["max", "min"]} + ) + pdg = pdf.groupby("a", as_index=True).agg( + {"b": "min", "c": ["max", "min"]} + ) + assert_groupby_results_equal(pdg, gdg) + + +def test_groupby_single_var_two_aggs(): + gdf = cudf.DataFrame( + {"a": [0, 1, 0, 1, 2], "b": [11, 2, 15, 12, 2], "c": [6, 7, 6, 7, 6]} + ) + pdf = gdf.to_pandas() + gdg = gdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) + pdg = pdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) + assert_groupby_results_equal(pdg, gdg) + + +def test_groupby_double_var_two_aggs(): + gdf = cudf.DataFrame( + {"a": [0, 1, 0, 1, 2], "b": [11, 2, 15, 12, 2], "c": [6, 7, 6, 7, 6]} + ) + pdf = gdf.to_pandas() + gdg = gdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) + pdg = pdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) + assert_groupby_results_equal(pdg, gdg) + + +def test_groupby_multi_agg_single_groupby_series(): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + "x": rng.integers(0, 5, size=100), + "y": rng.normal(size=100), + } + ) + gdf = cudf.from_pandas(pdf) + pdg = pdf.groupby("x").y.agg(["sum", "max"]) + gdg = gdf.groupby("x").y.agg(["sum", "max"]) + + assert_groupby_results_equal(pdg, gdg) + + +def test_groupby_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + "a": rng.integers(0, 5, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), + } + ) + gdf = cudf.from_pandas(pdf) + pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) + gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) + assert_groupby_results_equal(pdg, gdg) + + +def test_groupby_datetime_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + "a": pd.date_range( + "2020-01-01", + freq="D", + periods=10, + ), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), + } + ) + gdf = 
cudf.from_pandas(pdf)
+    pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"])
+    gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"])
+
+    assert_groupby_results_equal(pdg, gdg)
+
+
+@pytest.mark.parametrize(
+    "agg",
+    [
+        ["min", "max", "count", "mean"],
+        ["mean", "var", "std"],
+        ["count", "mean", "var", "std"],
+    ],
+)
+def test_groupby_multi_agg_hash_groupby(agg):
+    gdf = cudf.DataFrame(
+        {"id": [0, 0, 1, 1, 2, 2, 0], "a": [0, 1, 2, 3, 4, 5, 6]}
+    )
+    pdf = gdf.to_pandas()
+    check_dtype = "count" not in agg
+    pdg = pdf.groupby("id").agg(agg)
+    gdg = gdf.groupby("id").agg(agg)
+    assert_groupby_results_equal(pdg, gdg, check_dtype=check_dtype)
+
+
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="previous version of pandas throws a warning",
+)
+def test_groupby_nulls_basic(groupby_reduction_methods, request):
+    pdf = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": [1, 2, 1, 2, 1, None]})
+    gdf = cudf.from_pandas(pdf)
+    assert_groupby_results_equal(
+        getattr(pdf.groupby("a"), groupby_reduction_methods)(),
+        getattr(gdf.groupby("a"), groupby_reduction_methods)(),
+    )
+
+    pdf = pd.DataFrame(
+        {
+            "a": [0, 0, 1, 1, 2, 2],
+            "b": [1, 2, 1, 2, 1, None],
+            "c": [1, 2, 1, None, 1, 2],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+    assert_groupby_results_equal(
+        getattr(pdf.groupby("a"), groupby_reduction_methods)(),
+        getattr(gdf.groupby("a"), groupby_reduction_methods)(),
+    )
+
+    pdf = pd.DataFrame(
+        {
+            "a": [0, 0, 1, 1, 2, 2],
+            "b": [1, 2, 1, 2, 1, None],
+            "c": [1, 2, None, None, 1, 2],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    request.applymarker(
+        pytest.mark.xfail(
+            groupby_reduction_methods in ["prod", "sum"],
+            reason="cuDF returns NaN instead of an actual value",
+        )
+    )
+    assert_groupby_results_equal(
+        getattr(pdf.groupby("a"), groupby_reduction_methods)(),
+        getattr(gdf.groupby("a"), groupby_reduction_methods)(),
+    )
+
+
+@pytest.mark.parametrize("agg", [lambda x: x.count(), "count"])
+@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]])
+def test_groupby_count(agg, by):
+    pdf = pd.DataFrame(
+        {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]}
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    expect = pdf.groupby(by).agg(agg)
+    got = gdf.groupby(by).agg(agg)
+
+    assert_groupby_results_equal(expect, got, check_dtype=True)
+
+
+@pytest.mark.parametrize("agg", [lambda x: x.median(), "median"])
+@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]])
+def test_groupby_median(agg, by):
+    pdf = pd.DataFrame(
+        {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]}
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    expect = pdf.groupby(by).agg(agg)
+    got = gdf.groupby(by).agg(agg)
+
+    assert_groupby_results_equal(expect, got, check_dtype=False)
+
+
+def test_multi_agg():
+    gdf = cudf.DataFrame(
+        {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]}
+    )
+    pdf = gdf.to_pandas()
+    assert_groupby_results_equal(
+        pdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
+        gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
+    )
+
+
+@pytest.mark.parametrize(
+    "agg",
+    (
+        [
+            *itertools.combinations(["count", "max", "min", "nunique"], 2),
+            {"b": "min", "c": "mean"},
+            {"b": "max", "c": "mean"},
+            {"b": "count", "c": "mean"},
+            {"b": "nunique", "c": "mean"},
+        ]
+    ),
+)
+def test_groupby_agg_combinations(agg):
+    pdf = pd.DataFrame(
+        {
+            "a": [1, 1, 2, 2, 3],
+            "b": ["a", "a", "b", "c", "d"],
+            "c": [1, 2, 3, 4, 5],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    assert_groupby_results_equal(
+        pdf.groupby("a").agg(agg),
+ gdf.groupby("a").agg(agg), + check_dtype=False, + ) + + +@pytest.mark.parametrize("list_agg", [list, "collect"]) +def test_groupby_list_simple(list_agg): + pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, None, 4, 5, 6]}) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby("a").agg({"b": list}), + gdf.groupby("a").agg({"b": list_agg}), + check_dtype=False, + ) + + +@pytest.mark.parametrize("list_agg", [list, "collect"]) +def test_groupby_list_of_lists(list_agg): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2], + "b": [[1, 2], [3, None, 5], None, [], [7, 8], [9]], + } + ) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby("a").agg({"b": list}), + gdf.groupby("a").agg({"b": list_agg}), + check_dtype=False, + ) + + +@pytest.mark.parametrize("list_agg", [list, "collect"]) +def test_groupby_list_of_structs(list_agg): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2], + "b": [ + {"c": "1", "d": 1}, + {"c": "2", "d": 2}, + {"c": "3", "d": 3}, + {"c": "4", "d": 4}, + {"c": "5", "d": 5}, + {"c": "6", "d": 6}, + ], + } + ) + gdf = cudf.from_pandas(pdf) + grouped = gdf.groupby("a").agg({"b": list_agg}) + assert_groupby_results_equal( + pdf.groupby("a").agg({"b": list}), + grouped, + check_dtype=True, + ) + assert grouped["b"].dtype.element_type == gdf["b"].dtype + + +@pytest.mark.parametrize("list_agg", [list, "collect"]) +def test_groupby_list_single_element(list_agg): + pdf = pd.DataFrame({"a": [1, 2], "b": [3, None]}) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby("a").agg({"b": list}), + gdf.groupby("a").agg({"b": list_agg}), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "agg", [list, [list, "count"], {"b": list, "c": "sum"}] +) +def test_groupby_list_strings(agg): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": ["b", "a", None, "e", "d"], + "c": [1, 2, 3, 4, 5], + } + ) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby("a").agg(agg), + gdf.groupby("a").agg(agg), + check_dtype=False, + ) + + +def test_groupby_list_columns_excluded(): + pdf = pd.DataFrame( + { + "a": [1, 1, 2, 2], + "b": [1, 2, 3, 4], + "c": [[1, 2], [3, 4], [5, 6], [7, 8]], + } + ) + gdf = cudf.from_pandas(pdf) + + pandas_result = pdf.groupby("a").mean(numeric_only=True) + pandas_agg_result = pdf.groupby("a").agg("mean", numeric_only=True) + + assert_groupby_results_equal( + pandas_result, + gdf.groupby("a").mean(numeric_only=True), + check_dtype=False, + ) + + assert_groupby_results_equal( + pandas_agg_result, + gdf.groupby("a").agg("mean"), + check_dtype=False, + ) + + +def test_groupby_mix_agg_scan(): + err_msg = "Cannot perform both aggregation and scan in one operation" + func = ["cumsum", "sum"] + gb = cudf.DataFrame(np.ones((10, 3)), columns=["x", "y", "z"]).groupby( + ["x", "y"], sort=True + ) + + gb.agg(func[0]) + gb.agg(func[1]) + gb.agg(func[1:]) + with pytest.raises(NotImplementedError, match=err_msg): + gb.agg(func) diff --git a/python/cudf/cudf/tests/groupby/test_apply.py b/python/cudf/cudf/tests/groupby/test_apply.py new file mode 100644 index 00000000000..1c340a34e2a --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_apply.py @@ -0,0 +1,141 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest +from numba import cuda + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_groupby_results_equal + + +@pytest.fixture(params=["cudf", "jit"]) +def engine(request): + return request.param + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) +def test_groupby_as_index_apply(as_index, engine): + pdf = pd.DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) + gdf = gdf.groupby("y", as_index=as_index).apply( + lambda df: df["x"].mean(), engine=engine + ) + kwargs = {"func": lambda df: df["x"].mean(), "include_groups": False} + pdf = pdf.groupby("y", as_index=as_index).apply(**kwargs) + assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply(): + rng = np.random.default_rng(seed=0) + nelem = 20 + df = cudf.DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + "val2": rng.random(nelem), + } + ) + + expect_grpby = df.to_pandas().groupby( + ["key1", "key2"], as_index=False, group_keys=False + ) + got_grpby = df.groupby(["key1", "key2"]) + + def foo(df): + df["out"] = df["val1"] + df["val2"] + return df + + expect = expect_grpby.apply(foo, include_groups=False) + got = got_grpby.apply(foo, include_groups=False) + assert_groupby_results_equal(expect, got) + + +def f1(df, k): + df["out"] = df["val1"] + df["val2"] + k + return df + + +def f2(df, k, L): + df["out"] = df["val1"] - df["val2"] + (k / L) + return df + + +def f3(df, k, L, m): + df["out"] = ((k * df["val1"]) + (L * df["val2"])) / m + return df + + +@pytest.mark.parametrize( + "func,args", [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_args(func, args): + rng = np.random.default_rng(seed=0) + nelem = 20 + df = cudf.DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + "val2": rng.random(nelem), + } + ) + + expect_grpby = df.to_pandas().groupby( + ["key1", "key2"], as_index=False, group_keys=False + ) + got_grpby = df.groupby(["key1", "key2"]) + expect = expect_grpby.apply(func, *args, include_groups=False) + got = got_grpby.apply(func, *args, include_groups=False) + assert_groupby_results_equal(expect, got) + + +def test_groupby_apply_grouped(): + df = cudf.DataFrame( + { + "key1": range(20), + "key2": range(20), + "val1": range(20), + "val2": range(20), + } + ) + + got_grpby = df.groupby(["key1", "key2"]) + + def foo(key1, val1, com1, com2): + for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x): + com1[i] = key1[i] * 10000 + val1[i] + com2[i] = i + + got = got_grpby.apply_grouped( + foo, + incols=["key1", "val1"], + outcols={"com1": np.float64, "com2": np.int32}, + tpb=8, + ) + + got = got.to_pandas() + + expect = df.copy() + expect["com1"] = (expect["key1"] * 10000 + expect["key1"]).astype( + np.float64 + ) + expect["com2"] = np.zeros(20, dtype=np.int32) + + assert_groupby_results_equal(expect, got) diff --git a/python/cudf/cudf/tests/groupby/test_attributes.py 
b/python/cudf/cudf/tests/groupby/test_attributes.py new file mode 100644 index 00000000000..16e6741229d --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_attributes.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION. +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_groups(): + # https://github.com/rapidsai/cudf/issues/14955 + df = cudf.DataFrame({"a": [1, 2] * 2}, index=[0] * 4) + agg = df.groupby("a") + pagg = df.to_pandas().groupby("a") + for key in agg.groups: + np.testing.assert_array_equal( + pagg.indices[key], agg.indices[key].get() + ) + assert_eq(pagg.get_group(key), agg.get_group(key)) + + +@pytest.mark.parametrize( + "by", + [ + "a", + "b", + ["a"], + ["b"], + ["a", "b"], + ["b", "a"], + np.array([0, 0, 0, 1, 1, 1, 2]), + ], +) +def test_groupby_groups(by): + pdf = pd.DataFrame( + {"a": [1, 2, 1, 2, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6, 7]} + ) + gdf = cudf.from_pandas(pdf) + + pdg = pdf.groupby(by) + gdg = gdf.groupby(by) + + for key in pdg.groups: + assert key in gdg.groups + assert_eq(pdg.groups[key], gdg.groups[key]) + + +@pytest.mark.parametrize( + "by", + [ + "a", + "b", + ["a"], + ["b"], + ["a", "b"], + ["b", "a"], + ["a", "c"], + ["a", "b", "c"], + ], +) +def test_groupby_groups_multi(by): + pdf = pd.DataFrame( + { + "a": [1, 2, 1, 2, 1, 2, 3], + "b": ["a", "b", "a", "b", "b", "c", "c"], + "c": [1, 2, 3, 4, 5, 6, 7], + } + ) + gdf = cudf.from_pandas(pdf) + + pdg = pdf.groupby(by) + gdg = gdf.groupby(by) + + for key in pdg.groups: + assert key in gdg.groups + assert_eq(pdg.groups[key], gdg.groups[key]) + + +def test_groupby_iterate_groups(): + rng = np.random.default_rng(seed=0) + nelem = 20 + df = cudf.DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + "val2": rng.random(nelem), + } + ) + + def assert_values_equal(arr): + np.testing.assert_array_equal(arr[0], arr) + + for name, grp in df.groupby(["key1", "key2"]): + pddf = grp.to_pandas() + for k in "key1,key2".split(","): + assert_values_equal(pddf[k].values) + + +@pytest.mark.parametrize( + "grouper", + [ + "a", + ["a"], + ["a", "b"], + np.array([0, 1, 1, 2, 3, 2]), + {0: "a", 1: "a", 2: "b", 3: "a", 4: "b", 5: "c"}, + lambda x: x + 1, + ["a", np.array([0, 1, 1, 2, 3, 2])], + ], +) +def test_grouping(grouper): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2, 3], + "b": [1, 2, 1, 2, 1, 2], + "c": [1, 2, 3, 4, 5, 6], + } + ) + gdf = cudf.from_pandas(pdf) + + for pdf_group, gdf_group in zip( + pdf.groupby(grouper), gdf.groupby(grouper), strict=True + ): + assert pdf_group[0] == gdf_group[0] + assert_eq(pdf_group[1], gdf_group[1]) diff --git a/python/cudf/cudf/tests/groupby/test_computation.py b/python/cudf/cudf/tests/groupby/test_computation.py deleted file mode 100644 index 630fcdc4dce..00000000000 --- a/python/cudf/cudf/tests/groupby/test_computation.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) -def test_rank_return_type_compatible_mode(method): - # in compatible mode, rank() always returns floats - pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5]}) - with cudf.option_context("mode.pandas_compatible", True): - df = cudf.from_pandas(pdf) - result = df.groupby("a").rank(method=method) - expect = pdf.groupby("a").rank(method=method) - assert_eq(expect, result) - assert result["b"].dtype == "float64" diff --git a/python/cudf/cudf/tests/groupby/test_constructors.py b/python/cudf/cudf/tests/groupby/test_constructors.py new file mode 100644 index 00000000000..503ec3d3500 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_constructors.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import pandas as pd +import pytest + +import cudf +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize("data", [{"a": [1, 2]}, {"a": [1, 2], "b": [2, 3]}]) +def test_groupby_nonempty_no_keys(data): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + assert_exceptions_equal( + lambda: pdf.groupby([]), + lambda: gdf.groupby([]), + ) diff --git a/python/cudf/cudf/tests/groupby/test_cummulative.py b/python/cudf/cudf/tests/groupby/test_cummulative.py new file mode 100644 index 00000000000..1eab8a1b317 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_cummulative.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_groupby_results_equal + + +@pytest.mark.parametrize("index", [None, [1, 2, 3, 4]]) +def test_groupby_cumcount(index): + pdf = pd.DataFrame( + { + "a": [1, 1, 3, 4], + "b": ["bob", "bob", "alice", "cooper"], + "c": [1, 2, 3, 4], + }, + index=index, + ) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby("a").cumcount(), + gdf.groupby("a").cumcount(), + check_dtype=False, + ) + + assert_groupby_results_equal( + pdf.groupby(["a", "b", "c"]).cumcount(), + gdf.groupby(["a", "b", "c"]).cumcount(), + check_dtype=False, + ) + + sr = pd.Series(range(len(pdf)), index=index) + assert_groupby_results_equal( + pdf.groupby(sr).cumcount(), + gdf.groupby(sr).cumcount(), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "func", ["cummin", "cummax", "cumcount", "cumsum", "cumprod"] +) +def test_groupby_2keys_scan(func): + nelem = 20 + pdf = pd.DataFrame(np.ones((nelem, 3)), columns=["x", "y", "val"]) + expect_df = pdf.groupby(["x", "y"], sort=True).agg(func) + gdf = cudf.from_pandas(pdf) + got_df = gdf.groupby(["x", "y"], sort=True).agg(func) + # pd.groupby.cumcount returns a series. 
+ if isinstance(expect_df, pd.Series): + expect_df = expect_df.to_frame("val") + + assert_groupby_results_equal(got_df, expect_df) + + expect_df = getattr(pdf.groupby(["x", "y"], sort=True), func)() + got_df = getattr(gdf.groupby(["x", "y"], sort=True), func)() + assert_groupby_results_equal(got_df, expect_df) + + expect_df = getattr(pdf.groupby(["x", "y"], sort=True)[["x"]], func)() + got_df = getattr(gdf.groupby(["x", "y"], sort=True)[["x"]], func)() + assert_groupby_results_equal(got_df, expect_df) + + expect_df = getattr(pdf.groupby(["x", "y"], sort=True)["y"], func)() + got_df = getattr(gdf.groupby(["x", "y"], sort=True)["y"], func)() + assert_groupby_results_equal(got_df, expect_df) + + +@pytest.mark.parametrize( + "with_nan", [False, True], ids=["just-NA", "also-NaN"] +) +@pytest.mark.parametrize( + "duplicate_index", [False, True], ids=["rangeindex", "dupindex"] +) +def test_groupby_scan_null_keys(with_nan, dropna, duplicate_index): + key_col = [None, 1, 2, None, 3, None, 3, 1, None, 1] + if with_nan: + df = pd.DataFrame( + {"key": pd.Series(key_col, dtype="float32"), "value": range(10)} + ) + else: + df = pd.DataFrame( + {"key": pd.Series(key_col, dtype="Int32"), "value": range(10)} + ) + + if duplicate_index: + # Non-default index with duplicates + df.index = [1, 2, 3, 1, 3, 2, 4, 1, 6, 10] + + cdf = cudf.from_pandas(df) + + expect = df.groupby("key", dropna=dropna).cumsum() + got = cdf.groupby("key", dropna=dropna).cumsum() + assert_groupby_results_equal(expect, got) diff --git a/python/cudf/cudf/tests/groupby/test_diff.py b/python/cudf/cudf/tests/groupby/test_diff.py new file mode 100644 index 00000000000..a9e60e0f4b5 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_diff.py @@ -0,0 +1,194 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
+ + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_groupby_results_equal +from cudf.testing.dataset_generator import rand_dataframe + + +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +def test_groupby_diff_row(shift_perc, direction): + nelem = 20 + pdf = pd.DataFrame(np.ones((nelem, 4)), columns=["x", "y", "val", "val2"]) + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["x", "y"]).diff(periods=n_shift) + got = gdf.groupby(["x", "y"]).diff(periods=n_shift) + + assert_groupby_results_equal(expected, got) + + +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +def test_groupby_diff_row_mixed_numerics(shift_perc, direction): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["0"]).diff(periods=n_shift) + got = gdf.groupby(["0"]).diff(periods=n_shift) + + assert_groupby_results_equal(expected, got) + + +def test_groupby_diff_row_zero_shift(): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + gdf = cudf.from_pandas(t.to_pandas()) + + expected = gdf + got = gdf.groupby(["0"]).shift(periods=0) + + assert_groupby_results_equal( + expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] + ) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +def test_groupby_fillna_multi_value(): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ms]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + key_col = "0" + value_cols = ["1", "2", "3", "4", "5", "6"] + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + + # fill the dataframe with the first non-null item in the column + fill_values = { + name: pdf[name].loc[pdf[name].first_valid_index()] + for name in value_cols + } + # cudf can't fillna with a pandas.Timedelta type + 
fill_values["4"] = fill_values["4"].to_numpy() + with pytest.warns(FutureWarning): + expect = pdf.groupby(key_col).fillna(value=fill_values) + with pytest.warns(FutureWarning): + got = gdf.groupby(key_col).fillna(value=fill_values) + + assert_groupby_results_equal(expect[value_cols], got[value_cols]) + + +# TODO: cudf.fillna does not support decimal column to column fill yet +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +def test_groupby_fillna_multi_value_df(): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ms]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + key_col = "0" + value_cols = ["1", "2", "3", "4", "5"] + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + + # fill the dataframe with the first non-null item in the column + fill_values = { + name: pdf[name].loc[pdf[name].first_valid_index()] + for name in value_cols + } + # cudf can't fillna with a pandas.Timedelta type + fill_values["4"] = fill_values["4"].to_numpy() + fill_values = pd.DataFrame(fill_values, index=pdf.index) + with pytest.warns(FutureWarning): + expect = pdf.groupby(key_col).fillna(value=fill_values) + + fill_values = cudf.from_pandas(fill_values) + with pytest.warns(FutureWarning): + got = gdf.groupby(key_col).fillna(value=fill_values) + + assert_groupby_results_equal(expect[value_cols], got[value_cols]) diff --git a/python/cudf/cudf/tests/groupby/test_fillna.py b/python/cudf/cudf/tests/groupby/test_fillna.py new file mode 100644 index 00000000000..c8f45818357 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_fillna.py @@ -0,0 +1,191 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
+ + +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_groupby_results_equal +from cudf.testing.dataset_generator import rand_dataframe + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +def test_groupby_fillna_multi_value(): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ms]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + key_col = "0" + value_cols = ["1", "2", "3", "4", "5", "6"] + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + + # fill the dataframe with the first non-null item in the column + fill_values = { + name: pdf[name].loc[pdf[name].first_valid_index()] + for name in value_cols + } + # cudf can't fillna with a pandas.Timedelta type + fill_values["4"] = fill_values["4"].to_numpy() + with pytest.warns(FutureWarning): + expect = pdf.groupby(key_col).fillna(value=fill_values) + with pytest.warns(FutureWarning): + got = gdf.groupby(key_col).fillna(value=fill_values) + + assert_groupby_results_equal(expect[value_cols], got[value_cols]) + + +# TODO: cudf.fillna does not support decimal column to column fill yet +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +def test_groupby_fillna_multi_value_df(): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ms]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + key_col = "0" + value_cols = ["1", "2", "3", "4", "5"] + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + + # fill the dataframe with the first non-null item in the column + fill_values = { + name: pdf[name].loc[pdf[name].first_valid_index()] + for name in value_cols + } + # cudf can't fillna with a pandas.Timedelta type + fill_values["4"] = fill_values["4"].to_numpy() + fill_values = pd.DataFrame(fill_values, index=pdf.index) + with pytest.warns(FutureWarning): + expect = pdf.groupby(key_col).fillna(value=fill_values) + + fill_values = cudf.from_pandas(fill_values) + with pytest.warns(FutureWarning): + got = gdf.groupby(key_col).fillna(value=fill_values) + + assert_groupby_results_equal(expect[value_cols], got[value_cols]) + + +@pytest.mark.parametrize( + "by", + [pd.Series([1, 1, 2, 2, 3, 4]), lambda x: x % 2 == 0, pd.Grouper(level=0)], +) +@pytest.mark.parametrize( + "data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]] +) +@pytest.mark.parametrize("args", [{"value": 42}, {"method": "ffill"}]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + 
reason="warning not present in older pandas versions", +) +def test_groupby_various_by_fillna(by, data, args): + ps = pd.Series(data) + gs = cudf.from_pandas(ps) + + with pytest.warns(FutureWarning): + expect = ps.groupby(by).fillna(**args) + if isinstance(by, pd.Grouper): + by = cudf.Grouper(level=by.level) + with pytest.warns(FutureWarning): + got = gs.groupby(by).fillna(**args) + + assert_groupby_results_equal(expect, got, check_dtype=False) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize("method", ["ffill", "bfill"]) +def test_groupby_fillna_method(method): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "list", + "null_frequency": 0.4, + "cardinality": 10, + "lists_max_length": 10, + "nesting_max_depth": 3, + "value_type": "int64", + }, + {"dtype": "category", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + key_col = "0" + value_cols = ["1", "2", "3", "4", "5", "6", "7", "8"] + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + + with pytest.warns(FutureWarning): + expect = pdf.groupby(key_col).fillna(method=method) + with pytest.warns(FutureWarning): + got = gdf.groupby(key_col).fillna(method=method) + + assert_groupby_results_equal( + expect[value_cols], got[value_cols], sort=False + ) diff --git a/python/cudf/cudf/tests/groupby/test_indexing.py b/python/cudf/cudf/tests/groupby/test_get_group.py similarity index 100% rename from python/cudf/cudf/tests/groupby/test_indexing.py rename to python/cudf/cudf/tests/groupby/test_get_group.py diff --git a/python/cudf/cudf/tests/groupby/test_groupby_obj.py b/python/cudf/cudf/tests/groupby/test_groupby_obj.py deleted file mode 100644 index ab2b16d263c..00000000000 --- a/python/cudf/cudf/tests/groupby/test_groupby_obj.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -from numpy.testing import assert_array_equal - -import cudf -from cudf.testing import assert_eq - - -def test_groupby_14955(): - # https://github.com/rapidsai/cudf/issues/14955 - df = cudf.DataFrame({"a": [1, 2] * 2}, index=[0] * 4) - agg = df.groupby("a") - pagg = df.to_pandas().groupby("a") - for key in agg.groups: - assert_array_equal(pagg.indices[key], agg.indices[key].get()) - assert_eq(pagg.get_group(key), agg.get_group(key)) diff --git a/python/cudf/cudf/tests/groupby/test_nth.py b/python/cudf/cudf/tests/groupby/test_nth.py new file mode 100644 index 00000000000..1fb9d32f535 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_nth.py @@ -0,0 +1,25 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_groupby_results_equal + + +@pytest.mark.parametrize("n", [0, 2, 10]) +@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) +def test_groupby_nth(n, by): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 3], + "b": [1, 2, 2, 2, 1], + "c": [1, 2, None, 4, 5], + "d": ["a", "b", "c", "d", "e"], + } + ) + gdf = cudf.from_pandas(pdf) + + expect = pdf.groupby(by).nth(n) + got = gdf.groupby(by).nth(n) + + assert_groupby_results_equal(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/groupby/test_nunique.py b/python/cudf/cudf/tests/groupby/test_nunique.py new file mode 100644 index 00000000000..742c35f874d --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_nunique.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_groupby_results_equal + + +@pytest.mark.parametrize("agg", [lambda x: x.nunique(), "nunique"]) +@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) +def test_groupby_nunique(agg, by): + pdf = pd.DataFrame( + {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]} + ) + gdf = cudf.from_pandas(pdf) + + expect = pdf.groupby(by).nunique() + got = gdf.groupby(by).nunique() + + assert_groupby_results_equal(expect, got, check_dtype=False) + + +def test_nunique_dropna(dropna): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2], + "b": [4, None, 5], + "c": [None, None, 7], + "d": [1, 1, 3], + } + ) + pdf = gdf.to_pandas() + + result = gdf.groupby("a")["b"].nunique(dropna=dropna) + expected = pdf.groupby("a")["b"].nunique(dropna=dropna) + assert_groupby_results_equal(result, expected, check_dtype=False) + + +def test_groupby_nunique_series(): + pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 1, 1, 2]}) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby("a")["b"].nunique(), + gdf.groupby("a")["b"].nunique(), + check_dtype=False, + ) diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py deleted file mode 100644 index 64bba6f4404..00000000000 --- a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. -import numpy as np -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.mark.parametrize("with_nulls", [False, True]) -def test_groupby_maintain_order_random(with_nulls): - nrows = 20 - nkeys = 3 - rng = np.random.default_rng(seed=0) - key_names = [f"key{key}" for key in range(nkeys)] - key_values = [rng.integers(100, size=nrows) for _ in key_names] - value = rng.integers(-100, 100, size=nrows) - df = cudf.DataFrame( - dict(zip(key_names, key_values, strict=True), value=value) - ) - if with_nulls: - for key in key_names: - df.loc[df[key] == 1, key] = None - with cudf.option_context("mode.pandas_compatible", True): - got = df.groupby(key_names, sort=False).agg({"value": "sum"}) - expect = ( - df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"}) - ) - assert_eq(expect, got, check_index_type=not with_nulls) diff --git a/python/cudf/cudf/tests/groupby/test_pipe.py b/python/cudf/cudf/tests/groupby/test_pipe.py new file mode 100644 index 00000000000..f6342380610 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_pipe.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+import pandas as pd + +import cudf +from cudf.testing import assert_groupby_results_equal + + +def test_groupby_pipe(): + pdf = pd.DataFrame({"A": "a b a b".split(), "B": [1, 2, 3, 4]}) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby("A").pipe(lambda x: x.max() - x.min()) + actual = gdf.groupby("A").pipe(lambda x: x.max() - x.min()) + + assert_groupby_results_equal(expected, actual) diff --git a/python/cudf/cudf/tests/groupby/test_rank.py b/python/cudf/cudf/tests/groupby/test_rank.py new file mode 100644 index 00000000000..60aee399ca0 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_rank.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023-2025, NVIDIA CORPORATION. +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq, assert_groupby_results_equal + + +@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) +def test_rank_return_type_compatible_mode(method): + # in compatible mode, rank() always returns floats + pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5]}) + with cudf.option_context("mode.pandas_compatible", True): + df = cudf.from_pandas(pdf) + result = df.groupby("a").rank(method=method) + expect = pdf.groupby("a").rank(method=method) + assert_eq(expect, result) + assert result["b"].dtype == "float64" + + +@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [False, True]) +def test_groupby_2keys_rank(method, ascending, na_option, pct): + nelem = 20 + pdf = pd.DataFrame( + { + "x": np.arange(nelem), + "y": np.arange(nelem), + "z": np.concatenate([np.arange(nelem - 10), np.full(10, np.nan)]), + } + ) + gdf = cudf.from_pandas(pdf) + expect_df = pdf.groupby(["x", "y"], sort=True).rank( + method=method, ascending=ascending, na_option=na_option, pct=pct + ) + got_df = gdf.groupby(["x", "y"], sort=True).rank( + method=method, ascending=ascending, na_option=na_option, pct=pct + ) + + assert_groupby_results_equal(got_df, expect_df, check_dtype=False) + + +def test_groupby_rank_fails(): + gdf = cudf.DataFrame( + {"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]} + ) + with pytest.raises(NotImplementedError): + gdf.groupby(["x", "y"]).rank(method="min", axis=1) + gdf = cudf.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2], + "b": [[1, 2], [3, None, 5], None, [], [7, 8], [9]], + } + ) + with pytest.raises(NotImplementedError): + gdf.groupby(["a"]).rank(method="min", axis=1) diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py b/python/cudf/cudf/tests/groupby/test_reductions.py new file mode 100644 index 00000000000..adbc5af309f --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_reductions.py @@ -0,0 +1,678 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_groupby_results_equal +from cudf.testing._utils import assert_exceptions_equal + + +def test_groupby_mean(): + pdf = pd.DataFrame(np.ones((20, 3)), columns=["x", "y", "val"]) + gdf = cudf.DataFrame(pdf) + got_df = gdf.groupby(["x", "y"]).mean() + expect_df = pdf.groupby(["x", "y"]).mean() + assert_groupby_results_equal(got_df, expect_df) + + +def test_groupby_mean_3level(): + pdf = pd.DataFrame(np.ones((20, 4)), columns=["x", "y", "val", "z"]) + gdf = cudf.DataFrame(pdf) + bys = list("xyz") + got_df = pdf.groupby(bys).mean() + expect_df = gdf.groupby(bys).mean() + assert_groupby_results_equal(got_df, expect_df) + + +def test_group_keys_true(): + pdf = pd.DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) + gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) + gdf = gdf.groupby("y", group_keys=True).sum() + pdf = pdf.groupby("y", group_keys=True).sum() + assert_groupby_results_equal(pdf, gdf) + + +def test_groupby_getitem_getattr(as_index): + pdf = pd.DataFrame({"x": [1, 3, 1], "y": [1, 2, 3], "z": [1, 4, 5]}) + gdf = cudf.from_pandas(pdf) + assert_groupby_results_equal( + pdf.groupby("x", as_index=as_index)["y"].sum(), + gdf.groupby("x", as_index=as_index)["y"].sum(), + as_index=as_index, + by="x", + ) + assert_groupby_results_equal( + pdf.groupby("x", as_index=as_index).y.sum(), + gdf.groupby("x", as_index=as_index).y.sum(), + as_index=as_index, + by="x", + ) + assert_groupby_results_equal( + pdf.groupby("x", as_index=as_index)[["y"]].sum(), + gdf.groupby("x", as_index=as_index)[["y"]].sum(), + as_index=as_index, + by="x", + ) + assert_groupby_results_equal( + pdf.groupby(["x", "y"], as_index=as_index).sum(), + gdf.groupby(["x", "y"], as_index=as_index).sum(), + as_index=as_index, + by=["x", "y"], + ) + + +def test_groupby_cats(): + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + {"cats": pd.Categorical(list("aabaacaab")), "vals": rng.random(9)} + ) + + cats = df["cats"].values_host + vals = df["vals"].to_numpy() + + grouped = df.groupby(["cats"], as_index=False).mean() + + got_vals = grouped["vals"] + + got_cats = grouped["cats"] + + for i in range(len(got_vals)): + expect = vals[cats == got_cats[i]].mean() + np.testing.assert_almost_equal(got_vals[i], expect) + + +def test_series_groupby(groupby_reduction_methods): + s = pd.Series([1, 2, 3]) + g = cudf.Series([1, 2, 3]) + sg = s.groupby(s // 2) + gg = g.groupby(g // 2) + sa = getattr(sg, groupby_reduction_methods)() + ga = getattr(gg, groupby_reduction_methods)() + assert_groupby_results_equal(sa, ga) + + +def test_groupby_level_zero(groupby_reduction_methods, request): + request.applymarker( + pytest.mark.xfail( + groupby_reduction_methods in ["idxmin", "idxmax"], + reason="gather needed for idxmin/idxmax", + ) + ) + pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[2, 5, 5]) + gdf = cudf.DataFrame.from_pandas(pdf) + pdg = pdf.groupby(level=0) + gdg = gdf.groupby(level=0) + pdresult = getattr(pdg, groupby_reduction_methods)() + gdresult = getattr(gdg, groupby_reduction_methods)() + assert_groupby_results_equal( + pdresult, + gdresult, + ) + + +def test_groupby_series_level_zero(groupby_reduction_methods, request): + request.applymarker( + pytest.mark.xfail( + groupby_reduction_methods in ["idxmin", "idxmax"], + reason="gather needed for idxmin/idxmax", + ) + ) + pdf = pd.Series([1, 2, 3], index=[2, 5, 5]) + gdf = cudf.Series.from_pandas(pdf) + pdg = pdf.groupby(level=0) + gdg = gdf.groupby(level=0) + pdresult = getattr(pdg, 
groupby_reduction_methods)() + gdresult = getattr(gdg, groupby_reduction_methods)() + assert_groupby_results_equal(pdresult, gdresult) + + +def test_groupby_column_name(): + pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]}) + gdf = cudf.DataFrame.from_pandas(pdf) + g = gdf.groupby("yy") + p = pdf.groupby("yy") + gxx = g["xx"].sum() + pxx = p["xx"].sum() + assert_groupby_results_equal(pxx, gxx) + + gxx = g["xx"].count() + pxx = p["xx"].count() + assert_groupby_results_equal(pxx, gxx, check_dtype=False) + + gxx = g["xx"].min() + pxx = p["xx"].min() + assert_groupby_results_equal(pxx, gxx) + + gxx = g["xx"].max() + pxx = p["xx"].max() + assert_groupby_results_equal(pxx, gxx) + + gxx = g["xx"].idxmin() + pxx = p["xx"].idxmin() + assert_groupby_results_equal(pxx, gxx, check_dtype=False) + + gxx = g["xx"].idxmax() + pxx = p["xx"].idxmax() + assert_groupby_results_equal(pxx, gxx, check_dtype=False) + + gxx = g["xx"].mean() + pxx = p["xx"].mean() + assert_groupby_results_equal(pxx, gxx) + + +def test_groupby_column_numeral(): + pdf = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [1, 2, 3]}) + gdf = cudf.DataFrame.from_pandas(pdf) + p = pdf.groupby(1) + g = gdf.groupby(1) + pxx = p[0].sum() + gxx = g[0].sum() + assert_groupby_results_equal(pxx, gxx) + + pdf = pd.DataFrame({0.5: [1.0, 2.0, 3.0], 1.5: [1, 2, 3]}) + gdf = cudf.DataFrame.from_pandas(pdf) + p = pdf.groupby(1.5) + g = gdf.groupby(1.5) + pxx = p[0.5].sum() + gxx = g[0.5].sum() + assert_groupby_results_equal(pxx, gxx) + + +@pytest.mark.parametrize( + "series", + [ + [0, 1, 0], + [1, 1, 1], + [0, 1, 1], + [1, 2, 3], + [4, 3, 2], + [0, 2, 0], + pd.Series([0, 2, 0]), + pd.Series([0, 2, 0], index=[0, 2, 1]), + ], +) +def test_groupby_external_series(series): + pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) + gdf = cudf.DataFrame.from_pandas(pdf) + pxx = pdf.groupby(pd.Series(series)).x.sum() + gxx = gdf.groupby(cudf.Series(series)).x.sum() + assert_groupby_results_equal(pxx, gxx) + + +@pytest.mark.parametrize("series", [[0.0, 1.0], [1.0, 1.0, 1.0, 1.0]]) +def test_groupby_external_series_incorrect_length(series): + pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) + gdf = cudf.DataFrame.from_pandas(pdf) + pxx = pdf.groupby(pd.Series(series)).x.sum() + gxx = gdf.groupby(cudf.Series(series)).x.sum() + assert_groupby_results_equal(pxx, gxx) + + +@pytest.mark.parametrize( + "level", [0, 1, "a", "b", [0, 1], ["a", "b"], ["a", 1], -1, [-1, -2]] +) +def test_groupby_levels(level): + idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 2)], names=("a", "b")) + pdf = pd.DataFrame({"c": [1, 2, 3], "d": [2, 3, 4]}, index=idx) + gdf = cudf.from_pandas(pdf) + assert_groupby_results_equal( + pdf.groupby(level=level).sum(), + gdf.groupby(level=level).sum(), + ) + + +def test_advanced_groupby_levels(): + pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1], "z": [1, 1, 1]}) + gdf = cudf.from_pandas(pdf) + pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdh = pdg.groupby(level=1).sum() + gdh = gdg.groupby(level=1).sum() + assert_groupby_results_equal(pdh, gdh) + pdg = pdf.groupby(["x", "y", "z"]).sum() + gdg = gdf.groupby(["x", "y", "z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["z"]).sum() + gdg = gdf.groupby(["z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["y", "z"]).sum() + gdg = gdf.groupby(["y", "z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["x", "z"]).sum() + gdg = gdf.groupby(["x", 
"z"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["y"]).sum() + gdg = gdf.groupby(["y"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby(["x"]).sum() + gdg = gdf.groupby(["x"]).sum() + assert_groupby_results_equal(pdg, gdg) + pdh = pdg.groupby(level=0).sum() + gdh = gdg.groupby(level=0).sum() + assert_groupby_results_equal(pdh, gdh) + pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"]).sum() + pdh = pdg.groupby(level=[0, 1]).sum() + gdh = gdg.groupby(level=[0, 1]).sum() + assert_groupby_results_equal(pdh, gdh) + pdh = pdg.groupby(level=[1, 0]).sum() + gdh = gdg.groupby(level=[1, 0]).sum() + assert_groupby_results_equal(pdh, gdh) + pdg = pdf.groupby(["x", "y"]).sum() + gdg = gdf.groupby(["x", "y"]).sum() + + assert_exceptions_equal( + lfunc=pdg.groupby, + rfunc=gdg.groupby, + lfunc_args_and_kwargs=([], {"level": 2}), + rfunc_args_and_kwargs=([], {"level": 2}), + ) + + +@pytest.mark.parametrize( + "func", + [ + lambda df: df.groupby(["x", "y", "z"]).sum(), + lambda df: df.groupby(["x", "y"]).sum(), + lambda df: df.groupby(["x", "y"]).agg("sum"), + lambda df: df.groupby(["y"]).sum(), + lambda df: df.groupby(["y"]).agg("sum"), + lambda df: df.groupby(["x"]).sum(), + lambda df: df.groupby(["x"]).agg("sum"), + lambda df: df.groupby(["x", "y"]).z.sum(), + lambda df: df.groupby(["x", "y"]).z.agg("sum"), + ], +) +def test_empty_groupby(func): + pdf = pd.DataFrame({"x": [], "y": [], "z": []}) + gdf = cudf.from_pandas(pdf) + assert_groupby_results_equal(func(pdf), func(gdf), check_index_type=False) + + +def test_groupby_unsupported_columns(): + rng = np.random.default_rng(seed=12) + pd_cat = pd.Categorical( + pd.Series(rng.choice(["a", "b", 1], 3), dtype="category") + ) + pdf = pd.DataFrame( + { + "x": [1, 2, 3], + "y": ["a", "b", "c"], + "z": ["d", "e", "f"], + "a": [3, 4, 5], + } + ) + pdf["b"] = pd_cat + gdf = cudf.from_pandas(pdf) + pdg = pdf.groupby("x").sum(numeric_only=True) + # cudf does not yet support numeric_only, so our default is False (unlike + # pandas, which defaults to inferring and throws a warning about it). 
+ gdg = gdf.groupby("x").sum(numeric_only=True) + assert_groupby_results_equal(pdg, gdg) + + +def test_list_of_series(): + pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1]}) + gdf = cudf.from_pandas(pdf) + pdg = pdf.groupby([pdf.x]).y.sum() + gdg = gdf.groupby([gdf.x]).y.sum() + assert_groupby_results_equal(pdg, gdg) + pdg = pdf.groupby([pdf.x, pdf.y]).y.sum() + gdg = gdf.groupby([gdf.x, gdf.y]).y.sum() + assert_groupby_results_equal(pdg, gdg) + + +def test_groupby_apply_basic_agg_single_column(): + gdf = cudf.DataFrame( + { + "key": [0, 0, 1, 1, 2, 2, 0], + "val": [0, 1, 2, 3, 4, 5, 6], + "mult": [0, 1, 2, 3, 4, 5, 6], + } + ) + pdf = gdf.to_pandas() + + gdg = gdf.groupby(["key", "val"]).mult.sum() + pdg = pdf.groupby(["key", "val"]).mult.sum() + assert_groupby_results_equal(pdg, gdg) + + +def test_groupby_nulls_in_index(): + pdf = pd.DataFrame({"a": [None, 2, 1, 1], "b": [1, 2, 3, 4]}) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) + + +def test_groupby_all_nulls_index(): + gdf = cudf.DataFrame( + { + "a": cudf.Series([None, None, None, None], dtype="object"), + "b": [1, 2, 3, 4], + } + ) + pdf = gdf.to_pandas() + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) + + gdf = cudf.DataFrame( + {"a": cudf.Series([np.nan, np.nan, np.nan, np.nan]), "b": [1, 2, 3, 4]} + ) + pdf = gdf.to_pandas() + assert_groupby_results_equal( + pdf.groupby("a").sum(), gdf.groupby("a").sum() + ) + + +@pytest.mark.parametrize("sort", [True, False]) +def test_groupby_sort(sort): + pdf = pd.DataFrame({"a": [2, 2, 1, 1], "b": [1, 2, 3, 4]}) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby("a", sort=sort).sum(), + gdf.groupby("a", sort=sort).sum(), + check_like=not sort, + ) + + pdf = pd.DataFrame( + {"c": [-1, 2, 1, 4], "b": [1, 2, 3, 4], "a": [2, 2, 1, 1]} + ) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby(["c", "b"], sort=sort).sum(), + gdf.groupby(["c", "b"], sort=sort).sum(), + check_like=not sort, + ) + + ps = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=[2, 2, 2, 3, 3, 1, 1, 1]) + gs = cudf.from_pandas(ps) + + assert_groupby_results_equal( + ps.groupby(level=0, sort=sort).sum().to_frame(), + gs.groupby(level=0, sort=sort).sum().to_frame(), + check_like=not sort, + ) + + ps = pd.Series( + [1, 2, 3, 4, 5, 6, 7, 8], + index=pd.MultiIndex.from_product([(1, 2), ("a", "b"), (42, 84)]), + ) + gs = cudf.from_pandas(ps) + + assert_groupby_results_equal( + ps.groupby(level=0, sort=sort).sum().to_frame(), + gs.groupby(level=0, sort=sort).sum().to_frame(), + check_like=not sort, + ) + + +def test_groupby_cat(): + pdf = pd.DataFrame( + {"a": [1, 1, 2], "b": pd.Series(["b", "b", "a"], dtype="category")} + ) + gdf = cudf.from_pandas(pdf) + assert_groupby_results_equal( + pdf.groupby("a").count(), + gdf.groupby("a").count(), + check_dtype=False, + ) + + +def test_groupby_index_type(): + df = cudf.DataFrame() + df["string_col"] = ["a", "b", "c"] + df["counts"] = [1, 2, 3] + res = df.groupby(by="string_col").counts.sum() + assert res.index.dtype == cudf.dtype("object") + + +@pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] +) +@pytest.mark.parametrize("q", [0.25, 0.4, 0.5, 0.7, 1]) +def test_groupby_quantile(request, interpolation, q): + request.applymarker( + pytest.mark.xfail( + condition=(q == 0.5 and interpolation == "nearest"), + reason=( + "Pandas NaN Rounding will fail nearest interpolation at 0.5" + ), + ) + ) + 
+
+    raw_data = {
+        "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9],
+        "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2],
+    }
+    # Pandas > 0.25 casts NaN in quantile operations to float64, so we
+    # fill with zeros here.
+    pdf = pd.DataFrame(raw_data).fillna(0)
+    gdf = cudf.DataFrame.from_pandas(pdf)
+
+    pdg = pdf.groupby("x")
+    gdg = gdf.groupby("x")
+
+    pdresult = pdg.quantile(q, interpolation=interpolation)
+    gdresult = gdg.quantile(q, interpolation=interpolation)
+
+    assert_groupby_results_equal(pdresult, gdresult)
+
+
+def test_groupby_std():
+    raw_data = {
+        "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2],
+        "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9],
+    }
+    pdf = pd.DataFrame(raw_data)
+    gdf = cudf.DataFrame.from_pandas(pdf)
+    pdg = pdf.groupby("x")
+    gdg = gdf.groupby("x")
+    pdresult = pdg.std()
+    gdresult = gdg.std()
+
+    assert_groupby_results_equal(pdresult, gdresult)
+
+
+def test_groupby_size():
+    pdf = pd.DataFrame(
+        {
+            "a": [1, 1, 3, 4],
+            "b": ["bob", "bob", "alice", "cooper"],
+            "c": [1, 2, 3, 4],
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+
+    assert_groupby_results_equal(
+        pdf.groupby("a").size(),
+        gdf.groupby("a").size(),
+        check_dtype=False,
+    )
+
+    assert_groupby_results_equal(
+        pdf.groupby(["a", "b", "c"]).size(),
+        gdf.groupby(["a", "b", "c"]).size(),
+        check_dtype=False,
+    )
+
+    sr = pd.Series(range(len(pdf)))
+    assert_groupby_results_equal(
+        pdf.groupby(sr).size(),
+        gdf.groupby(sr).size(),
+        check_dtype=False,
+    )
+
+
+def test_groupby_datetime(as_index, groupby_reduction_methods):
+    pdf = pd.DataFrame(
+        {
+            "x": [1, 2, 3],
+            "y": [4, 5, 6],
+            "val": [7, 8, 9],
+            "datetime": pd.date_range("2020-01-01", periods=3),
+        }
+    )
+    gdf = cudf.DataFrame(pdf)
+    pdg = pdf.groupby("datetime", as_index=as_index)
+    gdg = gdf.groupby("datetime", as_index=as_index)
+    pdres = getattr(pdg, groupby_reduction_methods)()
+    gdres = getattr(gdg, groupby_reduction_methods)()
+    assert_groupby_results_equal(
+        pdres,
+        gdres,
+        as_index=as_index,
+        by=["datetime"],
+    )
+
+
+def test_groupby_dropna():
+    df = cudf.DataFrame({"a": [1, 1, None], "b": [1, 2, 3]})
+    expect = cudf.DataFrame(
+        {"b": [3, 3]}, index=cudf.Series([1, None], name="a")
+    )
+    got = df.groupby("a", dropna=False).sum()
+    assert_groupby_results_equal(expect, got)
+
+    df = cudf.DataFrame(
+        {"a": [1, 1, 1, None], "b": [1, None, 1, None], "c": [1, 2, 3, 4]}
+    )
+    idx = cudf.MultiIndex.from_frame(
+        df[["a", "b"]].drop_duplicates().sort_values(["a", "b"]),
+        names=["a", "b"],
+    )
+    expect = cudf.DataFrame({"c": [4, 2, 4]}, index=idx)
+    got = df.groupby(["a", "b"], dropna=False).sum()
+
+    assert_groupby_results_equal(expect, got)
+
+
+def test_groupby_dropna_getattr():
+    df = cudf.DataFrame()
+    df["id"] = [0, 1, 1, None, None, 3, 3]
+    df["val"] = [0, 1, 1, 2, 2, 3, 3]
+    got = df.groupby("id", dropna=False).val.sum()
+
+    expect = cudf.Series(
+        [0, 2, 6, 4], name="val", index=cudf.Series([0, 1, 3, None], name="id")
+    )
+
+    assert_groupby_results_equal(expect, got)
+
+
+def test_groupby_categorical_from_string():
+    gdf = cudf.DataFrame()
+    gdf["id"] = ["a", "b", "c"]
+    gdf["val"] = [0, 1, 2]
+    gdf["id"] = gdf["id"].astype("category")
+    assert_groupby_results_equal(
+        cudf.DataFrame({"val": gdf["val"]}).set_index(keys=gdf["id"]),
+        gdf.groupby("id").sum(),
+    )
+
+
+def test_groupby_arbitrary_length_series():
+    gdf = cudf.DataFrame({"a": [1, 1, 2], "b": [2, 3, 4]}, index=[4, 5, 6])
+    gsr = cudf.Series([1.0, 2.0, 2.0], index=[3, 4, 5])
+
+    pdf = gdf.to_pandas()
+    psr = gsr.to_pandas()
+
+    expect = pdf.groupby(psr).sum()
+    got = 
gdf.groupby(gsr).sum() + + assert_groupby_results_equal(expect, got) + + +def test_groupby_series_same_name_as_dataframe_column(): + gdf = cudf.DataFrame({"a": [1, 1, 2], "b": [2, 3, 4]}, index=[4, 5, 6]) + gsr = cudf.Series([1.0, 2.0, 2.0], name="a", index=[3, 4, 5]) + + pdf = gdf.to_pandas() + psr = gsr.to_pandas() + + expect = pdf.groupby(psr).sum() + got = gdf.groupby(gsr).sum() + + assert_groupby_results_equal(expect, got) + + +def test_group_by_series_and_column_name_in_by(): + gdf = cudf.DataFrame( + {"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}, index=[1, 2, 3] + ) + gsr0 = cudf.Series([0.0, 1.0, 2.0], name="a", index=[1, 2, 3]) + gsr1 = cudf.Series([0.0, 1.0, 3.0], name="b", index=[3, 4, 5]) + + pdf = gdf.to_pandas() + psr0 = gsr0.to_pandas() + psr1 = gsr1.to_pandas() + + expect = pdf.groupby(["x", psr0, psr1]).sum() + got = gdf.groupby(["x", gsr0, gsr1]).sum() + + assert_groupby_results_equal(expect, got) + + +def test_raise_data_error(): + pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + gdf = cudf.from_pandas(pdf) + + assert_exceptions_equal( + pdf.groupby("a").mean, + gdf.groupby("a").mean, + ) + + +def test_reset_index_after_empty_groupby(): + # GH #5475 + pdf = pd.DataFrame({"a": [1, 2, 3]}) + gdf = cudf.from_pandas(pdf) + + assert_groupby_results_equal( + pdf.groupby("a").sum().reset_index(), + gdf.groupby("a").sum().reset_index(), + as_index=False, + by="a", + ) + + +def test_groupby_attribute_error(): + err_msg = "Test error message" + + class TestGroupBy(cudf.core.groupby.GroupBy): + @property + def _groupby(self): + raise AttributeError(err_msg) + + a = cudf.DataFrame({"a": [1, 2], "b": [2, 3]}) + gb = TestGroupBy(a, a["a"]) + + with pytest.raises(AttributeError, match=err_msg): + gb.sum() + + +@pytest.mark.parametrize( + "pdf", + [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], +) +def test_groupby_no_keys(pdf): + gdf = cudf.from_pandas(pdf) + if isinstance(pdf, pd.DataFrame): + kwargs = {"check_column_type": False} + else: + kwargs = {} + assert_groupby_results_equal( + pdf.groupby([]).max(), + gdf.groupby([]).max(), + check_dtype=False, + check_index_type=False, # Int64 v/s Float64 + **kwargs, + ) diff --git a/python/cudf/cudf/tests/groupby/test_shift.py b/python/cudf/cudf/tests/groupby/test_shift.py new file mode 100644 index 00000000000..edb48826138 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_shift.py @@ -0,0 +1,204 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
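+#
+# GroupBy.shift tests: rows move by ``periods`` within each group and the
+# vacated slots are filled with nulls or an explicit ``fill_value``.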
+ + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_groupby_results_equal +from cudf.testing.dataset_generator import rand_dataframe + + +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +@pytest.mark.parametrize("fill_value", [None, np.nan, 42]) +def test_groupby_shift_row(shift_perc, direction, fill_value): + nelem = 20 + pdf = pd.DataFrame(np.ones((nelem, 3)), columns=["x", "y", "val"]) + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["x", "y"]).shift( + periods=n_shift, fill_value=fill_value + ) + got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value) + + assert_groupby_results_equal(expected, got) + + +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +@pytest.mark.parametrize( + "fill_value", + [ + None, + pytest.param( + 0, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/10608" + ), + ), + pytest.param( + 42, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/10608" + ), + ), + ], +) +def test_groupby_shift_row_mixed_numerics(shift_perc, direction, fill_value): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) + got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) + + assert_groupby_results_equal(expected, got) + + +# TODO: Shifting list columns is currently unsupported because we cannot +# construct a null list scalar in python. Support once it is added. 
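+# For context, a minimal sketch of the gap (illustrative frame, assuming a
+# working cudf install):
+#
+#     gdf = cudf.DataFrame({"g": [1, 1, 2, 2], "v": [10, 20, 30, 40]})
+#     gdf.groupby("g").shift(1)  # "v" gets a null at the head of each group
+#
+# Shift vacates one slot at an end of every group, and that slot must be
+# filled with a null (or ``fill_value``) of the column's own dtype; for a
+# list column that null scalar cannot currently be built from Python.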
+@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +def test_groupby_shift_row_mixed(shift_perc, direction): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["0"]).shift(periods=n_shift) + got = gdf.groupby(["0"]).shift(periods=n_shift) + + assert_groupby_results_equal(expected, got) + + +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +@pytest.mark.parametrize( + "fill_value", + [ + [ + 42, + "fill", + np.datetime64(123, "ns"), + np.timedelta64(456, "ns"), + ] + ], +) +def test_groupby_shift_row_mixed_fill(shift_perc, direction, fill_value): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + # Pandas does not support specifying different fill_value by column, so we + # simulate it column by column + expected = pdf.copy() + for col, single_fill in zip(pdf.iloc[:, 1:], fill_value, strict=True): + expected[col] = ( + pdf[col] + .groupby(pdf["0"]) + .shift(periods=n_shift, fill_value=single_fill) + ) + + got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) + + assert_groupby_results_equal( + expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] + ) + + +@pytest.mark.parametrize("fill_value", [None, 0, 42]) +def test_groupby_shift_row_zero_shift(fill_value): + nelem = 20 + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + seed=0, + ) + gdf = cudf.from_pandas(t.to_pandas()) + + expected = gdf + got = gdf.groupby(["0"]).shift(periods=0, fill_value=fill_value) + + assert_groupby_results_equal( + expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] + ) diff --git a/python/cudf/cudf/tests/groupby/test_unique.py b/python/cudf/cudf/tests/groupby/test_unique.py new file mode 100644 index 00000000000..e515531e478 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_unique.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
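+#
+# GroupBy.unique tests: each group's distinct values are collected into a
+# list-like result and compared with pandas across supported dtypes.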
+import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_groupby_results_equal + + +@pytest.mark.parametrize( + "by,data", + [ + ([], []), + ([1, 1, 2, 2], [0, 0, 1, 1]), + ([1, 2, 3, 4], [0, 0, 0, 0]), + ([1, 2, 1, 2], [0, 1, 1, 1]), + ], +) +def test_groupby_unique(by, data, all_supported_types_as_str, request): + pdf = pd.DataFrame({"by": by, "data": data}) + pdf["data"] = pdf["data"].astype(all_supported_types_as_str) + gdf = cudf.from_pandas(pdf) + + expect = pdf.groupby("by")["data"].unique() + got = gdf.groupby("by")["data"].unique() + request.applymarker( + pytest.mark.xfail( + len(by) == 0 and all_supported_types_as_str == "category", + reason="pandas returns Categorical, cuDF returns np.ndarray", + ) + ) + assert_groupby_results_equal(expect, got, check_dtype=len(by) > 0) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 552ac748e3e..5cab96d3db9 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,19 +1,15 @@ # Copyright (c) 2018-2025, NVIDIA CORPORATION. import collections -import datetime import itertools import operator import string import textwrap -from decimal import Decimal from functools import partial import numpy as np import pandas as pd import pytest -from numba import cuda -from numpy.testing import assert_array_equal import cudf from cudf import DataFrame, Series @@ -28,13 +24,8 @@ from cudf.core.udf.utils import UDFError, precompiled from cudf.testing import assert_eq from cudf.testing._utils import ( - DATETIME_TYPES, - SIGNED_TYPES, - TIMEDELTA_TYPES, - assert_exceptions_equal, expect_warning_if, ) -from cudf.testing.dataset_generator import rand_dataframe _now = np.datetime64("now") _tomorrow = _now + np.timedelta64(1, "D") @@ -106,310 +97,6 @@ def pdf(gdf): return gdf.to_pandas() -def test_groupby_mean(): - nelem = 20 - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).mean() - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() - ) - assert_groupby_results_equal(got_df, expect_df) - - -def test_groupby_mean_3level(): - nelem = 20 - lvls = "z" - bys = list("xyz") - got_df = ( - make_frame(DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) - .mean() - ) - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) - .mean() - ) - assert_groupby_results_equal(got_df, expect_df) - - -def test_groupby_agg_mean_min(): - nelem = 20 - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"]) - .agg(["mean", "min"]) - ) - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"]) - .agg(["mean", "min"]) - ) - assert_groupby_results_equal(got_df, expect_df) - - -def test_groupby_agg_min_max_dictargs(): - nelem = 20 - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) - .agg({"a": "min", "b": "max"}) - ) - got_df = ( - make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) - .agg({"a": "min", "b": "max"}) - ) - assert_groupby_results_equal(expect_df, got_df) - - -def test_groupby_agg_min_max_dictlist(): - nelem = 20 - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) - .agg({"a": ["min", "max"], "b": ["min", "max"]}) - ) - got_df = ( - make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) - .agg({"a": ["min", "max"], "b": ["min", "max"]}) - ) - assert_groupby_results_equal(got_df, expect_df) - - 
-@pytest.mark.parametrize("as_index", [True, False]) -def test_groupby_as_index_single_agg(pdf, gdf, as_index): - gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"}) - pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"}) - assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") - - -@pytest.mark.parametrize("engine", ["cudf", "jit"]) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_as_index_apply(pdf, gdf, as_index, engine): - gdf = gdf.groupby("y", as_index=as_index).apply( - lambda df: df["x"].mean(), engine=engine - ) - kwargs = {"func": lambda df: df["x"].mean(), "include_groups": False} - pdf = pdf.groupby("y", as_index=as_index).apply(**kwargs) - assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") - - -@pytest.mark.parametrize("as_index", [True, False]) -def test_groupby_as_index_multiindex(pdf, gdf, as_index): - pdf = pd.DataFrame( - {"a": [1, 2, 1], "b": [3, 3, 3], "c": [2, 2, 3], "d": [3, 1, 2]} - ) - gdf = cudf.from_pandas(pdf) - - gdf = gdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( - {"c": "mean"} - ) - pdf = pdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( - {"c": "mean"} - ) - - if as_index: - assert_eq(pdf, gdf) - else: - # column names don't match - check just the values - for gcol, pcol in zip(gdf, pdf, strict=True): - assert_array_equal(gdf[gcol].to_numpy(), pdf[pcol].values) - - -def test_groupby_default(pdf, gdf): - gdf = gdf.groupby("y").agg({"x": "mean"}) - pdf = pdf.groupby("y").agg({"x": "mean"}) - assert_groupby_results_equal(pdf, gdf) - - -def test_group_keys_true(pdf, gdf): - gdf = gdf.groupby("y", group_keys=True).sum() - pdf = pdf.groupby("y", group_keys=True).sum() - assert_groupby_results_equal(pdf, gdf) - - -@pytest.mark.parametrize("as_index", [True, False]) -def test_groupby_getitem_getattr(as_index): - pdf = pd.DataFrame({"x": [1, 3, 1], "y": [1, 2, 3], "z": [1, 4, 5]}) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - pdf.groupby("x", as_index=as_index)["y"].sum(), - gdf.groupby("x", as_index=as_index)["y"].sum(), - as_index=as_index, - by="x", - ) - assert_groupby_results_equal( - pdf.groupby("x", as_index=as_index).y.sum(), - gdf.groupby("x", as_index=as_index).y.sum(), - as_index=as_index, - by="x", - ) - assert_groupby_results_equal( - pdf.groupby("x", as_index=as_index)[["y"]].sum(), - gdf.groupby("x", as_index=as_index)[["y"]].sum(), - as_index=as_index, - by="x", - ) - assert_groupby_results_equal( - pdf.groupby(["x", "y"], as_index=as_index).sum(), - gdf.groupby(["x", "y"], as_index=as_index).sum(), - as_index=as_index, - by=["x", "y"], - ) - - -def test_groupby_cats(): - rng = np.random.default_rng(seed=0) - df = DataFrame( - {"cats": pd.Categorical(list("aabaacaab")), "vals": rng.random(9)} - ) - - cats = df["cats"].values_host - vals = df["vals"].to_numpy() - - grouped = df.groupby(["cats"], as_index=False).mean() - - got_vals = grouped["vals"] - - got_cats = grouped["cats"] - - for i in range(len(got_vals)): - expect = vals[cats == got_cats[i]].mean() - np.testing.assert_almost_equal(got_vals[i], expect) - - -def test_groupby_iterate_groups(): - rng = np.random.default_rng(seed=0) - nelem = 20 - df = DataFrame( - { - "key1": rng.integers(0, 3, nelem), - "key2": rng.integers(0, 2, nelem), - "val1": rng.random(nelem), - "val2": rng.random(nelem), - } - ) - - def assert_values_equal(arr): - 
np.testing.assert_array_equal(arr[0], arr) - - for name, grp in df.groupby(["key1", "key2"]): - pddf = grp.to_pandas() - for k in "key1,key2".split(","): - assert_values_equal(pddf[k].values) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply(): - rng = np.random.default_rng(seed=0) - nelem = 20 - df = DataFrame( - { - "key1": rng.integers(0, 3, nelem), - "key2": rng.integers(0, 2, nelem), - "val1": rng.random(nelem), - "val2": rng.random(nelem), - } - ) - - expect_grpby = df.to_pandas().groupby( - ["key1", "key2"], as_index=False, group_keys=False - ) - got_grpby = df.groupby(["key1", "key2"]) - - def foo(df): - df["out"] = df["val1"] + df["val2"] - return df - - expect = expect_grpby.apply(foo, include_groups=False) - got = got_grpby.apply(foo, include_groups=False) - assert_groupby_results_equal(expect, got) - - -def f1(df, k): - df["out"] = df["val1"] + df["val2"] + k - return df - - -def f2(df, k, L): - df["out"] = df["val1"] - df["val2"] + (k / L) - return df - - -def f3(df, k, L, m): - df["out"] = ((k * df["val1"]) + (L * df["val2"])) / m - return df - - -@pytest.mark.parametrize( - "func,args", [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_args(func, args): - rng = np.random.default_rng(seed=0) - nelem = 20 - df = DataFrame( - { - "key1": rng.integers(0, 3, nelem), - "key2": rng.integers(0, 2, nelem), - "val1": rng.random(nelem), - "val2": rng.random(nelem), - } - ) - - expect_grpby = df.to_pandas().groupby( - ["key1", "key2"], as_index=False, group_keys=False - ) - got_grpby = df.groupby(["key1", "key2"]) - expect = expect_grpby.apply(func, *args, include_groups=False) - got = got_grpby.apply(func, *args, include_groups=False) - assert_groupby_results_equal(expect, got) - - -def test_groupby_apply_grouped(): - df = DataFrame() - nelem = 20 - df["key1"] = range(nelem) - df["key2"] = range(nelem) - df["val1"] = range(nelem) - df["val2"] = range(nelem) - - got_grpby = df.groupby(["key1", "key2"]) - - def foo(key1, val1, com1, com2): - for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x): - com1[i] = key1[i] * 10000 + val1[i] - com2[i] = i - - got = got_grpby.apply_grouped( - foo, - incols=["key1", "val1"], - outcols={"com1": np.float64, "com2": np.int32}, - tpb=8, - ) - - got = got.to_pandas() - - expect = df.copy() - expect["com1"] = (expect["key1"] * 10000 + expect["key1"]).astype( - np.float64 - ) - expect["com2"] = np.zeros(nelem, dtype=np.int32) - - assert_groupby_results_equal(expect, got) - - @pytest.fixture(scope="module") def groupby_jit_data_small(): """ @@ -1000,1440 +687,150 @@ def pdf_func(df): assert_groupby_results_equal(expect, got) -@pytest.mark.parametrize( - "func", - [ - "mean", - "std", - "var", - "min", - "max", - "idxmin", - "idxmax", - "count", - "sum", - "prod", - ], +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", ) -def test_groupby_2keys_agg(func): - # gdf (Note: lack of multiIndex) - nelem = 20 - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) +def test_groupby_apply_noempty_group(): + pdf = pd.DataFrame( + {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} ) - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) + gdf = cudf.from_pandas(pdf) 
- check_dtype = func not in _index_type_aggs - assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) + expect = ( + pdf.groupby("a", group_keys=False) + .apply(lambda x: x.iloc[[0, 1]], include_groups=False) + .reset_index(drop=True) + ) + got = ( + gdf.groupby("a") + .apply(lambda x: x.iloc[[0, 1]], include_groups=False) + .reset_index(drop=True) + ) + assert_groupby_results_equal(expect, got) -@pytest.mark.parametrize("num_groups", [2, 20]) -@pytest.mark.parametrize("nelem_per_group", [1, 10]) -@pytest.mark.parametrize( - "func", - ["min", "max", "count", "sum"], - # TODO: Replace the above line with the one below once - # https://github.com/pandas-dev/pandas/issues/40685 is resolved. - # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], -) -def test_groupby_agg_decimal(num_groups, nelem_per_group, func): - rng = np.random.default_rng(seed=0) - # The number of digits after the decimal to use. - decimal_digits = 2 - # The number of digits before the decimal to use. - whole_digits = 2 - - scale = 10**whole_digits - nelem = num_groups * nelem_per_group - - # The unique is necessary because otherwise if there are duplicates idxmin - # and idxmax may return different results than pandas (see - # https://github.com/rapidsai/cudf/issues/7756). This is not relevant to - # the current version of the test, because idxmin and idxmax simply don't - # work with pandas Series composed of Decimal objects (see - # https://github.com/pandas-dev/pandas/issues/40685). However, if that is - # ever enabled, then this issue will crop up again so we may as well have - # it fixed now. - x = np.unique((rng.random(nelem) * scale).round(decimal_digits)) - y = np.unique((rng.random(nelem) * scale).round(decimal_digits)) - - if x.size < y.size: - total_elements = x.size - y = y[: x.size] - else: - total_elements = y.size - x = x[: y.size] +def create_test_groupby_apply_return_scalars_params(): + def f0(x): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = ticker / 10 + return full - # Note that this filtering can lead to one group with fewer elements, but - # that shouldn't be a problem and is probably useful to test. 
- idx_col = np.tile(np.arange(num_groups), nelem_per_group)[:total_elements] + def f1(x, k): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = ticker / k + return full - decimal_x = pd.Series([Decimal(str(d)) for d in x]) - decimal_y = pd.Series([Decimal(str(d)) for d in y]) + def f2(x, k, L): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = L * (ticker / k) + return full - pdf = pd.DataFrame({"idx": idx_col, "x": decimal_x, "y": decimal_y}) - gdf = DataFrame( - { - "idx": idx_col, - "x": cudf.Series(decimal_x), - "y": cudf.Series(decimal_y), - } - ) + def f3(x, k, L, m): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = L * (ticker / k) % m + return full - expect_df = pdf.groupby("idx", sort=True).agg(func) - got_df = gdf.groupby("idx", sort=True).agg(func) - assert_eq(expect_df["x"], got_df["x"], check_dtype=False) - assert_eq(expect_df["y"], got_df["y"], check_dtype=False) + return [(f0, ()), (f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] @pytest.mark.parametrize( - "agg", ["min", "max", "idxmin", "idxmax", "count", "sum", "prod", "mean"] + "func,args", create_test_groupby_apply_return_scalars_params() ) -def test_series_groupby(agg): - s = pd.Series([1, 2, 3]) - g = Series([1, 2, 3]) - sg = s.groupby(s // 2) - gg = g.groupby(g // 2) - sa = getattr(sg, agg)() - ga = getattr(gg, agg)() - check_dtype = agg not in _index_type_aggs - assert_groupby_results_equal(sa, ga, check_dtype=check_dtype) - - -@pytest.mark.parametrize( - "agg", ["min", "max", "idxmin", "idxmax", "count", "sum", "prod", "mean"] +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", ) -def test_series_groupby_agg(agg): - s = pd.Series([1, 2, 3]) - g = Series([1, 2, 3]) - sg = s.groupby(s // 2).agg(agg) - gg = g.groupby(g // 2).agg(agg) - check_dtype = agg not in _index_type_aggs - assert_groupby_results_equal(sg, gg, check_dtype=check_dtype) +def test_groupby_apply_return_scalars(func, args): + pdf = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], + "B": [ + 0.01, + np.nan, + 0.03, + 0.04, + np.nan, + 0.06, + 0.07, + 0.08, + 0.09, + 1.0, + ], + } + ) + gdf = cudf.from_pandas(pdf) + expected = pdf.groupby("A").apply(func, *args, include_groups=False) + actual = gdf.groupby("A").apply(func, *args, include_groups=False) -@pytest.mark.parametrize( - "agg", - [ - "min", - "max", - "count", - "sum", - "prod", - "mean", - pytest.param( - "idxmin", - marks=pytest.mark.xfail(reason="gather needed for idxmin"), - ), - pytest.param( - "idxmax", - marks=pytest.mark.xfail(reason="gather needed for idxmax"), - ), - ], -) -def test_groupby_level_zero(agg): - pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[2, 5, 5]) - gdf = DataFrame.from_pandas(pdf) - pdg = pdf.groupby(level=0) - gdg = gdf.groupby(level=0) - pdresult = getattr(pdg, agg)() - gdresult = getattr(gdg, agg)() - check_dtype = agg not in _index_type_aggs - assert_groupby_results_equal(pdresult, gdresult, check_dtype=check_dtype) + assert_groupby_results_equal(expected, actual) -@pytest.mark.parametrize( - "agg", - [ - "min", - "max", - "count", - "sum", - "prod", - "mean", - pytest.param( - "idxmin", - marks=pytest.mark.xfail(reason="gather needed for idxmin"), - ), - pytest.param( - "idxmax", - marks=pytest.mark.xfail(reason="gather needed for idxmax"), - ), - ], -) -def test_groupby_series_level_zero(agg): - pdf = pd.Series([1, 2, 3], index=[2, 5, 5]) - gdf = Series.from_pandas(pdf) - pdg = pdf.groupby(level=0) - gdg = gdf.groupby(level=0) - pdresult = getattr(pdg, agg)() - 
gdresult = getattr(gdg, agg)() - check_dtype = agg not in _index_type_aggs - assert_groupby_results_equal(pdresult, gdresult, check_dtype=check_dtype) - - -def test_groupby_column_name(): - pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]}) - gdf = DataFrame.from_pandas(pdf) - g = gdf.groupby("yy") - p = pdf.groupby("yy") - gxx = g["xx"].sum() - pxx = p["xx"].sum() - assert_groupby_results_equal(pxx, gxx) - - gxx = g["xx"].count() - pxx = p["xx"].count() - assert_groupby_results_equal(pxx, gxx, check_dtype=False) - - gxx = g["xx"].min() - pxx = p["xx"].min() - assert_groupby_results_equal(pxx, gxx) - - gxx = g["xx"].max() - pxx = p["xx"].max() - assert_groupby_results_equal(pxx, gxx) - - gxx = g["xx"].idxmin() - pxx = p["xx"].idxmin() - assert_groupby_results_equal(pxx, gxx, check_dtype=False) - - gxx = g["xx"].idxmax() - pxx = p["xx"].idxmax() - assert_groupby_results_equal(pxx, gxx, check_dtype=False) - - gxx = g["xx"].mean() - pxx = p["xx"].mean() - assert_groupby_results_equal(pxx, gxx) - - -def test_groupby_column_numeral(): - pdf = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [1, 2, 3]}) - gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1) - g = gdf.groupby(1) - pxx = p[0].sum() - gxx = g[0].sum() - assert_groupby_results_equal(pxx, gxx) - - pdf = pd.DataFrame({0.5: [1.0, 2.0, 3.0], 1.5: [1, 2, 3]}) - gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1.5) - g = gdf.groupby(1.5) - pxx = p[0.5].sum() - gxx = g[0.5].sum() - assert_groupby_results_equal(pxx, gxx) +def create_test_groupby_apply_return_series_dataframe_params(): + def f0(x): + return x - x.max() + def f1(x): + return x.min() - x.max() -@pytest.mark.parametrize( - "series", - [ - [0, 1, 0], - [1, 1, 1], - [0, 1, 1], - [1, 2, 3], - [4, 3, 2], - [0, 2, 0], - pd.Series([0, 2, 0]), - pd.Series([0, 2, 0], index=[0, 2, 1]), - ], -) -def test_groupby_external_series(series): - pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) - gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series)).x.sum() - gxx = gdf.groupby(cudf.Series(series)).x.sum() - assert_groupby_results_equal(pxx, gxx) + def f2(x): + return x.min() + def f3(x, k): + return x - x.max() + k -@pytest.mark.parametrize("series", [[0.0, 1.0], [1.0, 1.0, 1.0, 1.0]]) -def test_groupby_external_series_incorrect_length(series): - pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) - gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series)).x.sum() - gxx = gdf.groupby(cudf.Series(series)).x.sum() - assert_groupby_results_equal(pxx, gxx) + def f4(x, k, L): + return x.min() - x.max() + (k / L) + def f5(x, k, L, m): + return m * x.min() + (k / L) -@pytest.mark.parametrize( - "level", [0, 1, "a", "b", [0, 1], ["a", "b"], ["a", 1], -1, [-1, -2]] -) -def test_groupby_levels(level): - idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 2)], names=("a", "b")) - pdf = pd.DataFrame({"c": [1, 2, 3], "d": [2, 3, 4]}, index=idx) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - pdf.groupby(level=level).sum(), - gdf.groupby(level=level).sum(), - ) - - -def test_advanced_groupby_levels(): - pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1], "z": [1, 1, 1]}) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdh = pdg.groupby(level=1).sum() - gdh = gdg.groupby(level=1).sum() - assert_groupby_results_equal(pdh, gdh) - pdg = pdf.groupby(["x", "y", "z"]).sum() - gdg = gdf.groupby(["x", "y", "z"]).sum() - assert_groupby_results_equal(pdg, 
gdg) - pdg = pdf.groupby(["z"]).sum() - gdg = gdf.groupby(["z"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby(["y", "z"]).sum() - gdg = gdf.groupby(["y", "z"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby(["x", "z"]).sum() - gdg = gdf.groupby(["x", "z"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby(["y"]).sum() - gdg = gdf.groupby(["y"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby(["x"]).sum() - gdg = gdf.groupby(["x"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdh = pdg.groupby(level=0).sum() - gdh = gdg.groupby(level=0).sum() - assert_groupby_results_equal(pdh, gdh) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() - pdh = pdg.groupby(level=[0, 1]).sum() - gdh = gdg.groupby(level=[0, 1]).sum() - assert_groupby_results_equal(pdh, gdh) - pdh = pdg.groupby(level=[1, 0]).sum() - gdh = gdg.groupby(level=[1, 0]).sum() - assert_groupby_results_equal(pdh, gdh) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() - - assert_exceptions_equal( - lfunc=pdg.groupby, - rfunc=gdg.groupby, - lfunc_args_and_kwargs=([], {"level": 2}), - rfunc_args_and_kwargs=([], {"level": 2}), - ) - - -@pytest.mark.parametrize( - "func", - [ - lambda df: df.groupby(["x", "y", "z"]).sum(), - lambda df: df.groupby(["x", "y"]).sum(), - lambda df: df.groupby(["x", "y"]).agg("sum"), - lambda df: df.groupby(["y"]).sum(), - lambda df: df.groupby(["y"]).agg("sum"), - lambda df: df.groupby(["x"]).sum(), - lambda df: df.groupby(["x"]).agg("sum"), - lambda df: df.groupby(["x", "y"]).z.sum(), - lambda df: df.groupby(["x", "y"]).z.agg("sum"), - ], -) -def test_empty_groupby(func): - pdf = pd.DataFrame({"x": [], "y": [], "z": []}) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal(func(pdf), func(gdf), check_index_type=False) - - -def test_groupby_unsupported_columns(): - rng = np.random.default_rng(seed=12) - pd_cat = pd.Categorical( - pd.Series(rng.choice(["a", "b", 1], 3), dtype="category") - ) - pdf = pd.DataFrame( - { - "x": [1, 2, 3], - "y": ["a", "b", "c"], - "z": ["d", "e", "f"], - "a": [3, 4, 5], - } - ) - pdf["b"] = pd_cat - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x").sum(numeric_only=True) - # cudf does not yet support numeric_only, so our default is False (unlike - # pandas, which defaults to inferring and throws a warning about it). 
- gdg = gdf.groupby("x").sum(numeric_only=True) - assert_groupby_results_equal(pdg, gdg) - - -def test_list_of_series(): - pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1]}) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby([pdf.x]).y.sum() - gdg = gdf.groupby([gdf.x]).y.sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby([pdf.x, pdf.y]).y.sum() - gdg = gdf.groupby([gdf.x, gdf.y]).y.sum() - pytest.skip() - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_use_agg_column_as_index(): - pdf = pd.DataFrame() - pdf["a"] = [1, 1, 1, 3, 5] - gdf = cudf.DataFrame() - gdf["a"] = [1, 1, 1, 3, 5] - pdg = pdf.groupby("a").agg({"a": "count"}) - gdg = gdf.groupby("a").agg({"a": "count"}) - assert_groupby_results_equal(pdg, gdg, check_dtype=False) - - -def test_groupby_list_then_string(): - gdf = cudf.DataFrame() - gdf["a"] = [0, 1, 0, 1, 2] - gdf["b"] = [11, 2, 15, 12, 2] - gdf["c"] = [6, 7, 6, 7, 6] - pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( - {"b": ["min", "max"], "c": "max"} - ) - pdg = pdf.groupby("a", as_index=True).agg( - {"b": ["min", "max"], "c": "max"} - ) - assert_groupby_results_equal(gdg, pdg) - - -def test_groupby_different_unequal_length_column_aggregations(): - gdf = cudf.DataFrame() - gdf["a"] = [0, 1, 0, 1, 2] - gdf["b"] = [11, 2, 15, 12, 2] - gdf["c"] = [11, 2, 15, 12, 2] - pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( - {"b": "min", "c": ["max", "min"]} - ) - pdg = pdf.groupby("a", as_index=True).agg( - {"b": "min", "c": ["max", "min"]} - ) - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_single_var_two_aggs(): - gdf = cudf.DataFrame() - gdf["a"] = [0, 1, 0, 1, 2] - gdf["b"] = [11, 2, 15, 12, 2] - gdf["c"] = [11, 2, 15, 12, 2] - pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) - pdg = pdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_double_var_two_aggs(): - gdf = cudf.DataFrame() - gdf["a"] = [0, 1, 0, 1, 2] - gdf["b"] = [11, 2, 15, 12, 2] - gdf["c"] = [11, 2, 15, 12, 2] - pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) - pdg = pdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_apply_basic_agg_single_column(): - gdf = DataFrame() - gdf["key"] = [0, 0, 1, 1, 2, 2, 0] - gdf["val"] = [0, 1, 2, 3, 4, 5, 6] - gdf["mult"] = gdf["key"] * gdf["val"] - pdf = gdf.to_pandas() - - gdg = gdf.groupby(["key", "val"]).mult.sum() - pdg = pdf.groupby(["key", "val"]).mult.sum() - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_multi_agg_single_groupby_series(): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame( - { - "x": rng.integers(0, 5, size=10000), - "y": rng.normal(size=10000), - } - ) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x").y.agg(["sum", "max"]) - gdg = gdf.groupby("x").y.agg(["sum", "max"]) - - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_multi_agg_multi_groupby(): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame( - { - "a": rng.integers(0, 5, 10), - "b": rng.integers(0, 5, 10), - "c": rng.integers(0, 5, 10), - "d": rng.integers(0, 5, 10), - } - ) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_datetime_multi_agg_multi_groupby(): - rng = 
np.random.default_rng(seed=0) - pdf = pd.DataFrame( - { - "a": pd.date_range( - datetime.datetime.now(), - datetime.datetime.now() + datetime.timedelta(9), - freq="D", - ), - "b": rng.integers(0, 5, 10), - "c": rng.integers(0, 5, 10), - "d": rng.integers(0, 5, 10), - } - ) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) - - assert_groupby_results_equal(pdg, gdg) + return [ + (f0, ()), + (f1, ()), + (f2, ()), + (f3, (42,)), + (f4, (42, 119)), + (f5, (41, 119, 212.1)), + ] @pytest.mark.parametrize( - "agg", - [ - ["min", "max", "count", "mean"], - ["mean", "var", "std"], - ["count", "mean", "var", "std"], - ], + "func,args", create_test_groupby_apply_return_series_dataframe_params() ) -def test_groupby_multi_agg_hash_groupby(agg): - coll_dict = {letter: float for letter in string.ascii_lowercase} - coll_dict["id"] = int - gdf = cudf.datasets.timeseries( - start="2000", - end="2000-01-2", - dtypes=coll_dict, - freq="1s", - seed=1, - ).reset_index(drop=True) - pdf = gdf.to_pandas() - check_dtype = "count" not in agg - pdg = pdf.groupby("id").agg(agg) - gdg = gdf.groupby("id").agg(agg) - assert_groupby_results_equal(pdg, gdg, check_dtype=check_dtype) - - @pytest.mark.skipif( PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="previous verion of pandas throws a warning", -) -@pytest.mark.parametrize( - "agg", ["min", "max", "idxmax", "idxmin", "sum", "prod", "count", "mean"] + reason="Include groups missing on old versions of pandas", ) -def test_groupby_nulls_basic(agg): - check_dtype = agg not in _index_type_aggs - - pdf = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": [1, 2, 1, 2, 1, None]}) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - getattr(pdf.groupby("a"), agg)(), - getattr(gdf.groupby("a"), agg)(), - check_dtype=check_dtype, - ) - - pdf = pd.DataFrame( - { - "a": [0, 0, 1, 1, 2, 2], - "b": [1, 2, 1, 2, 1, None], - "c": [1, 2, 1, None, 1, 2], - } - ) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - getattr(pdf.groupby("a"), agg)(), - getattr(gdf.groupby("a"), agg)(), - check_dtype=check_dtype, - ) - - pdf = pd.DataFrame( - { - "a": [0, 0, 1, 1, 2, 2], - "b": [1, 2, 1, 2, 1, None], - "c": [1, 2, None, None, 1, 2], - } - ) - gdf = cudf.from_pandas(pdf) - - # TODO: fillna() used here since we don't follow - # Pandas' null semantics. Should we change it? 
- - assert_groupby_results_equal( - getattr(pdf.groupby("a"), agg)().fillna(0), - getattr(gdf.groupby("a"), agg)().fillna(0 if agg != "prod" else 1), - check_dtype=check_dtype, - ) - - -def test_groupby_nulls_in_index(): - pdf = pd.DataFrame({"a": [None, 2, 1, 1], "b": [1, 2, 3, 4]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").sum(), gdf.groupby("a").sum() - ) - - -def test_groupby_all_nulls_index(): - gdf = cudf.DataFrame( - { - "a": cudf.Series([None, None, None, None], dtype="object"), - "b": [1, 2, 3, 4], - } - ) - pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby("a").sum(), gdf.groupby("a").sum() - ) - - gdf = cudf.DataFrame( - {"a": cudf.Series([np.nan, np.nan, np.nan, np.nan]), "b": [1, 2, 3, 4]} - ) - pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby("a").sum(), gdf.groupby("a").sum() - ) - - -@pytest.mark.parametrize("sort", [True, False]) -def test_groupby_sort(sort): - pdf = pd.DataFrame({"a": [2, 2, 1, 1], "b": [1, 2, 3, 4]}) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.groupby("a", sort=sort).sum(), - gdf.groupby("a", sort=sort).sum(), - check_like=not sort, - ) - +def test_groupby_apply_return_series_dataframe(func, args): pdf = pd.DataFrame( - {"c": [-1, 2, 1, 4], "b": [1, 2, 3, 4], "a": [2, 2, 1, 1]} + {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} ) gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.groupby(["c", "b"], sort=sort).sum(), - gdf.groupby(["c", "b"], sort=sort).sum(), - check_like=not sort, - ) - - ps = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=[2, 2, 2, 3, 3, 1, 1, 1]) - gs = cudf.from_pandas(ps) - - assert_eq( - ps.groupby(level=0, sort=sort).sum().to_frame(), - gs.groupby(level=0, sort=sort).sum().to_frame(), - check_like=not sort, - ) - - ps = pd.Series( - [1, 2, 3, 4, 5, 6, 7, 8], - index=pd.MultiIndex.from_product([(1, 2), ("a", "b"), (42, 84)]), - ) - gs = cudf.from_pandas(ps) - - assert_eq( - ps.groupby(level=0, sort=sort).sum().to_frame(), - gs.groupby(level=0, sort=sort).sum().to_frame(), - check_like=not sort, - ) - - -def test_groupby_cat(): - pdf = pd.DataFrame( - {"a": [1, 1, 2], "b": pd.Series(["b", "b", "a"], dtype="category")} - ) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - pdf.groupby("a").count(), - gdf.groupby("a").count(), - check_dtype=False, + expected = pdf.groupby(["key"], group_keys=False).apply( + func, *args, include_groups=False ) + actual = gdf.groupby(["key"]).apply(func, *args, include_groups=False) - -def test_groupby_index_type(): - df = cudf.DataFrame() - df["string_col"] = ["a", "b", "c"] - df["counts"] = [1, 2, 3] - res = df.groupby(by="string_col").counts.sum() - assert res.index.dtype == cudf.dtype("object") + assert_groupby_results_equal(expected, actual) @pytest.mark.parametrize( - "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] + "pdf", + [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], ) -@pytest.mark.parametrize("q", [0.25, 0.4, 0.5, 0.7, 1]) -def test_groupby_quantile(request, interpolation, q): - request.applymarker( - pytest.mark.xfail( - condition=(q == 0.5 and interpolation == "nearest"), - reason=( - "Pandas NaN Rounding will fail nearest interpolation at 0.5" - ), - ) - ) - - raw_data = { - "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9], - "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2], - } - # Pandas>0.25 now casts NaN in quantile operations as a float64 - # # so we are filling with zeros. 
- pdf = pd.DataFrame(raw_data).fillna(0) - gdf = DataFrame.from_pandas(pdf) - - pdg = pdf.groupby("x") - gdg = gdf.groupby("x") - - pdresult = pdg.quantile(q, interpolation=interpolation) - gdresult = gdg.quantile(q, interpolation=interpolation) - - assert_groupby_results_equal(pdresult, gdresult) - - -def test_groupby_std(): - raw_data = { - "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2], - "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9], - } - pdf = pd.DataFrame(raw_data) - gdf = DataFrame.from_pandas(pdf) - pdg = pdf.groupby("x") - gdg = gdf.groupby("x") - pdresult = pdg.std() - gdresult = gdg.std() - - assert_groupby_results_equal(pdresult, gdresult) - - -def test_groupby_size(): - pdf = pd.DataFrame( - { - "a": [1, 1, 3, 4], - "b": ["bob", "bob", "alice", "cooper"], - "c": [1, 2, 3, 4], - } - ) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").size(), - gdf.groupby("a").size(), - check_dtype=False, - ) - - assert_groupby_results_equal( - pdf.groupby(["a", "b", "c"]).size(), - gdf.groupby(["a", "b", "c"]).size(), - check_dtype=False, - ) - - sr = pd.Series(range(len(pdf))) - assert_groupby_results_equal( - pdf.groupby(sr).size(), - gdf.groupby(sr).size(), - check_dtype=False, - ) - - -@pytest.mark.parametrize("index", [None, [1, 2, 3, 4]]) -def test_groupby_cumcount(index): - pdf = pd.DataFrame( - { - "a": [1, 1, 3, 4], - "b": ["bob", "bob", "alice", "cooper"], - "c": [1, 2, 3, 4], - }, - index=index, - ) +def test_groupby_apply_no_keys(pdf): gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").cumcount(), - gdf.groupby("a").cumcount(), - check_dtype=False, - ) - - assert_groupby_results_equal( - pdf.groupby(["a", "b", "c"]).cumcount(), - gdf.groupby(["a", "b", "c"]).cumcount(), - check_dtype=False, - ) - - sr = pd.Series(range(len(pdf)), index=index) - assert_groupby_results_equal( - pdf.groupby(sr).cumcount(), - gdf.groupby(sr).cumcount(), - check_dtype=False, - ) - - -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize( - "agg", ["min", "max", "idxmin", "idxmax", "mean", "count"] -) -def test_groupby_datetime(request, as_index, agg): - nelem = 20 - if agg == "mean" and as_index is True: - request.applymarker( - pytest.mark.xfail(reason="Invalid type/aggregation combination") - ) - check_dtype = agg not in ("mean", "count", "idxmin", "idxmax") - pdf = make_frame(pd.DataFrame, nelem=nelem, with_datetime=True) - gdf = make_frame(cudf.DataFrame, nelem=nelem, with_datetime=True) - pdg = pdf.groupby("datetime", as_index=as_index) - gdg = gdf.groupby("datetime", as_index=as_index) - if as_index is False: - pdres = getattr(pdg, agg)() - gdres = getattr(gdg, agg)() + if isinstance(pdf, pd.DataFrame): + kwargs = {"check_column_type": False} else: - pdres = pdg.agg({"datetime": agg}) - gdres = gdg.agg({"datetime": agg}) - assert_groupby_results_equal( - pdres, - gdres, - check_dtype=check_dtype, - as_index=as_index, - by=["datetime"], - ) - - -def test_groupby_dropna(): - df = cudf.DataFrame({"a": [1, 1, None], "b": [1, 2, 3]}) - expect = cudf.DataFrame( - {"b": [3, 3]}, index=cudf.Series([1, None], name="a") - ) - got = df.groupby("a", dropna=False).sum() - assert_groupby_results_equal(expect, got) - - df = cudf.DataFrame( - {"a": [1, 1, 1, None], "b": [1, None, 1, None], "c": [1, 2, 3, 4]} - ) - idx = cudf.MultiIndex.from_frame( - df[["a", "b"]].drop_duplicates().sort_values(["a", "b"]), - names=["a", "b"], - ) - expect = cudf.DataFrame({"c": [4, 2, 4]}, index=idx) - got = df.groupby(["a", "b"], dropna=False).sum() - - 
assert_groupby_results_equal(expect, got) - - -def test_groupby_dropna_getattr(): - df = cudf.DataFrame() - df["id"] = [0, 1, 1, None, None, 3, 3] - df["val"] = [0, 1, 1, 2, 2, 3, 3] - got = df.groupby("id", dropna=False).val.sum() - - expect = cudf.Series( - [0, 2, 6, 4], name="val", index=cudf.Series([0, 1, 3, None], name="id") - ) - - assert_groupby_results_equal(expect, got) - - -def test_groupby_categorical_from_string(): - gdf = cudf.DataFrame() - gdf["id"] = ["a", "b", "c"] - gdf["val"] = [0, 1, 2] - gdf["id"] = gdf["id"].astype("category") - assert_groupby_results_equal( - cudf.DataFrame({"val": gdf["val"]}).set_index(keys=gdf["id"]), - gdf.groupby("id").sum(), - ) - - -def test_groupby_arbitrary_length_series(): - gdf = cudf.DataFrame({"a": [1, 1, 2], "b": [2, 3, 4]}, index=[4, 5, 6]) - gsr = cudf.Series([1.0, 2.0, 2.0], index=[3, 4, 5]) - - pdf = gdf.to_pandas() - psr = gsr.to_pandas() - - expect = pdf.groupby(psr).sum() - got = gdf.groupby(gsr).sum() - - assert_groupby_results_equal(expect, got) - - -def test_groupby_series_same_name_as_dataframe_column(): - gdf = cudf.DataFrame({"a": [1, 1, 2], "b": [2, 3, 4]}, index=[4, 5, 6]) - gsr = cudf.Series([1.0, 2.0, 2.0], name="a", index=[3, 4, 5]) - - pdf = gdf.to_pandas() - psr = gsr.to_pandas() - - expect = pdf.groupby(psr).sum() - got = gdf.groupby(gsr).sum() - - assert_groupby_results_equal(expect, got) - - -def test_group_by_series_and_column_name_in_by(): - gdf = cudf.DataFrame( - {"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}, index=[1, 2, 3] - ) - gsr0 = cudf.Series([0.0, 1.0, 2.0], name="a", index=[1, 2, 3]) - gsr1 = cudf.Series([0.0, 1.0, 3.0], name="b", index=[3, 4, 5]) - - pdf = gdf.to_pandas() - psr0 = gsr0.to_pandas() - psr1 = gsr1.to_pandas() - - expect = pdf.groupby(["x", psr0, psr1]).sum() - got = gdf.groupby(["x", gsr0, gsr1]).sum() - - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize( - "grouper", - [ - "a", - ["a"], - ["a", "b"], - np.array([0, 1, 1, 2, 3, 2]), - {0: "a", 1: "a", 2: "b", 3: "a", 4: "b", 5: "c"}, - lambda x: x + 1, - ["a", np.array([0, 1, 1, 2, 3, 2])], - ], -) -def test_grouping(grouper): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2, 3], - "b": [1, 2, 1, 2, 1, 2], - "c": [1, 2, 3, 4, 5, 6], - } - ) - gdf = cudf.from_pandas(pdf) - - for pdf_group, gdf_group in zip( - pdf.groupby(grouper), gdf.groupby(grouper), strict=True - ): - assert pdf_group[0] == gdf_group[0] - assert_eq(pdf_group[1], gdf_group[1]) - - -@pytest.mark.parametrize("agg", [lambda x: x.count(), "count"]) -@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) -def test_groupby_count(agg, by): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]} - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby(by).agg(agg) - got = gdf.groupby(by).agg(agg) - - assert_groupby_results_equal(expect, got, check_dtype=True) - - -@pytest.mark.parametrize("agg", [lambda x: x.median(), "median"]) -@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) -def test_groupby_median(agg, by): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]} - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby(by).agg(agg) - got = gdf.groupby(by).agg(agg) - - assert_groupby_results_equal(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("agg", [lambda x: x.nunique(), "nunique"]) -@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) -def test_groupby_nunique(agg, by): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 
1], "c": [1, 2, None, 4, 5]} - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby(by).nunique() - got = gdf.groupby(by).nunique() - - assert_groupby_results_equal(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("dropna", [True, False]) -def test_nunique_dropna(dropna): - gdf = cudf.DataFrame( - { - "a": [1, 1, 2], - "b": [4, None, 5], - "c": [None, None, 7], - "d": [1, 1, 3], - } - ) - pdf = gdf.to_pandas() - - result = gdf.groupby("a")["b"].nunique(dropna=dropna) - expected = pdf.groupby("a")["b"].nunique(dropna=dropna) - assert_groupby_results_equal(result, expected, check_dtype=False) - - -@pytest.mark.parametrize( - "n", - [0, 1, 2, 10], -) -@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) -def test_groupby_nth(n, by): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 3], - "b": [1, 2, 2, 2, 1], - "c": [1, 2, None, 4, 5], - "d": ["a", "b", "c", "d", "e"], - } - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby(by).nth(n) - got = gdf.groupby(by).nth(n) - - assert_groupby_results_equal(expect, got, check_dtype=False) - - -def test_raise_data_error(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - gdf = cudf.from_pandas(pdf) - - assert_exceptions_equal( - pdf.groupby("a").mean, - gdf.groupby("a").mean, - ) - - -def test_multi_agg(): - gdf = cudf.DataFrame( - {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]} - ) - pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), - gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), - ) - - -@pytest.mark.parametrize( - "agg", - ( - [ - *itertools.combinations(["count", "max", "min", "nunique"], 2), - {"b": "min", "c": "mean"}, - {"b": "max", "c": "mean"}, - {"b": "count", "c": "mean"}, - {"b": "nunique", "c": "mean"}, - ] - ), -) -def test_groupby_agg_combinations(agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 2, 2, 3], - "b": ["a", "a", "b", "c", "d"], - "c": [1, 2, 3, 4, 5], - } - ) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg(agg), - gdf.groupby("a").agg(agg), - check_dtype=False, - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_noempty_group(): - pdf = pd.DataFrame( - {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} - ) - gdf = cudf.from_pandas(pdf) - - expect = ( - pdf.groupby("a", group_keys=False) - .apply(lambda x: x.iloc[[0, 1]], include_groups=False) - .reset_index(drop=True) - ) - got = ( - gdf.groupby("a") - .apply(lambda x: x.iloc[[0, 1]], include_groups=False) - .reset_index(drop=True) - ) - assert_groupby_results_equal(expect, got) - - -def test_reset_index_after_empty_groupby(): - # GH #5475 - pdf = pd.DataFrame({"a": [1, 2, 3]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").sum().reset_index(), - gdf.groupby("a").sum().reset_index(), - as_index=False, - by="a", - ) - - -def test_groupby_attribute_error(): - err_msg = "Test error message" - - class TestGroupBy(cudf.core.groupby.GroupBy): - @property - def _groupby(self): - raise AttributeError(err_msg) - - a = cudf.DataFrame({"a": [1, 2], "b": [2, 3]}) - gb = TestGroupBy(a, a["a"]) - - with pytest.raises(AttributeError, match=err_msg): - gb.sum() - - -@pytest.mark.parametrize( - "by", - [ - "a", - "b", - ["a"], - ["b"], - ["a", "b"], - ["b", "a"], - np.array([0, 0, 0, 1, 1, 1, 2]), - ], -) -def test_groupby_groups(by): - pdf = 
pd.DataFrame( - {"a": [1, 2, 1, 2, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6, 7]} - ) - gdf = cudf.from_pandas(pdf) - - pdg = pdf.groupby(by) - gdg = gdf.groupby(by) - - for key in pdg.groups: - assert key in gdg.groups - assert_eq(pdg.groups[key], gdg.groups[key]) - - -@pytest.mark.parametrize( - "by", - [ - "a", - "b", - ["a"], - ["b"], - ["a", "b"], - ["b", "a"], - ["a", "c"], - ["a", "b", "c"], - ], -) -def test_groupby_groups_multi(by): - pdf = pd.DataFrame( - { - "a": [1, 2, 1, 2, 1, 2, 3], - "b": ["a", "b", "a", "b", "b", "c", "c"], - "c": [1, 2, 3, 4, 5, 6, 7], - } - ) - gdf = cudf.from_pandas(pdf) - - pdg = pdf.groupby(by) - gdg = gdf.groupby(by) - - for key in pdg.groups: - assert key in gdg.groups - assert_eq(pdg.groups[key], gdg.groups[key]) - - -def test_groupby_nunique_series(): - pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 1, 1, 2]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a")["b"].nunique(), - gdf.groupby("a")["b"].nunique(), - check_dtype=False, - ) - - -@pytest.mark.parametrize("list_agg", [list, "collect"]) -def test_groupby_list_simple(list_agg): - pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, None, 4, 5, 6]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": list}), - gdf.groupby("a").agg({"b": list_agg}), - check_dtype=False, - ) - - -@pytest.mark.parametrize("list_agg", [list, "collect"]) -def test_groupby_list_of_lists(list_agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2, 2], - "b": [[1, 2], [3, None, 5], None, [], [7, 8], [9]], - } - ) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": list}), - gdf.groupby("a").agg({"b": list_agg}), - check_dtype=False, - ) - - -@pytest.mark.parametrize("list_agg", [list, "collect"]) -def test_groupby_list_of_structs(list_agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2, 2], - "b": [ - {"c": "1", "d": 1}, - {"c": "2", "d": 2}, - {"c": "3", "d": 3}, - {"c": "4", "d": 4}, - {"c": "5", "d": 5}, - {"c": "6", "d": 6}, - ], - } - ) - gdf = cudf.from_pandas(pdf) - grouped = gdf.groupby("a").agg({"b": list_agg}) - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": list}), - grouped, - check_dtype=True, - ) - assert grouped["b"].dtype.element_type == gdf["b"].dtype - - -@pytest.mark.parametrize("list_agg", [list, "collect"]) -def test_groupby_list_single_element(list_agg): - pdf = pd.DataFrame({"a": [1, 2], "b": [3, None]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": list}), - gdf.groupby("a").agg({"b": list_agg}), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "agg", [list, [list, "count"], {"b": list, "c": "sum"}] -) -def test_groupby_list_strings(agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": ["b", "a", None, "e", "d"], - "c": [1, 2, 3, 4, 5], - } - ) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg(agg), - gdf.groupby("a").agg(agg), - check_dtype=False, - ) - - -def test_groupby_list_columns_excluded(): - pdf = pd.DataFrame( - { - "a": [1, 1, 2, 2], - "b": [1, 2, 3, 4], - "c": [[1, 2], [3, 4], [5, 6], [7, 8]], - } - ) - gdf = cudf.from_pandas(pdf) - - pandas_result = pdf.groupby("a").mean(numeric_only=True) - pandas_agg_result = pdf.groupby("a").agg("mean", numeric_only=True) - - assert_groupby_results_equal( - pandas_result, - gdf.groupby("a").mean(numeric_only=True), - check_dtype=False, - ) - - assert_groupby_results_equal( - pandas_agg_result, - 
gdf.groupby("a").agg("mean"), - check_dtype=False, - ) - - -def test_groupby_pipe(): - pdf = pd.DataFrame({"A": "a b a b".split(), "B": [1, 2, 3, 4]}) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("A").pipe(lambda x: x.max() - x.min()) - actual = gdf.groupby("A").pipe(lambda x: x.max() - x.min()) - - assert_groupby_results_equal(expected, actual) - - -def create_test_groupby_apply_return_scalars_params(): - def f0(x): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = ticker / 10 - return full - - def f1(x, k): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = ticker / k - return full - - def f2(x, k, L): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = L * (ticker / k) - return full - - def f3(x, k, L, m): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = L * (ticker / k) % m - return full - - return [(f0, ()), (f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] - - -@pytest.mark.parametrize( - "func,args", create_test_groupby_apply_return_scalars_params() -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_return_scalars(func, args): - pdf = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], - "B": [ - 0.01, - np.nan, - 0.03, - 0.04, - np.nan, - 0.06, - 0.07, - 0.08, - 0.09, - 1.0, - ], - } - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("A").apply(func, *args, include_groups=False) - actual = gdf.groupby("A").apply(func, *args, include_groups=False) - - assert_groupby_results_equal(expected, actual) - - -def create_test_groupby_apply_return_series_dataframe_params(): - def f0(x): - return x - x.max() - - def f1(x): - return x.min() - x.max() - - def f2(x): - return x.min() - - def f3(x, k): - return x - x.max() + k - - def f4(x, k, L): - return x.min() - x.max() + (k / L) - - def f5(x, k, L, m): - return m * x.min() + (k / L) - - return [ - (f0, ()), - (f1, ()), - (f2, ()), - (f3, (42,)), - (f4, (42, 119)), - (f5, (41, 119, 212.1)), - ] - - -@pytest.mark.parametrize( - "func,args", create_test_groupby_apply_return_series_dataframe_params() -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_return_series_dataframe(func, args): - pdf = pd.DataFrame( - {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby(["key"], group_keys=False).apply( - func, *args, include_groups=False - ) - actual = gdf.groupby(["key"]).apply(func, *args, include_groups=False) - - assert_groupby_results_equal(expected, actual) - - -@pytest.mark.parametrize( - "pdf", - [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], -) -def test_groupby_no_keys(pdf): - gdf = cudf.from_pandas(pdf) - if isinstance(pdf, pd.DataFrame): - kwargs = {"check_column_type": False} - else: - kwargs = {} - assert_groupby_results_equal( - pdf.groupby([]).max(), - gdf.groupby([]).max(), - check_dtype=False, - check_index_type=False, # Int64 v/s Float64 - **kwargs, - ) - - -@pytest.mark.parametrize( - "pdf", - [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], -) -def test_groupby_apply_no_keys(pdf): - gdf = cudf.from_pandas(pdf) - if isinstance(pdf, pd.DataFrame): - kwargs = {"check_column_type": False} - else: - kwargs = {} + kwargs = {} assert_groupby_results_equal( pdf.groupby([], group_keys=False).apply(lambda x: x.max()), gdf.groupby([]).apply(lambda x: x.max()), @@ 
-2442,623 +839,6 @@ def test_groupby_apply_no_keys(pdf): ) -@pytest.mark.parametrize( - "pdf", - [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [1, 2], "b": [2, 3]})], -) -def test_groupby_nonempty_no_keys(pdf): - gdf = cudf.from_pandas(pdf) - assert_exceptions_equal( - lambda: pdf.groupby([]), - lambda: gdf.groupby([]), - ) - - -@pytest.mark.parametrize( - "by,data", - [ - pytest.param( - [], - [], - marks=pytest.mark.xfail(reason="dtype always cast to object"), - ), - ([1, 1, 2, 2], [0, 0, 1, 1]), - ([1, 2, 3, 4], [0, 0, 0, 0]), - ([1, 2, 1, 2], [0, 1, 1, 1]), - ], -) -@pytest.mark.parametrize( - "dtype", - SIGNED_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["string", "category"], -) -def test_groupby_unique(by, data, dtype): - pdf = pd.DataFrame({"by": by, "data": data}) - pdf["data"] = pdf["data"].astype(dtype) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby("by")["data"].unique() - got = gdf.groupby("by")["data"].unique() - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize( - "func", ["cummin", "cummax", "cumcount", "cumsum", "cumprod"] -) -def test_groupby_2keys_scan(func): - nelem = 20 - pdf = make_frame(pd.DataFrame, nelem=nelem) - expect_df = pdf.groupby(["x", "y"], sort=True).agg(func) - gdf = cudf.from_pandas(pdf) - got_df = gdf.groupby(["x", "y"], sort=True).agg(func) - # pd.groupby.cumcount returns a series. - if isinstance(expect_df, pd.Series): - expect_df = expect_df.to_frame("val") - - check_dtype = func not in _index_type_aggs - assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) - - expect_df = getattr(pdf.groupby(["x", "y"], sort=True), func)() - got_df = getattr(gdf.groupby(["x", "y"], sort=True), func)() - assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) - - expect_df = getattr(pdf.groupby(["x", "y"], sort=True)[["x"]], func)() - got_df = getattr(gdf.groupby(["x", "y"], sort=True)[["x"]], func)() - assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) - - expect_df = getattr(pdf.groupby(["x", "y"], sort=True)["y"], func)() - got_df = getattr(gdf.groupby(["x", "y"], sort=True)["y"], func)() - assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) - - -@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) -@pytest.mark.parametrize("pct", [False, True]) -def test_groupby_2keys_rank(method, ascending, na_option, pct): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - pdf.columns = ["x", "y", "z"] - gdf = cudf.from_pandas(pdf) - expect_df = pdf.groupby(["x", "y"], sort=True).rank( - method=method, ascending=ascending, na_option=na_option, pct=pct - ) - got_df = gdf.groupby(["x", "y"], sort=True).rank( - method=method, ascending=ascending, na_option=na_option, pct=pct - ) - - assert_groupby_results_equal(got_df, expect_df, check_dtype=False) - - -def test_groupby_rank_fails(): - gdf = cudf.DataFrame( - {"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]} - ) - with pytest.raises(NotImplementedError): - gdf.groupby(["x", "y"]).rank(method="min", axis=1) - gdf = cudf.DataFrame( - { - "a": [1, 1, 1, 2, 2, 2], - "b": [[1, 2], [3, None, 5], None, [], [7, 8], 
[9]], - } - ) - with pytest.raises(NotImplementedError): - gdf.groupby(["a"]).rank(method="min", axis=1) - - -@pytest.mark.parametrize( - "with_nan", [False, True], ids=["just-NA", "also-NaN"] -) -@pytest.mark.parametrize("dropna", [False, True], ids=["keepna", "dropna"]) -@pytest.mark.parametrize( - "duplicate_index", [False, True], ids=["rangeindex", "dupindex"] -) -def test_groupby_scan_null_keys(with_nan, dropna, duplicate_index): - key_col = [None, 1, 2, None, 3, None, 3, 1, None, 1] - if with_nan: - df = pd.DataFrame( - {"key": pd.Series(key_col, dtype="float32"), "value": range(10)} - ) - else: - df = pd.DataFrame( - {"key": pd.Series(key_col, dtype="Int32"), "value": range(10)} - ) - - if duplicate_index: - # Non-default index with duplicates - df.index = [1, 2, 3, 1, 3, 2, 4, 1, 6, 10] - - cdf = cudf.from_pandas(df) - - expect = df.groupby("key", dropna=dropna).cumsum() - got = cdf.groupby("key", dropna=dropna).cumsum() - assert_eq(expect, got) - - -def test_groupby_mix_agg_scan(): - err_msg = "Cannot perform both aggregation and scan in one operation" - func = ["cumsum", "sum"] - gb = make_frame(DataFrame, nelem=10).groupby(["x", "y"], sort=True) - - gb.agg(func[0]) - gb.agg(func[1]) - gb.agg(func[1:]) - with pytest.raises(NotImplementedError, match=err_msg): - gb.agg(func) - - -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -@pytest.mark.parametrize("fill_value", [None, np.nan, 42]) -def test_groupby_shift_row(shift_perc, direction, fill_value): - nelem = 20 - pdf = make_frame(pd.DataFrame, nelem=nelem, extra_vals=["val2"]) - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["x", "y"]).shift( - periods=n_shift, fill_value=fill_value - ) - got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value) - - assert_groupby_results_equal( - expected[["val", "val2"]], got[["val", "val2"]] - ) - - -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -@pytest.mark.parametrize( - "fill_value", - [ - None, - pytest.param( - 0, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/10608" - ), - ), - pytest.param( - 42, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/10608" - ), - ), - ], -) -def test_groupby_shift_row_mixed_numerics(shift_perc, direction, fill_value): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) - got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -# TODO: Shifting list columns is currently unsupported because we cannot -# construct a null list scalar in python. Support once it is added. 
-@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -def test_groupby_shift_row_mixed(shift_perc, direction): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["0"]).shift(periods=n_shift) - got = gdf.groupby(["0"]).shift(periods=n_shift) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -@pytest.mark.parametrize( - "fill_value", - [ - [ - 42, - "fill", - np.datetime64(123, "ns"), - np.timedelta64(456, "ns"), - ] - ], -) -def test_groupby_shift_row_mixed_fill(shift_perc, direction, fill_value): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - # Pandas does not support specifying different fill_value by column, so we - # simulate it column by column - expected = pdf.copy() - for col, single_fill in zip(pdf.iloc[:, 1:], fill_value, strict=True): - expected[col] = ( - pdf[col] - .groupby(pdf["0"]) - .shift(periods=n_shift, fill_value=single_fill) - ) - - got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -@pytest.mark.parametrize("fill_value", [None, 0, 42]) -def test_groupby_shift_row_zero_shift(fill_value): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - gdf = cudf.from_pandas(t.to_pandas()) - - expected = gdf - got = gdf.groupby(["0"]).shift(periods=0, fill_value=fill_value) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -def test_groupby_diff_row(shift_perc, direction): - nelem = 20 - pdf = make_frame(pd.DataFrame, nelem=nelem, extra_vals=["val2"]) - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["x", "y"]).diff(periods=n_shift) - got = gdf.groupby(["x", "y"]).diff(periods=n_shift) - - 
assert_groupby_results_equal( - expected[["val", "val2"]], got[["val", "val2"]] - ) - - -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -def test_groupby_diff_row_mixed_numerics(shift_perc, direction): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["0"]).diff(periods=n_shift) - got = gdf.groupby(["0"]).diff(periods=n_shift) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4", "5"]], got[["1", "2", "3", "4", "5"]] - ) - - -def test_groupby_diff_row_zero_shift(): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - gdf = cudf.from_pandas(t.to_pandas()) - - expected = gdf - got = gdf.groupby(["0"]).shift(periods=0) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -def test_groupby_fillna_multi_value(): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ms]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - key_col = "0" - value_cols = ["1", "2", "3", "4", "5", "6"] - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - - # fill the dataframe with the first non-null item in the column - fill_values = { - name: pdf[name].loc[pdf[name].first_valid_index()] - for name in value_cols - } - # cudf can't fillna with a pandas.Timedelta type - fill_values["4"] = fill_values["4"].to_numpy() - with pytest.warns(FutureWarning): - expect = pdf.groupby(key_col).fillna(value=fill_values) - with pytest.warns(FutureWarning): - got = gdf.groupby(key_col).fillna(value=fill_values) - - assert_groupby_results_equal(expect[value_cols], got[value_cols]) - - -# TODO: cudf.fillna does not support decimal column to column fill yet -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -def test_groupby_fillna_multi_value_df(): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": 
"int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ms]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - key_col = "0" - value_cols = ["1", "2", "3", "4", "5"] - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - - # fill the dataframe with the first non-null item in the column - fill_values = { - name: pdf[name].loc[pdf[name].first_valid_index()] - for name in value_cols - } - # cudf can't fillna with a pandas.Timedelta type - fill_values["4"] = fill_values["4"].to_numpy() - fill_values = pd.DataFrame(fill_values, index=pdf.index) - with pytest.warns(FutureWarning): - expect = pdf.groupby(key_col).fillna(value=fill_values) - - fill_values = cudf.from_pandas(fill_values) - with pytest.warns(FutureWarning): - got = gdf.groupby(key_col).fillna(value=fill_values) - - assert_groupby_results_equal(expect[value_cols], got[value_cols]) - - -@pytest.mark.parametrize( - "by", - [pd.Series([1, 1, 2, 2, 3, 4]), lambda x: x % 2 == 0, pd.Grouper(level=0)], -) -@pytest.mark.parametrize( - "data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]] -) -@pytest.mark.parametrize("args", [{"value": 42}, {"method": "ffill"}]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -def test_groupby_various_by_fillna(by, data, args): - ps = pd.Series(data) - gs = cudf.from_pandas(ps) - - with pytest.warns(FutureWarning): - expect = ps.groupby(by).fillna(**args) - if isinstance(by, pd.Grouper): - by = cudf.Grouper(level=by.level) - with pytest.warns(FutureWarning): - got = gs.groupby(by).fillna(**args) - - assert_groupby_results_equal(expect, got, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("method", ["ffill", "bfill"]) -def test_groupby_fillna_method(method): - nelem = 20 - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "list", - "null_frequency": 0.4, - "cardinality": 10, - "lists_max_length": 10, - "nesting_max_depth": 3, - "value_type": "int64", - }, - {"dtype": "category", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - key_col = "0" - value_cols = ["1", "2", "3", "4", "5", "6", "7", "8"] - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - - with pytest.warns(FutureWarning): - expect = pdf.groupby(key_col).fillna(method=method) - with pytest.warns(FutureWarning): - got = gdf.groupby(key_col).fillna(method=method) - - assert_groupby_results_equal( - expect[value_cols], got[value_cols], sort=False - ) - - @pytest.mark.parametrize( "data", [ From 9ea44e8ab91f0fb50a4f1c9d55c3d4f4bfc32dd6 Mon Sep 17 
00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 13 Aug 2025 15:51:43 -0700 Subject: [PATCH 124/366] Move test_offset/repr.py to new cudf classic testing directory (#19677) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19677 --- python/cudf/cudf/tests/dataframe/test_repr.py | 484 ++++++ .../tests/dateoffset/test_constructors.py | 55 + .../indexes/categoricalindex/test_repr.py | 46 + .../cudf/tests/indexes/index/test_repr.py | 128 ++ .../tests/indexes/intervalindex/__init__.py | 0 .../tests/indexes/intervalindex/test_repr.py | 20 + .../tests/indexes/multiindex/test_repr.py | 281 +++ .../tests/indexes/timedeltaindex/test_repr.py | 69 + python/cudf/cudf/tests/series/test_repr.py | 521 ++++++ python/cudf/cudf/tests/test_offset.py | 61 - python/cudf/cudf/tests/test_repr.py | 1518 ----------------- 11 files changed, 1604 insertions(+), 1579 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/test_repr.py create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/test_repr.py create mode 100644 python/cudf/cudf/tests/indexes/index/test_repr.py create mode 100644 python/cudf/cudf/tests/indexes/intervalindex/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/intervalindex/test_repr.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/test_repr.py create mode 100644 python/cudf/cudf/tests/indexes/timedeltaindex/test_repr.py create mode 100644 python/cudf/cudf/tests/series/test_repr.py delete mode 100644 python/cudf/cudf/tests/test_offset.py delete mode 100644 python/cudf/cudf/tests/test_repr.py diff --git a/python/cudf/cudf/tests/dataframe/test_repr.py b/python/cudf/cudf/tests/dataframe/test_repr.py new file mode 100644 index 00000000000..be2fddb8436 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/test_repr.py @@ -0,0 +1,484 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
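+
+# NOTE: cudf renders missing values as <NA> in repr output, so the comparisons
+# in this module normalize pandas' "NaN"/"None" tokens to <NA> before
+# asserting equality against the cudf repr.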
+
+import textwrap
+
+import numpy as np
+import pandas as pd
+import pytest
+from hypothesis import given, settings, strategies as st
+
+import cudf
+
+
+@pytest.mark.parametrize("ncols", [1, 2, 10])
+def test_null_dataframe(ncols):
+    dtype_categories = [
+        "float32",
+        "float64",
+        "datetime64[ns]",
+        "str",
+        "category",
+    ]
+    rng = np.random.default_rng(seed=0)
+    size = 20
+    data = cudf.DataFrame()
+    for dtype in dtype_categories:
+        sr = cudf.Series(rng.integers(0, 128, size)).astype(dtype)
+        sr[rng.choice([False, True], size=size)] = None
+        data[dtype] = sr
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+    with pd.option_context("display.max_columns", int(ncols)):
+        pdf_repr = repr(pdf).replace("NaN", "<NA>").replace("None", "<NA>")
+        assert pdf_repr.split() == repr(gdf).split()
+
+
+@pytest.mark.parametrize("nrows", [5, 10, 15])
+@pytest.mark.parametrize("ncols", [5, 10, 15])
+@pytest.mark.parametrize("size", [20, 21])
+def test_full_dataframe_20(all_supported_types_as_str, size, nrows, ncols):
+    rng = np.random.default_rng(seed=0)
+    pdf = pd.DataFrame(
+        {idx: rng.integers(0, 100, size) for idx in range(size)}
+    ).astype(all_supported_types_as_str)
+    gdf = cudf.from_pandas(pdf)
+
+    with pd.option_context(
+        "display.max_rows", int(nrows), "display.max_columns", int(ncols)
+    ):
+        assert repr(pdf) == repr(gdf)
+        assert pdf._repr_html_() == gdf._repr_html_()
+        assert pdf._repr_latex_() == gdf._repr_latex_()
+
+
+@given(
+    st.lists(
+        st.integers(-9223372036854775808, 9223372036854775807),
+        min_size=1,
+        max_size=1000,
+    )
+)
+@settings(deadline=None, max_examples=20)
+def test_integer_dataframe(x):
+    gdf = cudf.DataFrame({"x": x})
+    pdf = gdf.to_pandas()
+    with pd.option_context("display.max_columns", 1):
+        assert repr(gdf) == repr(pdf)
+        assert repr(gdf.T) == repr(pdf.T)
+
+
+@given(st.lists(st.floats()))
+@settings(deadline=None, max_examples=20)
+def test_float_dataframe(x):
+    gdf = cudf.DataFrame({"x": cudf.Series(x, dtype=float, nan_as_null=False)})
+    pdf = gdf.to_pandas()
+    assert repr(gdf) == repr(pdf)
+
+
+def test_mixed_dataframe():
+    data = {
+        "Integer": np.array([2345, 11987, 9027, 9027]),
+        "Date": np.array(
+            ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"]
+        ),
+        "Float": np.array([9.001, 8.343, 6, 2.781]),
+        "Integer2": np.array([2345, 106, 2088, 789277]),
+        "Category": np.array(["M", "F", "F", "F"]),
+        "String": np.array(["Alpha", "Beta", "Gamma", "Delta"]),
+        "Boolean": np.array([True, False, True, False]),
+    }
+    mixed_gdf = cudf.DataFrame(data)
+    mixed_pdf = pd.DataFrame(data)
+    assert repr(mixed_gdf) == repr(mixed_pdf)
+
+
+def test_MI():
+    rng = np.random.default_rng(seed=0)
+    gdf = cudf.DataFrame(
+        {
+            "a": rng.integers(0, 4, 10),
+            "b": rng.integers(0, 4, 10),
+            "c": rng.integers(0, 4, 10),
+        }
+    )
+    levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]]
+    codes = [
+        [0, 0, 0, 0, 1, 1, 2, 2, 3, 3],
+        [0, 1, 2, 3, 0, 1, 2, 3, 0, 1],
+        [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
+    ]
+    with pd.option_context("display.max_rows", 999, "display.max_columns", 0):
+        gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes))
+        pdf = gdf.to_pandas()
+        assert repr(gdf) == repr(pdf)
+        assert repr(gdf.index) == repr(pdf.index)
+        assert repr(gdf.T) == repr(pdf.T)
+
+
+@pytest.mark.parametrize("nrows", [0, 1, 3, 5, 10])
+@pytest.mark.parametrize("ncols", [0, 1, 2, 3])
+def test_groupby_MI(nrows, ncols):
+    gdf = cudf.DataFrame(
+        {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}
+    )
+    pdf = gdf.to_pandas()
+    gdg = gdf.groupby(["a", "b"],
sort=True).count() + pdg = pdf.groupby(["a", "b"], sort=True).count() + with pd.option_context( + "display.max_rows", nrows, "display.max_columns", ncols + ): + assert repr(gdg) == repr(pdg) + assert repr(gdg.index) == repr(pdg.index) + assert repr(gdg.T) == repr(pdg.T) + + +@pytest.mark.parametrize( + "gdf", + [ + lambda: cudf.DataFrame({"a": range(10000)}), + lambda: cudf.DataFrame({"a": range(10000), "b": range(10000)}), + lambda: cudf.DataFrame({"a": range(20), "b": range(20)}), + lambda: cudf.DataFrame( + { + "a": range(20), + "b": range(20), + "c": ["abc", "def", "xyz", "def", "pqr"] * 4, + } + ), + lambda: cudf.DataFrame(index=[1, 2, 3]), + lambda: cudf.DataFrame(index=range(10000)), + lambda: cudf.DataFrame(columns=["a", "b", "c", "d"]), + lambda: cudf.DataFrame(columns=["a"], index=range(10000)), + lambda: cudf.DataFrame( + columns=["a", "col2", "...col n"], index=range(10000) + ), + lambda: cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), + lambda: cudf.DataFrame( + columns=["a", "b", "c", "d"], + index=cudf.Series(range(10000)).astype("str"), + ), + ], +) +@pytest.mark.parametrize( + "slc", + [ + slice(2500, 5000), + slice(2500, 2501), + slice(5000), + slice(1, 10), + slice(10, 20), + slice(15, 2400), + ], +) +@pytest.mark.parametrize("max_seq_items", [1, 10, 60, 10000, None]) +@pytest.mark.parametrize("max_rows", [1, 10, 60, 10000, None]) +def test_dataframe_sliced(gdf, slc, max_seq_items, max_rows): + gdf = gdf() + with pd.option_context( + "display.max_seq_items", max_seq_items, "display.max_rows", max_rows + ): + pdf = gdf.to_pandas() + + sliced_gdf = gdf[slc] + sliced_pdf = pdf[slc] + + expected_repr = repr(sliced_pdf).replace("None", "") + actual_repr = repr(sliced_gdf) + + assert expected_repr == actual_repr + + +@pytest.mark.parametrize( + "df,pandas_special_case", + [ + (pd.DataFrame({"a": [1, 2, 3]}, index=[10, 20, None]), False), + ( + pd.DataFrame( + { + "a": [1, None, 3], + "string_col": ["hello", "world", "rapids"], + }, + index=[None, "a", "b"], + ), + True, + ), + (pd.DataFrame([], index=[None, "a", "b"]), False), + (pd.DataFrame({"aa": [None, None]}, index=[None, None]), False), + (pd.DataFrame({"aa": [1, 2, 3]}, index=[None, None, None]), False), + ( + pd.DataFrame( + {"aa": [None, 2, 3]}, + index=np.array([1, None, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.DataFrame( + {"aa": [None, 2, 3]}, + index=np.array([100, None, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.DataFrame( + {"aa": [None, None, None]}, + index=np.array([None, None, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.DataFrame( + {"aa": [1, None, 3]}, + index=np.array([10, 15, None], dtype="datetime64[ns]"), + ), + False, + ), + ( + pd.DataFrame( + {"a": [1, 2, None], "v": [10, None, 22], "p": [100, 200, 300]} + ).set_index(["a", "v"]), + False, + ), + ( + pd.DataFrame( + { + "a": [1, 2, None], + "v": ["n", "c", "a"], + "p": [None, None, None], + } + ).set_index(["a", "v"]), + False, + ), + ( + pd.DataFrame( + { + "a": np.array([1, None, None], dtype="datetime64[ns]"), + "v": ["n", "c", "a"], + "p": [None, None, None], + } + ).set_index(["a", "v"]), + False, + ), + ], +) +def test_dataframe_null_index_repr(df, pandas_special_case): + pdf = df + gdf = cudf.from_pandas(pdf) + + expected_repr = repr(pdf).replace("NaN", "").replace("None", "") + actual_repr = repr(gdf) + + if pandas_special_case: + # Pandas inconsistently print Index null values + # as `None` at some places and `NaN` at few other places + # Whereas cudf is consistent with 
strings `null` values + # to be printed as `None` everywhere. + actual_repr = repr(gdf).replace("None", "") + + assert expected_repr.split() == actual_repr.split() + + +@pytest.mark.parametrize( + "df,expected_repr", + [ + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series( + [1000000, 200000, 3000000], dtype="timedelta64[s]" + ) + } + ), + textwrap.dedent( + """ + a + 0 11 days 13:46:40 + 1 2 days 07:33:20 + 2 34 days 17:20:00 + """ + ), + ), + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[s]", + ), + "b": [10, 11, 22, 33, 44, 55, 66], + } + ), + textwrap.dedent( + """ + a b + 0 1579 days 08:54:14 10 + 1 NaT 11 + 2 2839 days 15:29:05 22 + 3 2586 days 00:33:31 33 + 4 NaT 44 + 5 42066 days 12:52:14 55 + 6 0 days 06:27:14 66 + """ + ), + ), + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[s]", + index=["a", "b", "c", "d", "e", "f", "g"], + ) + } + ), + textwrap.dedent( + """ + a + a 1579 days 08:54:14 + b NaT + c 2839 days 15:29:05 + d 2586 days 00:33:31 + e NaT + f 42066 days 12:52:14 + g 0 days 06:27:14 + """ + ), + ), + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series( + [1, 2, 3, 4, 5, 6, 7], + index=cudf.Index( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[ms]", + ), + ) + } + ), + textwrap.dedent( + """ + a + 1 days 13:54:17.654 1 + NaT 2 + 2 days 20:09:05.345 3 + 2 days 14:03:52.411 4 + NaT 5 + 42 days 01:35:48.734 6 + 0 days 00:00:23.234 7 + """ + ), + ), + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series( + ["a", "f", "q", "e", "w", "e", "t"], + index=cudf.Index( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[ns]", + ), + ) + } + ), + textwrap.dedent( + """ + a + 0 days 00:00:00.136457654 a + NaT f + 0 days 00:00:00.245345345 q + 0 days 00:00:00.223432411 e + NaT w + 0 days 00:00:03.634548734 e + 0 days 00:00:00.000023234 t + """ + ), + ), + ], +) +def test_timedelta_dataframe_repr(df, expected_repr): + actual_repr = repr(df()) + + assert actual_repr.split() == expected_repr.split() + + +def test_categorical_dataframe_with_nan_repr(): + series = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + df = cudf.DataFrame({"a": series}) + expected_repr = textwrap.dedent( + """ + a + 0 1.0 + 1 2.0 + 2 NaN + 3 10.0 + 4 NaN + 5 + """ + ) + + assert repr(df).split() == expected_repr.split() + + +def test_repr_struct_after_concat(): + df = cudf.DataFrame( + { + "a": cudf.Series( + [ + {"sa": 2056831253}, + {"sa": -1463792165}, + {"sa": 1735783038}, + {"sa": 103774433}, + {"sa": -1413247520}, + ] + * 13 + ), + "b": cudf.Series( + [ + {"sa": {"ssa": 1140062029}}, + None, + {"sa": {"ssa": 1998862860}}, + {"sa": None}, + {"sa": {"ssa": -395088502}}, + ] + * 13 + ), + } + ) + pdf = df.to_pandas() + + assert repr(df) == repr(pdf) diff --git a/python/cudf/cudf/tests/dateoffset/test_constructors.py b/python/cudf/cudf/tests/dateoffset/test_constructors.py index 56338f44773..08cfff7fbaa 100644 --- a/python/cudf/cudf/tests/dateoffset/test_constructors.py +++ b/python/cudf/cudf/tests/dateoffset/test_constructors.py @@ -1,10 +1,65 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
+import re
+
+import numpy as np
 import pandas as pd
+import pytest
 
 import cudf
 
 
+@pytest.mark.parametrize("period", [1.5, 0.5, "string", "1", "1.0"])
+@pytest.mark.parametrize("freq", ["years", "months"])
+def test_construction_invalid(period, freq):
+    kwargs = {freq: period}
+    with pytest.raises(ValueError):
+        cudf.DateOffset(**kwargs)
+
+
+@pytest.mark.parametrize(
+    "unit", ["nanoseconds", "microseconds", "milliseconds", "seconds"]
+)
+def test_construct_max_offset(unit):
+    cudf.DateOffset(**{unit: np.iinfo("int64").max})
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"seconds": np.iinfo("int64").max + 1},
+        {"seconds": np.iinfo("int64").max, "minutes": 1},
+        {"minutes": np.iinfo("int64").max},
+    ],
+)
+def test_offset_construction_overflow(kwargs):
+    with pytest.raises(NotImplementedError):
+        cudf.DateOffset(**kwargs)
+
+
+@pytest.mark.parametrize(
+    "unit",
+    [
+        "years",
+        "months",
+        "weeks",
+        "days",
+        "hours",
+        "minutes",
+        "seconds",
+        "milliseconds",
+        "microseconds",
+        "nanoseconds",
+    ],
+)
+@pytest.mark.parametrize("period", [0.5, -0.5, 0.71])
+def test_offset_no_fractional_periods(unit, period):
+    with pytest.raises(
+        ValueError, match=re.escape("Non-integer periods not supported")
+    ):
+        cudf.DateOffset(**{unit: period})
+
+
 def test_dateoffset_instance_subclass_check():
     assert not issubclass(pd.DateOffset, cudf.DateOffset)
     assert not isinstance(pd.DateOffset(), cudf.DateOffset)
diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/test_repr.py b/python/cudf/cudf/tests/indexes/categoricalindex/test_repr.py
new file mode 100644
index 00000000000..e0c87d1dd0c
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/categoricalindex/test_repr.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+
+
+def test_categorical_index_with_nan_repr():
+    cat_index = cudf.Index(
+        cudf.Series(
+            [1, 2, np.nan, 10, np.nan, None], nan_as_null=False
+        ).astype("category")
+    )
+
+    expected_repr = (
+        "CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, <NA>], "
+        "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')"
+    )
+
+    assert repr(cat_index) == expected_repr
+
+    sliced_expected_repr = (
+        "CategoricalIndex([NaN, 10.0, NaN, <NA>], "
+        "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')"
+    )
+
+    assert repr(cat_index[2:]) == sliced_expected_repr
+
+
+def test_unique_categories_repr():
+    pi = pd.CategoricalIndex(range(10_000))
+    gi = cudf.CategoricalIndex(range(10_000))
+    expected_repr = repr(pi)
+    actual_repr = repr(gi)
+    assert expected_repr == actual_repr
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_categorical_index_ordered(ordered):
+    pi = pd.CategoricalIndex(range(10), ordered=ordered)
+    gi = cudf.CategoricalIndex(range(10), ordered=ordered)
+
+    assert repr(pi) == repr(gi)
diff --git a/python/cudf/cudf/tests/indexes/index/test_repr.py b/python/cudf/cudf/tests/indexes/index/test_repr.py
new file mode 100644
index 00000000000..a9d769d5344
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/index/test_repr.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+
+
+@pytest.mark.parametrize("length", [0, 1, 10, 100, 1000])
+def test_numeric_index_repr(length, numeric_types_as_str):
+    rng = np.random.default_rng(seed=0)
+    data = rng.integers(0, high=100, size=length).astype(numeric_types_as_str)
+    pidx = pd.Index(data)
+    gidx = cudf.Index(data)
+
+    assert repr(pidx) == repr(gidx)
+
+
+@pytest.mark.parametrize(
+    "index,expected_repr",
+    [
+        (
+            lambda: cudf.Index([1, 2, 3, None]),
+            "Index([1, 2, 3, <NA>], dtype='int64')",
+        ),
+        (
+            lambda: cudf.Index([None, 2.2, 3.324342, None]),
+            "Index([<NA>, 2.2, 3.324342, <NA>], dtype='float64')",
+        ),
+        (
+            lambda: cudf.Index([None, None, None], name="hello"),
+            "Index([<NA>, <NA>, <NA>], dtype='object', name='hello')",
+        ),
+        (
+            lambda: cudf.Index(
+                [None, None, None], dtype="float", name="hello"
+            ),
+            "Index([<NA>, <NA>, <NA>], dtype='float64', name='hello')",
+        ),
+        (
+            lambda: cudf.Index([None], dtype="float64", name="hello"),
+            "Index([<NA>], dtype='float64', name='hello')",
+        ),
+        (
+            lambda: cudf.Index([None], dtype="int8", name="hello"),
+            "Index([<NA>], dtype='int8', name='hello')",
+        ),
+        (
+            lambda: cudf.Index([None] * 50, dtype="object"),
+            "Index([<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, "
+            "<NA>, <NA>, <NA>,\n       <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, "
+            "<NA>, <NA>, <NA>, <NA>, <NA>,\n       <NA>, <NA>, <NA>, <NA>, "
+            "<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>,\n       <NA>, "
+            "<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, "
+            "<NA>,\n       <NA>, <NA>],\n      dtype='object')",
+        ),
+        (
+            lambda: cudf.Index([None] * 20, dtype="uint32"),
+            "Index([<NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, "
+            "<NA>,\n       <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, <NA>, "
+            "<NA>,\n       <NA>, <NA>],\n      dtype='uint32')",
+        ),
+        (
+            lambda: cudf.Index(
+                [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16"
+            ),
+            "Index([<NA>, 111, 22, 33, <NA>, 23, 34, 2343, <NA>], "
+            "dtype='int16')",
+        ),
+        (
+            lambda: cudf.Index([1, 2, 3, None], dtype="category"),
+            "CategoricalIndex([1, 2, 3, <NA>], categories=[1, 2, 3], "
+            "ordered=False, dtype='category')",
+        ),
+        (
+            lambda: cudf.Index([None, None], dtype="category"),
+            "CategoricalIndex([<NA>, <NA>], categories=[], ordered=False, "
+            "dtype='category')",
+        ),
+        (
+            lambda: cudf.Index(
+                np.array([10, 20, 30, None], dtype="datetime64[ns]")
+            ),
+            "DatetimeIndex([1970-01-01 00:00:00.000000010, "
+            "1970-01-01 00:00:00.000000020,"
+            "\n               1970-01-01 00:00:00.000000030, NaT],\n              "
+            "dtype='datetime64[ns]')",
+        ),
+        (
+            lambda: cudf.Index(
+                np.array([10, 20, 30, None], dtype="datetime64[s]")
+            ),
+            "DatetimeIndex([1970-01-01 00:00:10, "
+            "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n"
+            "               NaT],\n              dtype='datetime64[s]')",
+        ),
+        (
+            lambda: cudf.Index(
+                np.array([10, 20, 30, None], dtype="datetime64[us]")
+            ),
+            "DatetimeIndex([1970-01-01 00:00:00.000010, "
+            "1970-01-01 00:00:00.000020,\n               "
+            "1970-01-01 00:00:00.000030, NaT],\n              "
+            "dtype='datetime64[us]')",
+        ),
+        (
+            lambda: cudf.Index(
+                np.array([10, 20, 30, None], dtype="datetime64[ms]")
+            ),
+            "DatetimeIndex([1970-01-01 00:00:00.010, "
+            "1970-01-01 00:00:00.020,\n               "
+            "1970-01-01 00:00:00.030, NaT],\n              "
+            "dtype='datetime64[ms]')",
+        ),
+        (
+            lambda: cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")),
+            "DatetimeIndex([NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, "
+            "NaT, NaT], dtype='datetime64[ms]')",
+        ),
+    ],
+)
+def test_index_null(index, expected_repr):
+    index = index()
+    actual_repr = repr(index)
+
+    assert expected_repr == actual_repr
diff --git a/python/cudf/cudf/tests/indexes/intervalindex/__init__.py b/python/cudf/cudf/tests/indexes/intervalindex/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/tests/indexes/intervalindex/test_repr.py b/python/cudf/cudf/tests/indexes/intervalindex/test_repr.py
new file mode 100644
index 00000000000..66034d9961c
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/intervalindex/test_repr.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
+
+
+import numpy as np
+import pandas as pd
+
+import cudf
+
+
+def test_interval_index_repr():
+    pi = pd.Index(
+        [
+            np.nan,
+            pd.Interval(2.0, 3.0, closed="right"),
+            pd.Interval(3.0, 4.0, closed="right"),
+        ]
+    )
+    gi = cudf.from_pandas(pi)
+
+    assert repr(pi) == repr(gi)
diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_repr.py b/python/cudf/cudf/tests/indexes/multiindex/test_repr.py
new file mode 100644
index 00000000000..70106a9376b
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/multiindex/test_repr.py
@@ -0,0 +1,281 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import textwrap
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+
+
+@pytest.mark.parametrize(
+    "pmi",
+    [
+        pd.MultiIndex.from_tuples(
+            [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")]
+        ),
+        pd.MultiIndex.from_tuples(
+            [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] * 10
+        ),
+        pd.MultiIndex.from_tuples([(1, "red", 102, "sdf")]),
+        pd.MultiIndex.from_tuples(
+            [
+                ("abc", 0.234, 1),
+                ("a", -0.34, 0),
+                ("ai", 111, 4385798),
+                ("rapids", 0, 34534534),
+            ],
+            names=["alphabets", "floats", "ints"],
+        ),
+    ],
+)
+@pytest.mark.parametrize("max_seq_items", [None, 1, 2, 5, 10, 100])
+def test_multiindex_repr(pmi, max_seq_items):
+    with pd.option_context("display.max_seq_items", max_seq_items):
+        gmi = cudf.from_pandas(pmi)
+
+        assert repr(gmi) == repr(pmi)
+
+
+@pytest.mark.parametrize(
+    "gdi, expected_repr",
+    [
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": [None, 1, 2, 3],
+                    "b": ["abc", None, "xyz", None],
+                    "c": [0.345, np.nan, 100, 10],
+                }
+            )
+            .set_index(["a", "b"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([(<NA>, 'abc'),
+                            (   1,  <NA>),
+                            (   2, 'xyz'),
+                            (   3,  <NA>)],
+                           names=['a', 'b'])
+                """
+            ),
+        ),
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": cudf.Series([None, np.nan, 2, 3], nan_as_null=False),
+                    "b": ["abc", None, "xyz", None],
+                    "c": [0.345, np.nan, 100, 10],
+                }
+            )
+            .set_index(["a", "b"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([(<NA>, 'abc'),
+                            ( nan,  <NA>),
+                            ( 2.0, 'xyz'),
+                            ( 3.0,  <NA>)],
+                           names=['a', 'b'])
+                """
+            ),
+        ),
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": cudf.Series([None, 1, 2, 3], dtype="datetime64[ns]"),
+                    "b": ["abc", None, "xyz", None],
+                    "c": [0.345, np.nan, 100, 10],
+                }
+            )
+            .set_index(["a", "b"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([(                          'NaT', 'abc'),
+                            ('1970-01-01 00:00:00.000000001',  <NA>),
+                            ('1970-01-01 00:00:00.000000002', 'xyz'),
+                            ('1970-01-01 00:00:00.000000003',  <NA>)],
+                           names=['a', 'b'])
+                """
+            ),
+        ),
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": cudf.Series([None, 1, 2, 3], dtype="datetime64[ns]"),
+                    "b": ["abc", None, "xyz", None],
+                    "c": [0.345, np.nan, 100, 10],
+                }
+            )
+            .set_index(["a", "b", "c"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([(                          'NaT', 'abc', 0.345),
+                            ('1970-01-01 00:00:00.000000001',  <NA>,  <NA>),
+                            ('1970-01-01 00:00:00.000000002', 'xyz', 100.0),
+                            ('1970-01-01 00:00:00.000000003',  <NA>,  10.0)],
+                           names=['a', 'b', 'c'])
+                """
+            ),
+        ),
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": ["abc", None, "xyz", None],
+                    "b": cudf.Series([None, 1, 2, 3], dtype="timedelta64[ns]"),
+                    "c": [0.345, np.nan, 100, 10],
+                }
+            )
+            .set_index(["a", "b", "c"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([('abc',                          NaT, 0.345),
+                            ( <NA>, '0 days 00:00:00.000000001',  <NA>),
+                            ('xyz', '0 days 00:00:00.000000002', 100.0),
+                            ( <NA>, '0 days 00:00:00.000000003',  10.0)],
+                           names=['a', 'b', 'c'])
+                """
+            ),
+        ),
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": ["abc", None, "xyz", None],
+                    "b": cudf.Series([None, 1, 2, 3], dtype="timedelta64[ns]"),
+                    "c": [0.345, np.nan, 100, 10],
+                }
+            )
+            .set_index(["c", "a"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([(0.345, 'abc'),
+                            ( <NA>,  <NA>),
+                            (100.0, 'xyz'),
+                            ( 10.0,  <NA>)],
+                           names=['c', 'a'])
+                """
+            ),
+        ),
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": [None, None, None, None],
+                    "b": cudf.Series(
+                        [None, None, None, None], dtype="timedelta64[ns]"
+                    ),
+                    "c": [0.345, np.nan, 100, 10],
+                }
+            )
+            .set_index(["b", "a"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([(NaT, <NA>),
+                            (NaT, <NA>),
+                            (NaT, <NA>),
+                            (NaT, <NA>)],
+                           names=['b', 'a'])
+                """
+            ),
+        ),
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": [1, 2, None, 3, 5],
+                    "b": [
+                        "abc",
+                        "def, hi, bye",
+                        None,
+                        ", one, two, three, four",
+                        None,
+                    ],
+                    "c": cudf.Series(
+                        [0.3232, np.nan, 1, None, -0.34534], nan_as_null=False
+                    ),
+                    "d": [None, 100, 2000324, None, None],
+                }
+            )
+            .set_index(["a", "b", "c", "d"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([(   1,                     'abc',   0.3232,    <NA>),
+                            (   2,            'def, hi, bye',      nan,     100),
+                            (<NA>,                      <NA>,      1.0, 2000324),
+                            (   3, ', one, two, three, four',     <NA>,    <NA>),
+                            (   5,                      <NA>, -0.34534,    <NA>)],
+                           names=['a', 'b', 'c', 'd'])
+                """
+            ),
+        ),
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": [1, 2, None, 3, 5],
+                    "b": [
+                        "abc",
+                        "def, hi, bye",
+                        None,
+                        ", one, two, three, four",
+                        None,
+                    ],
+                    "c": cudf.Series(
+                        [0.3232, np.nan, 1, None, -0.34534], nan_as_null=False
+                    ),
+                    "d": [None, 100, 2000324, None, None],
+                }
+            )
+            .set_index(["b", "a", "c", "d"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([(                    'abc',    1,   0.3232,    <NA>),
+                            (           'def, hi, bye',    2,      nan,     100),
+                            (                     <NA>, <NA>,      1.0, 2000324),
+                            (', one, two, three, four',    3,     <NA>,    <NA>),
+                            (                     <NA>,    5, -0.34534,    <NA>)],
+                           names=['b', 'a', 'c', 'd'])
+                """
+            ),
+        ),
+        (
+            lambda: cudf.DataFrame(
+                {
+                    "a": ["(abc", "2", None, "3", "5"],
+                    "b": [
+                        "abc",
+                        "def, hi, bye",
+                        None,
+                        ", one, two, three, four",
+                        None,
+                    ],
+                    "c": cudf.Series(
+                        [0.3232, np.nan, 1, None, -0.34534], nan_as_null=False
+                    ),
+                    "d": [None, 100, 2000324, None, None],
+                }
+            )
+            .set_index(["a", "b", "c", "d"])
+            .index,
+            textwrap.dedent(
+                """
+                MultiIndex([('(abc',                     'abc',   0.3232,    <NA>),
+                            (   '2',            'def, hi, bye',      nan,     100),
+                            (  <NA>,                      <NA>,      1.0, 2000324),
+                            (   '3', ', one, two, three, four',     <NA>,    <NA>),
+                            (   '5',                      <NA>, -0.34534,    <NA>)],
+                           names=['a', 'b', 'c', 'd'])
+                """
+            ),
+        ),
+    ],
+)
+def test_multiindex_null_repr(gdi, expected_repr):
+    actual_repr = repr(gdi())
+
+    assert actual_repr.split() == expected_repr.split()
diff --git a/python/cudf/cudf/tests/indexes/timedeltaindex/test_repr.py b/python/cudf/cudf/tests/indexes/timedeltaindex/test_repr.py
new file mode 100644
index 00000000000..3b82bee2d93
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/timedeltaindex/test_repr.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
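+
+# NOTE: TimedeltaIndex reprs mirror pandas' "D days HH:MM:SS[.fff]" rendering,
+# with NaT standing in for missing values in the expected strings below.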
+ + +import pytest + +import cudf + + +@pytest.mark.parametrize( + "index, expected_repr", + [ + ( + lambda: cudf.Index( + [1000000, 200000, 3000000], dtype="timedelta64[ms]" + ), + "TimedeltaIndex(['0 days 00:16:40', " + "'0 days 00:03:20', '0 days 00:50:00'], " + "dtype='timedelta64[ms]')", + ), + ( + lambda: cudf.Index( + [None, None, None, None, None], dtype="timedelta64[us]" + ), + "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " + "dtype='timedelta64[us]')", + ), + ( + lambda: cudf.Index( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[us]", + ), + "TimedeltaIndex([0 days 00:02:16.457654, NaT, " + "0 days 00:04:05.345345, " + "0 days 00:03:43.432411, NaT," + " 0 days 01:00:34.548734, 0 days 00:00:00.023234]," + " dtype='timedelta64[us]')", + ), + ( + lambda: cudf.Index( + [ + 136457654, + None, + 245345345, + 223432411, + None, + 3634548734, + 23234, + ], + dtype="timedelta64[s]", + ), + "TimedeltaIndex([1579 days 08:54:14, NaT, 2839 days 15:29:05," + " 2586 days 00:33:31, NaT, 42066 days 12:52:14, " + "0 days 06:27:14]," + " dtype='timedelta64[s]')", + ), + ], +) +def test_timedelta_index_repr(index, expected_repr): + actual_repr = repr(index()) + + assert actual_repr.split() == expected_repr.split() diff --git a/python/cudf/cudf/tests/series/test_repr.py b/python/cudf/cudf/tests/series/test_repr.py new file mode 100644 index 00000000000..2e1e9888ff4 --- /dev/null +++ b/python/cudf/cudf/tests/series/test_repr.py @@ -0,0 +1,521 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import textwrap + +import cupy as cp +import numpy as np +import pandas as pd +import pytest +from hypothesis import given, settings, strategies as st + +import cudf +from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes + + +@pytest.mark.parametrize("nrows", [0, 5, 10]) +def test_null_series(nrows, all_supported_types_as_str, request): + request.applymarker( + pytest.mark.xfail( + all_supported_types_as_str in {"bool", "timedelta64[ms]"}, + reason=f"cuDF repr doesn't match pandas repr for {all_supported_types_as_str}", + ) + ) + rng = np.random.default_rng(seed=0) + size = 5 + sr = cudf.Series(rng.integers(1, 9, size)).astype( + all_supported_types_as_str + ) + sr[rng.choice([False, True], size=size)] = None + if all_supported_types_as_str != "category" and cudf.dtype( + all_supported_types_as_str + ).kind in {"u", "i"}: + ps = pd.Series( + sr._column.data_array_view(mode="read").copy_to_host(), + dtype=np_dtypes_to_pandas_dtypes.get( + cudf.dtype(all_supported_types_as_str), + cudf.dtype(all_supported_types_as_str), + ), + ) + ps[sr.isnull().to_pandas()] = pd.NA + else: + ps = sr.to_pandas() + + with pd.option_context("display.max_rows", int(nrows)): + psrepr = repr(ps).replace("NaN", "").replace("None", "") + if "UInt" in psrepr: + psrepr = psrepr.replace("UInt", "uint") + elif "Int" in psrepr: + psrepr = psrepr.replace("Int", "int") + assert psrepr.split() == repr(sr).split() + + +@pytest.mark.parametrize("nrows", [None, 0, 2, 10, 20, 21]) +def test_full_series(nrows, all_supported_types_as_str): + size = 20 + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.integers(0, 100, size)).astype( + all_supported_types_as_str + ) + sr = cudf.from_pandas(ps) + with pd.option_context("display.max_rows", nrows): + assert repr(ps) == repr(sr) + + +@given( + st.lists( + st.integers(-9223372036854775808, 9223372036854775807), max_size=1000 + ) +) +@settings(deadline=None, max_examples=20) +def test_integer_series(x): + sr = cudf.Series(x, dtype=int) + ps = 
pd.Series(data=x, dtype=int)
+
+    assert repr(sr) == repr(ps)
+
+
+@given(st.lists(st.floats()))
+@settings(deadline=None, max_examples=20)
+def test_float_series(x):
+    sr = cudf.Series(x, dtype=float, nan_as_null=False)
+    ps = pd.Series(data=x, dtype=float)
+    assert repr(sr) == repr(ps)
+
+
+@pytest.mark.parametrize(
+    "sr,pandas_special_case",
+    [
+        (pd.Series([1, 2, 3], index=[10, 20, None]), False),
+        (pd.Series([1, None, 3], name="a", index=[None, "a", "b"]), True),
+        (pd.Series(None, index=[None, "a", "b"], dtype="float"), True),
+        (pd.Series([None, None], name="aa", index=[None, None]), False),
+        (pd.Series([1, 2, 3], index=[None, None, None]), False),
+        (
+            pd.Series(
+                [None, 2, 3],
+                index=np.array([1, None, None], dtype="datetime64[ns]"),
+            ),
+            False,
+        ),
+        (
+            pd.Series(
+                [None, None, None],
+                index=np.array([None, None, None], dtype="datetime64[ns]"),
+            ),
+            False,
+        ),
+        (
+            pd.Series(
+                [1, None, 3],
+                index=np.array([10, 15, None], dtype="datetime64[ns]"),
+            ),
+            False,
+        ),
+        (
+            pd.DataFrame(
+                {"a": [1, 2, None], "v": [10, None, 22], "p": [100, 200, 300]}
+            ).set_index(["a", "v"])["p"],
+            False,
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "a": [1, 2, None],
+                    "v": ["n", "c", "a"],
+                    "p": [None, None, None],
+                }
+            ).set_index(["a", "v"])["p"],
+            False,
+        ),
+        (
+            pd.DataFrame(
+                {
+                    "a": np.array([1, None, None], dtype="datetime64[ns]"),
+                    "v": ["n", "c", "a"],
+                    "p": [None, None, None],
+                }
+            ).set_index(["a", "v"])["p"],
+            False,
+        ),
+    ],
+)
+def test_series_null_index_repr(sr, pandas_special_case):
+    psr = sr
+    gsr = cudf.from_pandas(psr)
+
+    expected_repr = repr(psr).replace("NaN", "").replace("None", "")
+    actual_repr = repr(gsr)
+
+    if pandas_special_case:
+        # Pandas inconsistently prints Index null values as `None` in
+        # some places and as `NaN` in others, whereas cudf consistently
+        # prints string `null` values as `None` everywhere.
+ actual_repr = repr(gsr).replace("None", "") + assert expected_repr.split() == actual_repr.split() + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [ + 136457654, + 134736784, + 245345345, + 223432411, + 2343241, + 3634548734, + 23234, + ], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +@pytest.mark.parametrize("dtype", ["timedelta64[s]", "timedelta64[us]"]) +def test_timedelta_series_s_us_repr(data, dtype): + sr = cudf.Series(data, dtype=dtype) + psr = sr.to_pandas() + + expected = repr(psr).replace("timedelta64[ns]", dtype) + actual = repr(sr) + + assert expected.split() == actual.split() + + +@pytest.mark.parametrize( + "ser, expected_repr", + [ + ( + lambda: cudf.Series([], dtype="timedelta64[ns]"), + textwrap.dedent( + """ + Series([], dtype: timedelta64[ns]) + """ + ), + ), + ( + lambda: cudf.Series([], dtype="timedelta64[ms]"), + textwrap.dedent( + """ + Series([], dtype: timedelta64[ms]) + """ + ), + ), + ( + lambda: cudf.Series( + [1000000, 200000, 3000000], dtype="timedelta64[ns]" + ), + textwrap.dedent( + """ + 0 0 days 00:00:00.001000 + 1 0 days 00:00:00.000200 + 2 0 days 00:00:00.003000 + dtype: timedelta64[ns] + """ + ), + ), + ( + lambda: cudf.Series( + [1000000, 200000, 3000000], dtype="timedelta64[ms]" + ), + textwrap.dedent( + """ + 0 0 days 00:16:40 + 1 0 days 00:03:20 + 2 0 days 00:50:00 + dtype: timedelta64[ms] + """ + ), + ), + ( + lambda: cudf.Series( + [1000000, 200000, None], dtype="timedelta64[ns]" + ), + textwrap.dedent( + """ + 0 0 days 00:00:00.001000000 + 1 0 days 00:00:00.000200000 + 2 NaT + dtype: timedelta64[ns] + """ + ), + ), + ( + lambda: cudf.Series( + [1000000, 200000, None], dtype="timedelta64[ms]" + ), + textwrap.dedent( + """ + 0 0 days 00:16:40 + 1 0 days 00:03:20 + 2 NaT + dtype: timedelta64[ms] + """ + ), + ), + ( + lambda: cudf.Series( + [None, None, None, None, None], dtype="timedelta64[ns]" + ), + textwrap.dedent( + """ + 0 NaT + 1 NaT + 2 NaT + 3 NaT + 4 NaT + dtype: timedelta64[ns] + """ + ), + ), + ( + lambda: cudf.Series( + [None, None, None, None, None], dtype="timedelta64[ms]" + ), + textwrap.dedent( + """ + 0 NaT + 1 NaT + 2 NaT + 3 NaT + 4 NaT + dtype: timedelta64[ms] + """ + ), + ), + ( + lambda: cudf.Series( + [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ns]" + ), + textwrap.dedent( + """ + 0 0 days 00:00:00.000000012 + 1 0 days 00:00:00.000000012 + 2 0 days 00:00:00.000000022 + 3 0 days 00:00:00.000000343 + 4 0 days 00:00:00.004353534 + 5 0 days 00:00:00.000435342 + dtype: timedelta64[ns] + """ + ), + ), + ( + lambda: cudf.Series( + [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ms]" + ), + textwrap.dedent( + """ + 0 0 days 00:00:00.012000 + 1 0 days 00:00:00.012000 + 2 0 days 00:00:00.022000 + 3 0 days 00:00:00.343000 + 4 0 days 01:12:33.534000 + 5 0 days 00:07:15.342000 + dtype: timedelta64[ms] + """ + ), + ), + ( + lambda: cudf.Series( + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + dtype="timedelta64[ns]", + ), + textwrap.dedent( + """ + 0 0 days 00:00:00.000000001 + 1 0 days 00:00:00.000001132 + 2 0 days 00:00:00.023223231 + 3 0 days 
00:00:00.000000233 + 4 0 days 00:00:00 + 5 0 days 00:00:00.000000332 + 6 0 days 00:00:00.000000323 + dtype: timedelta64[ns] + """ + ), + ), + ( + lambda: cudf.Series( + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + dtype="timedelta64[ms]", + ), + textwrap.dedent( + """ + 0 0 days 00:00:00.001000 + 1 0 days 00:00:01.132000 + 2 0 days 06:27:03.231000 + 3 0 days 00:00:00.233000 + 4 0 days 00:00:00 + 5 0 days 00:00:00.332000 + 6 0 days 00:00:00.323000 + dtype: timedelta64[ms] + """ + ), + ), + ( + lambda: cudf.Series( + [ + 13645765432432, + 134736784, + 245345345, + 223432411, + 999992343241, + 3634548734, + 23234, + ], + dtype="timedelta64[ms]", + ), + textwrap.dedent( + """ + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 + dtype: timedelta64[ms] + """ + ), + ), + ( + lambda: cudf.Series( + [ + 13645765432432, + 134736784, + 245345345, + 223432411, + 999992343241, + 3634548734, + 23234, + ], + dtype="timedelta64[ns]", + ), + textwrap.dedent( + """ + 0 0 days 03:47:25.765432432 + 1 0 days 00:00:00.134736784 + 2 0 days 00:00:00.245345345 + 3 0 days 00:00:00.223432411 + 4 0 days 00:16:39.992343241 + 5 0 days 00:00:03.634548734 + 6 0 days 00:00:00.000023234 + dtype: timedelta64[ns] + """ + ), + ), + ( + lambda: cudf.Series( + [ + 13645765432432, + 134736784, + 245345345, + 223432411, + 999992343241, + 3634548734, + 23234, + ], + dtype="timedelta64[ms]", + name="abc", + ), + textwrap.dedent( + """ + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 + Name: abc, dtype: timedelta64[ms] + """ + ), + ), + ( + lambda: cudf.Series( + [ + 13645765432432, + 134736784, + 245345345, + 223432411, + 999992343241, + 3634548734, + 23234, + ], + dtype="timedelta64[ns]", + index=["a", "b", "z", "x", "y", "l", "m"], + name="hello", + ), + textwrap.dedent( + """ + a 0 days 03:47:25.765432432 + b 0 days 00:00:00.134736784 + z 0 days 00:00:00.245345345 + x 0 days 00:00:00.223432411 + y 0 days 00:16:39.992343241 + l 0 days 00:00:03.634548734 + m 0 days 00:00:00.000023234 + Name: hello, dtype: timedelta64[ns] + """ + ), + ), + ], +) +def test_timedelta_series_ns_ms_repr(ser, expected_repr): + expected = expected_repr + actual = repr(ser()) + + assert expected.split() == actual.split() + + +def test_categorical_series_with_nan_repr(): + series = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + + expected_repr = textwrap.dedent( + """ + 0 1.0 + 1 2.0 + 2 NaN + 3 10.0 + 4 NaN + 5 + dtype: category + Categories (4, float64): [1.0, 2.0, 10.0, NaN] + """ + ) + + assert repr(series).split() == expected_repr.split() + + sliced_expected_repr = textwrap.dedent( + """ + 2 NaN + 3 10.0 + 4 NaN + 5 + dtype: category + Categories (4, float64): [1.0, 2.0, 10.0, NaN] + """ + ) + + assert repr(series[2:]).split() == sliced_expected_repr.split() + + +def test_empty_series_name(): + ps = pd.Series([], name="abc", dtype="int") + gs = cudf.from_pandas(ps) + + assert repr(ps) == repr(gs) diff --git a/python/cudf/cudf/tests/test_offset.py b/python/cudf/cudf/tests/test_offset.py deleted file mode 100644 index 1ed04616f04..00000000000 --- a/python/cudf/cudf/tests/test_offset.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
- -import re - -import numpy as np -import pytest - -from cudf import DateOffset - -INT64MAX = np.iinfo("int64").max - - -@pytest.mark.parametrize("period", [1.5, 0.5, "string", "1", "1.0"]) -@pytest.mark.parametrize("freq", ["years", "months"]) -def test_construction_invalid(period, freq): - kwargs = {freq: period} - with pytest.raises(ValueError): - DateOffset(**kwargs) - - -@pytest.mark.parametrize( - "unit", ["nanoseconds", "microseconds", "milliseconds", "seconds"] -) -def test_construct_max_offset(unit): - DateOffset(**{unit: np.iinfo("int64").max}) - - -@pytest.mark.parametrize( - "kwargs", - [ - {"seconds": INT64MAX + 1}, - {"seconds": INT64MAX, "minutes": 1}, - {"minutes": INT64MAX}, - ], -) -def test_offset_construction_overflow(kwargs): - with pytest.raises(NotImplementedError): - DateOffset(**kwargs) - - -@pytest.mark.parametrize( - "unit", - [ - "years", - "months", - "weeks", - "days", - "hours", - "minutes", - "seconds", - "milliseconds", - "microseconds", - "nanoseconds", - ], -) -@pytest.mark.parametrize("period", [0.5, -0.5, 0.71]) -def test_offset_no_fractional_periods(unit, period): - with pytest.raises( - ValueError, match=re.escape("Non-integer periods not supported") - ): - DateOffset(**{unit: period}) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py deleted file mode 100644 index 89fa6a0bb78..00000000000 --- a/python/cudf/cudf/tests/test_repr.py +++ /dev/null @@ -1,1518 +0,0 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. - -import textwrap - -import cupy as cp -import numpy as np -import pandas as pd -import pytest -from hypothesis import given, settings, strategies as st - -import cudf -from cudf.testing import _utils as utils -from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes - - -@pytest.fixture( - params=[ - "uint16", - "int64", - "float64", - "str", - "category", - "datetime64[ns]", - ] -) -def dtype(request): - return request.param - - -@pytest.mark.parametrize("nrows", [0, 5, 10]) -def test_null_series(nrows, dtype): - rng = np.random.default_rng(seed=0) - size = 5 - sr = cudf.Series(rng.integers(1, 9, size)).astype(dtype) - sr[rng.choice([False, True], size=size)] = None - if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: - ps = pd.Series( - sr._column.data_array_view(mode="read").copy_to_host(), - dtype=np_dtypes_to_pandas_dtypes.get( - cudf.dtype(dtype), cudf.dtype(dtype) - ), - ) - ps[sr.isnull().to_pandas()] = pd.NA - else: - ps = sr.to_pandas() - - with pd.option_context("display.max_rows", int(nrows)): - psrepr = repr(ps).replace("NaN", "").replace("None", "") - if "UInt" in psrepr: - psrepr = psrepr.replace("UInt", "uint") - elif "Int" in psrepr: - psrepr = psrepr.replace("Int", "int") - assert psrepr.split() == repr(sr).split() - - -@pytest.mark.parametrize("ncols", [1, 2, 3, 4, 5, 10]) -def test_null_dataframe(ncols): - dtype_categories = [ - "float32", - "float64", - "datetime64[ns]", - "str", - "category", - ] - rng = np.random.default_rng(seed=0) - size = 20 - gdf = cudf.DataFrame() - for dtype in dtype_categories: - sr = cudf.Series(rng.integers(0, 128, size)).astype(dtype) - sr[rng.choice([False, True], size=size)] = None - gdf[dtype] = sr - pdf = gdf.to_pandas() - with pd.option_context("display.max_columns", int(ncols)): - pdf_repr = repr(pdf).replace("NaN", "").replace("None", "") - assert pdf_repr.split() == repr(gdf).split() - - -@pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) -def test_full_series(nrows, dtype): - size = 20 - rng = 
np.random.default_rng(seed=0) - ps = pd.Series(rng.integers(0, 100, size)).astype(dtype) - sr = cudf.from_pandas(ps) - with pd.option_context("display.max_rows", nrows): - assert repr(ps) == repr(sr) - - -@pytest.mark.parametrize("nrows", [5, 10, 15]) -@pytest.mark.parametrize("ncols", [5, 10, 15]) -@pytest.mark.parametrize("size", [20, 21]) -def test_full_dataframe_20(dtype, size, nrows, ncols): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame( - {idx: rng.integers(0, 100, size) for idx in range(size)} - ).astype(dtype) - gdf = cudf.from_pandas(pdf) - - with pd.option_context( - "display.max_rows", int(nrows), "display.max_columns", int(ncols) - ): - assert repr(pdf) == repr(gdf) - assert pdf._repr_html_() == gdf._repr_html_() - assert pdf._repr_latex_() == gdf._repr_latex_() - - -@given( - st.lists( - st.integers(-9223372036854775808, 9223372036854775807), - min_size=1, - max_size=1000, - ) -) -@settings(deadline=None, max_examples=20) -def test_integer_dataframe(x): - gdf = cudf.DataFrame({"x": x}) - pdf = gdf.to_pandas() - with pd.option_context("display.max_columns", 1): - assert repr(gdf) == repr(pdf) - assert repr(gdf.T) == repr(pdf.T) - - -@given( - st.lists( - st.integers(-9223372036854775808, 9223372036854775807), max_size=1000 - ) -) -@settings(deadline=None, max_examples=20) -def test_integer_series(x): - sr = cudf.Series(x, dtype=int) - ps = pd.Series(data=x, dtype=int) - - assert repr(sr) == repr(ps) - - -@given(st.lists(st.floats())) -@settings(deadline=None, max_examples=20) -def test_float_dataframe(x): - gdf = cudf.DataFrame({"x": cudf.Series(x, dtype=float, nan_as_null=False)}) - pdf = gdf.to_pandas() - assert repr(gdf) == repr(pdf) - - -@given(st.lists(st.floats())) -@settings(deadline=None, max_examples=20) -def test_float_series(x): - sr = cudf.Series(x, dtype=float, nan_as_null=False) - ps = pd.Series(data=x, dtype=float) - assert repr(sr) == repr(ps) - - -@pytest.fixture -def mixed_pdf(): - pdf = pd.DataFrame() - pdf["Integer"] = np.array([2345, 11987, 9027, 9027]) - pdf["Date"] = np.array( - ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"] - ) - pdf["Float"] = np.array([9.001, 8.343, 6, 2.781]) - pdf["Integer2"] = np.array([2345, 106, 2088, 789277]) - pdf["Category"] = np.array(["M", "F", "F", "F"]) - pdf["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - pdf["Boolean"] = np.array([True, False, True, False]) - return pdf - - -@pytest.fixture -def mixed_gdf(mixed_pdf): - return cudf.from_pandas(mixed_pdf) - - -def test_mixed_dataframe(mixed_pdf, mixed_gdf): - assert repr(mixed_gdf) == repr(mixed_pdf) - - -def test_mixed_series(mixed_pdf, mixed_gdf): - for col in mixed_gdf.columns: - assert repr(mixed_gdf[col]) == repr(mixed_pdf[col]) - - -def test_MI(): - rng = np.random.default_rng(seed=0) - gdf = cudf.DataFrame( - { - "a": rng.integers(0, 4, 10), - "b": rng.integers(0, 4, 10), - "c": rng.integers(0, 4, 10), - } - ) - levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] - codes = [ - [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], - [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], - [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - ] - with pd.option_context("display.max_rows", 999, "display.max_columns", 0): - gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes)) - pdf = gdf.to_pandas() - assert repr(gdf) == repr(pdf) - assert repr(gdf.index) == repr(pdf.index) - assert repr(gdf.T) == repr(pdf.T) - - -@pytest.mark.parametrize("nrows", [0, 1, 3, 5, 10]) -@pytest.mark.parametrize("ncols", [0, 1, 2, 3]) -def test_groupby_MI(nrows, ncols): - gdf = cudf.DataFrame( - 
{"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} - ) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"], sort=True).count() - pdg = pdf.groupby(["a", "b"], sort=True).count() - with pd.option_context( - "display.max_rows", nrows, "display.max_columns", ncols - ): - assert repr(gdg) == repr(pdg) - assert repr(gdg.index) == repr(pdg.index) - assert repr(gdg.T) == repr(pdg.T) - - -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) -@pytest.mark.parametrize("length", [0, 1, 10, 100, 1000]) -def test_generic_index(length, dtype): - rng = np.random.default_rng(seed=0) - psr = pd.Series( - range(length), - index=rng.integers(0, high=100, size=length).astype(dtype), - dtype="float64" if length == 0 else None, - ) - gsr = cudf.Series.from_pandas(psr) - - assert repr(psr.index) == repr(gsr.index) - - -@pytest.mark.parametrize( - "gdf", - [ - lambda: cudf.DataFrame({"a": range(10000)}), - lambda: cudf.DataFrame({"a": range(10000), "b": range(10000)}), - lambda: cudf.DataFrame({"a": range(20), "b": range(20)}), - lambda: cudf.DataFrame( - { - "a": range(20), - "b": range(20), - "c": ["abc", "def", "xyz", "def", "pqr"] * 4, - } - ), - lambda: cudf.DataFrame(index=[1, 2, 3]), - lambda: cudf.DataFrame(index=range(10000)), - lambda: cudf.DataFrame(columns=["a", "b", "c", "d"]), - lambda: cudf.DataFrame(columns=["a"], index=range(10000)), - lambda: cudf.DataFrame( - columns=["a", "col2", "...col n"], index=range(10000) - ), - lambda: cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), - lambda: cudf.DataFrame( - columns=["a", "b", "c", "d"], - index=cudf.Series(range(10000)).astype("str"), - ), - ], -) -@pytest.mark.parametrize( - "slice", - [ - slice(2500, 5000), - slice(2500, 2501), - slice(5000), - slice(1, 10), - slice(10, 20), - slice(15, 2400), - ], -) -@pytest.mark.parametrize("max_seq_items", [1, 10, 60, 10000, None]) -@pytest.mark.parametrize("max_rows", [1, 10, 60, 10000, None]) -def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): - gdf = gdf() - with pd.option_context( - "display.max_seq_items", max_seq_items, "display.max_rows", max_rows - ): - pdf = gdf.to_pandas() - - sliced_gdf = gdf[slice] - sliced_pdf = pdf[slice] - - expected_repr = repr(sliced_pdf).replace("None", "") - actual_repr = repr(sliced_gdf) - - assert expected_repr == actual_repr - - -@pytest.mark.parametrize( - "index,expected_repr", - [ - ( - lambda: cudf.Index([1, 2, 3, None]), - "Index([1, 2, 3, ], dtype='int64')", - ), - ( - lambda: cudf.Index([None, 2.2, 3.324342, None]), - "Index([, 2.2, 3.324342, ], dtype='float64')", - ), - ( - lambda: cudf.Index([None, None, None], name="hello"), - "Index([, , ], dtype='object', name='hello')", - ), - ( - lambda: cudf.Index( - [None, None, None], dtype="float", name="hello" - ), - "Index([, , ], dtype='float64', name='hello')", - ), - ( - lambda: cudf.Index([None], dtype="float64", name="hello"), - "Index([], dtype='float64', name='hello')", - ), - ( - lambda: cudf.Index([None], dtype="int8", name="hello"), - "Index([], dtype='int8', name='hello')", - ), - ( - lambda: cudf.Index([None] * 50, dtype="object"), - "Index([, , , , , , , , , " - ", , ,\n , , , , , , , " - ", , , , ,\n , , , , " - ", , , , , , , ,\n , " - ", , , , , , , , , , " - ",\n , ],\n dtype='object')", - ), - ( - lambda: cudf.Index([None] * 20, dtype="uint32"), - "Index([, , , , , , , , " - ",\n , , , , , , , , " - ",\n , ],\n dtype='uint32')", - ), - ( - lambda: cudf.Index( - [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" - ), - "Index([, 111, 22, 33, , 23, 34, 
2343, ], " - "dtype='int16')", - ), - ( - lambda: cudf.Index([1, 2, 3, None], dtype="category"), - "CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], " - "ordered=False, dtype='category')", - ), - ( - lambda: cudf.Index([None, None], dtype="category"), - "CategoricalIndex([, ], categories=[], ordered=False, " - "dtype='category')", - ), - ( - lambda: cudf.Index( - np.array([10, 20, 30, None], dtype="datetime64[ns]") - ), - "DatetimeIndex([1970-01-01 00:00:00.000000010, " - "1970-01-01 00:00:00.000000020," - "\n 1970-01-01 00:00:00.000000030, NaT],\n " - "dtype='datetime64[ns]')", - ), - ( - lambda: cudf.Index( - np.array([10, 20, 30, None], dtype="datetime64[s]") - ), - "DatetimeIndex([1970-01-01 00:00:10, " - "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" - " NaT],\n dtype='datetime64[s]')", - ), - ( - lambda: cudf.Index( - np.array([10, 20, 30, None], dtype="datetime64[us]") - ), - "DatetimeIndex([1970-01-01 00:00:00.000010, " - "1970-01-01 00:00:00.000020,\n " - "1970-01-01 00:00:00.000030, NaT],\n " - "dtype='datetime64[us]')", - ), - ( - lambda: cudf.Index( - np.array([10, 20, 30, None], dtype="datetime64[ms]") - ), - "DatetimeIndex([1970-01-01 00:00:00.010, " - "1970-01-01 00:00:00.020,\n " - "1970-01-01 00:00:00.030, NaT],\n " - "dtype='datetime64[ms]')", - ), - ( - lambda: cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), - "DatetimeIndex([NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, " - "NaT, NaT], dtype='datetime64[ms]')", - ), - ], -) -def test_generic_index_null(index, expected_repr): - index = index() - actual_repr = repr(index) - - assert expected_repr == actual_repr - - -@pytest.mark.parametrize( - "df,pandas_special_case", - [ - (pd.DataFrame({"a": [1, 2, 3]}, index=[10, 20, None]), False), - ( - pd.DataFrame( - { - "a": [1, None, 3], - "string_col": ["hello", "world", "rapids"], - }, - index=[None, "a", "b"], - ), - True, - ), - (pd.DataFrame([], index=[None, "a", "b"]), False), - (pd.DataFrame({"aa": [None, None]}, index=[None, None]), False), - (pd.DataFrame({"aa": [1, 2, 3]}, index=[None, None, None]), False), - ( - pd.DataFrame( - {"aa": [None, 2, 3]}, - index=np.array([1, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"aa": [None, 2, 3]}, - index=np.array([100, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"aa": [None, None, None]}, - index=np.array([None, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"aa": [1, None, 3]}, - index=np.array([10, 15, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"a": [1, 2, None], "v": [10, None, 22], "p": [100, 200, 300]} - ).set_index(["a", "v"]), - False, - ), - ( - pd.DataFrame( - { - "a": [1, 2, None], - "v": ["n", "c", "a"], - "p": [None, None, None], - } - ).set_index(["a", "v"]), - False, - ), - ( - pd.DataFrame( - { - "a": np.array([1, None, None], dtype="datetime64[ns]"), - "v": ["n", "c", "a"], - "p": [None, None, None], - } - ).set_index(["a", "v"]), - False, - ), - ], -) -def test_dataframe_null_index_repr(df, pandas_special_case): - pdf = df - gdf = cudf.from_pandas(pdf) - - expected_repr = repr(pdf).replace("NaN", "").replace("None", "") - actual_repr = repr(gdf) - - if pandas_special_case: - # Pandas inconsistently print Index null values - # as `None` at some places and `NaN` at few other places - # Whereas cudf is consistent with strings `null` values - # to be printed as `None` everywhere. 
- actual_repr = repr(gdf).replace("None", "") - - assert expected_repr.split() == actual_repr.split() - - -@pytest.mark.parametrize( - "sr,pandas_special_case", - [ - (pd.Series([1, 2, 3], index=[10, 20, None]), False), - (pd.Series([1, None, 3], name="a", index=[None, "a", "b"]), True), - (pd.Series(None, index=[None, "a", "b"], dtype="float"), True), - (pd.Series([None, None], name="aa", index=[None, None]), False), - (pd.Series([1, 2, 3], index=[None, None, None]), False), - ( - pd.Series( - [None, 2, 3], - index=np.array([1, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.Series( - [None, None, None], - index=np.array([None, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.Series( - [1, None, 3], - index=np.array([10, 15, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"a": [1, 2, None], "v": [10, None, 22], "p": [100, 200, 300]} - ).set_index(["a", "v"])["p"], - False, - ), - ( - pd.DataFrame( - { - "a": [1, 2, None], - "v": ["n", "c", "a"], - "p": [None, None, None], - } - ).set_index(["a", "v"])["p"], - False, - ), - ( - pd.DataFrame( - { - "a": np.array([1, None, None], dtype="datetime64[ns]"), - "v": ["n", "c", "a"], - "p": [None, None, None], - } - ).set_index(["a", "v"])["p"], - False, - ), - ], -) -def test_series_null_index_repr(sr, pandas_special_case): - psr = sr - gsr = cudf.from_pandas(psr) - - expected_repr = repr(psr).replace("NaN", "").replace("None", "") - actual_repr = repr(gsr) - - if pandas_special_case: - # Pandas inconsistently print Index null values - # as `None` at some places and `NaN` at few other places - # Whereas cudf is consistent with strings `null` values - # to be printed as `None` everywhere. - actual_repr = repr(gsr).replace("None", "") - assert expected_repr.split() == actual_repr.split() - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [ - 136457654, - 134736784, - 245345345, - 223432411, - 2343241, - 3634548734, - 23234, - ], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize("dtype", ["timedelta64[s]", "timedelta64[us]"]) -def test_timedelta_series_s_us_repr(data, dtype): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - expected = repr(psr).replace("timedelta64[ns]", dtype) - actual = repr(sr) - - assert expected.split() == actual.split() - - -@pytest.mark.parametrize( - "ser, expected_repr", - [ - ( - lambda: cudf.Series([], dtype="timedelta64[ns]"), - textwrap.dedent( - """ - Series([], dtype: timedelta64[ns]) - """ - ), - ), - ( - lambda: cudf.Series([], dtype="timedelta64[ms]"), - textwrap.dedent( - """ - Series([], dtype: timedelta64[ms]) - """ - ), - ), - ( - lambda: cudf.Series( - [1000000, 200000, 3000000], dtype="timedelta64[ns]" - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.001000 - 1 0 days 00:00:00.000200 - 2 0 days 00:00:00.003000 - dtype: timedelta64[ns] - """ - ), - ), - ( - lambda: cudf.Series( - [1000000, 200000, 3000000], dtype="timedelta64[ms]" - ), - textwrap.dedent( - """ - 0 0 days 00:16:40 - 1 0 days 00:03:20 - 2 0 days 00:50:00 - dtype: timedelta64[ms] - """ - ), - ), 
- ( - lambda: cudf.Series( - [1000000, 200000, None], dtype="timedelta64[ns]" - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 NaT - dtype: timedelta64[ns] - """ - ), - ), - ( - lambda: cudf.Series( - [1000000, 200000, None], dtype="timedelta64[ms]" - ), - textwrap.dedent( - """ - 0 0 days 00:16:40 - 1 0 days 00:03:20 - 2 NaT - dtype: timedelta64[ms] - """ - ), - ), - ( - lambda: cudf.Series( - [None, None, None, None, None], dtype="timedelta64[ns]" - ), - textwrap.dedent( - """ - 0 NaT - 1 NaT - 2 NaT - 3 NaT - 4 NaT - dtype: timedelta64[ns] - """ - ), - ), - ( - lambda: cudf.Series( - [None, None, None, None, None], dtype="timedelta64[ms]" - ), - textwrap.dedent( - """ - 0 NaT - 1 NaT - 2 NaT - 3 NaT - 4 NaT - dtype: timedelta64[ms] - """ - ), - ), - ( - lambda: cudf.Series( - [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ns]" - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.000000012 - 1 0 days 00:00:00.000000012 - 2 0 days 00:00:00.000000022 - 3 0 days 00:00:00.000000343 - 4 0 days 00:00:00.004353534 - 5 0 days 00:00:00.000435342 - dtype: timedelta64[ns] - """ - ), - ), - ( - lambda: cudf.Series( - [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ms]" - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.012000 - 1 0 days 00:00:00.012000 - 2 0 days 00:00:00.022000 - 3 0 days 00:00:00.343000 - 4 0 days 01:12:33.534000 - 5 0 days 00:07:15.342000 - dtype: timedelta64[ms] - """ - ), - ), - ( - lambda: cudf.Series( - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - dtype="timedelta64[ns]", - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.000000001 - 1 0 days 00:00:00.000001132 - 2 0 days 00:00:00.023223231 - 3 0 days 00:00:00.000000233 - 4 0 days 00:00:00 - 5 0 days 00:00:00.000000332 - 6 0 days 00:00:00.000000323 - dtype: timedelta64[ns] - """ - ), - ), - ( - lambda: cudf.Series( - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - dtype="timedelta64[ms]", - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.001000 - 1 0 days 00:00:01.132000 - 2 0 days 06:27:03.231000 - 3 0 days 00:00:00.233000 - 4 0 days 00:00:00 - 5 0 days 00:00:00.332000 - 6 0 days 00:00:00.323000 - dtype: timedelta64[ms] - """ - ), - ), - ( - lambda: cudf.Series( - [ - 13645765432432, - 134736784, - 245345345, - 223432411, - 999992343241, - 3634548734, - 23234, - ], - dtype="timedelta64[ms]", - ), - textwrap.dedent( - """ - 0 157937 days 02:23:52.432000 - 1 1 days 13:25:36.784000 - 2 2 days 20:09:05.345000 - 3 2 days 14:03:52.411000 - 4 11573 days 23:39:03.241000 - 5 42 days 01:35:48.734000 - 6 0 days 00:00:23.234000 - dtype: timedelta64[ms] - """ - ), - ), - ( - lambda: cudf.Series( - [ - 13645765432432, - 134736784, - 245345345, - 223432411, - 999992343241, - 3634548734, - 23234, - ], - dtype="timedelta64[ns]", - ), - textwrap.dedent( - """ - 0 0 days 03:47:25.765432432 - 1 0 days 00:00:00.134736784 - 2 0 days 00:00:00.245345345 - 3 0 days 00:00:00.223432411 - 4 0 days 00:16:39.992343241 - 5 0 days 00:00:03.634548734 - 6 0 days 00:00:00.000023234 - dtype: timedelta64[ns] - """ - ), - ), - ( - lambda: cudf.Series( - [ - 13645765432432, - 134736784, - 245345345, - 223432411, - 999992343241, - 3634548734, - 23234, - ], - dtype="timedelta64[ms]", - name="abc", - ), - textwrap.dedent( - """ - 0 157937 days 02:23:52.432000 - 1 1 days 13:25:36.784000 - 2 2 days 20:09:05.345000 - 3 2 days 14:03:52.411000 - 4 11573 days 23:39:03.241000 - 5 42 days 01:35:48.734000 - 6 0 days 00:00:23.234000 - Name: abc, dtype: timedelta64[ms] - """ - ), - ), - ( - 
lambda: cudf.Series( - [ - 13645765432432, - 134736784, - 245345345, - 223432411, - 999992343241, - 3634548734, - 23234, - ], - dtype="timedelta64[ns]", - index=["a", "b", "z", "x", "y", "l", "m"], - name="hello", - ), - textwrap.dedent( - """ - a 0 days 03:47:25.765432432 - b 0 days 00:00:00.134736784 - z 0 days 00:00:00.245345345 - x 0 days 00:00:00.223432411 - y 0 days 00:16:39.992343241 - l 0 days 00:00:03.634548734 - m 0 days 00:00:00.000023234 - Name: hello, dtype: timedelta64[ns] - """ - ), - ), - ], -) -def test_timedelta_series_ns_ms_repr(ser, expected_repr): - expected = expected_repr - actual = repr(ser()) - - assert expected.split() == actual.split() - - -@pytest.mark.parametrize( - "df,expected_repr", - [ - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series( - [1000000, 200000, 3000000], dtype="timedelta64[s]" - ) - } - ), - textwrap.dedent( - """ - a - 0 11 days 13:46:40 - 1 2 days 07:33:20 - 2 34 days 17:20:00 - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[s]", - ), - "b": [10, 11, 22, 33, 44, 55, 66], - } - ), - textwrap.dedent( - """ - a b - 0 1579 days 08:54:14 10 - 1 NaT 11 - 2 2839 days 15:29:05 22 - 3 2586 days 00:33:31 33 - 4 NaT 44 - 5 42066 days 12:52:14 55 - 6 0 days 06:27:14 66 - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[s]", - index=["a", "b", "c", "d", "e", "f", "g"], - ) - } - ), - textwrap.dedent( - """ - a - a 1579 days 08:54:14 - b NaT - c 2839 days 15:29:05 - d 2586 days 00:33:31 - e NaT - f 42066 days 12:52:14 - g 0 days 06:27:14 - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series( - [1, 2, 3, 4, 5, 6, 7], - index=cudf.Index( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[ms]", - ), - ) - } - ), - textwrap.dedent( - """ - a - 1 days 13:54:17.654 1 - NaT 2 - 2 days 20:09:05.345 3 - 2 days 14:03:52.411 4 - NaT 5 - 42 days 01:35:48.734 6 - 0 days 00:00:23.234 7 - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series( - ["a", "f", "q", "e", "w", "e", "t"], - index=cudf.Index( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[ns]", - ), - ) - } - ), - textwrap.dedent( - """ - a - 0 days 00:00:00.136457654 a - NaT f - 0 days 00:00:00.245345345 q - 0 days 00:00:00.223432411 e - NaT w - 0 days 00:00:03.634548734 e - 0 days 00:00:00.000023234 t - """ - ), - ), - ], -) -def test_timedelta_dataframe_repr(df, expected_repr): - actual_repr = repr(df()) - - assert actual_repr.split() == expected_repr.split() - - -@pytest.mark.parametrize( - "index, expected_repr", - [ - ( - lambda: cudf.Index( - [1000000, 200000, 3000000], dtype="timedelta64[ms]" - ), - "TimedeltaIndex(['0 days 00:16:40', " - "'0 days 00:03:20', '0 days 00:50:00'], " - "dtype='timedelta64[ms]')", - ), - ( - lambda: cudf.Index( - [None, None, None, None, None], dtype="timedelta64[us]" - ), - "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " - "dtype='timedelta64[us]')", - ), - ( - lambda: cudf.Index( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[us]", - ), - "TimedeltaIndex([0 days 00:02:16.457654, NaT, " - "0 days 00:04:05.345345, " - "0 days 00:03:43.432411, NaT," - " 0 days 01:00:34.548734, 0 days 00:00:00.023234]," - " 
dtype='timedelta64[us]')", - ), - ( - lambda: cudf.Index( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[s]", - ), - "TimedeltaIndex([1579 days 08:54:14, NaT, 2839 days 15:29:05," - " 2586 days 00:33:31, NaT, 42066 days 12:52:14, " - "0 days 06:27:14]," - " dtype='timedelta64[s]')", - ), - ], -) -def test_timedelta_index_repr(index, expected_repr): - actual_repr = repr(index()) - - assert actual_repr.split() == expected_repr.split() - - -@pytest.mark.parametrize( - "pmi", - [ - pd.MultiIndex.from_tuples( - [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] - ), - pd.MultiIndex.from_tuples( - [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] * 10 - ), - pd.MultiIndex.from_tuples([(1, "red", 102, "sdf")]), - pd.MultiIndex.from_tuples( - [ - ("abc", 0.234, 1), - ("a", -0.34, 0), - ("ai", 111, 4385798), - ("rapids", 0, 34534534), - ], - names=["alphabets", "floats", "ints"], - ), - ], -) -@pytest.mark.parametrize("max_seq_items", [None, 1, 2, 5, 10, 100]) -def test_multiindex_repr(pmi, max_seq_items): - with pd.option_context("display.max_seq_items", max_seq_items): - gmi = cudf.from_pandas(pmi) - - assert repr(gmi) == repr(pmi) - - -@pytest.mark.parametrize( - "gdi, expected_repr", - [ - ( - lambda: cudf.DataFrame( - { - "a": [None, 1, 2, 3], - "b": ["abc", None, "xyz", None], - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b"]) - .index, - textwrap.dedent( - """ - MultiIndex([(, 'abc'), - ( 1, ), - ( 2, 'xyz'), - ( 3, )], - names=['a', 'b']) - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series([None, np.nan, 2, 3], nan_as_null=False), - "b": ["abc", None, "xyz", None], - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b"]) - .index, - textwrap.dedent( - """ - MultiIndex([(, 'abc'), - ( nan, ), - ( 2.0, 'xyz'), - ( 3.0, )], - names=['a', 'b']) - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series([None, 1, 2, 3], dtype="datetime64[ns]"), - "b": ["abc", None, "xyz", None], - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b"]) - .index, - textwrap.dedent( - """ - MultiIndex([( 'NaT', 'abc'), - ('1970-01-01 00:00:00.000000001', ), - ('1970-01-01 00:00:00.000000002', 'xyz'), - ('1970-01-01 00:00:00.000000003', )], - names=['a', 'b']) - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series([None, 1, 2, 3], dtype="datetime64[ns]"), - "b": ["abc", None, "xyz", None], - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b", "c"]) - .index, - textwrap.dedent( - """ - MultiIndex([( 'NaT', 'abc', 0.345), - ('1970-01-01 00:00:00.000000001', , ), - ('1970-01-01 00:00:00.000000002', 'xyz', 100.0), - ('1970-01-01 00:00:00.000000003', , 10.0)], - names=['a', 'b', 'c']) - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": ["abc", None, "xyz", None], - "b": cudf.Series([None, 1, 2, 3], dtype="timedelta64[ns]"), - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b", "c"]) - .index, - textwrap.dedent( - """ - MultiIndex([('abc', NaT, 0.345), - ( , '0 days 00:00:00.000000001', ), - ('xyz', '0 days 00:00:00.000000002', 100.0), - ( , '0 days 00:00:00.000000003', 10.0)], - names=['a', 'b', 'c']) - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": ["abc", None, "xyz", None], - "b": cudf.Series([None, 1, 2, 3], dtype="timedelta64[ns]"), - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["c", "a"]) - .index, - textwrap.dedent( - """ - MultiIndex([(0.345, 'abc'), - ( , ), - (100.0, 'xyz'), - ( 10.0, )], - names=['c', 'a']) - """ - ), - ), - ( - 
lambda: cudf.DataFrame( - { - "a": [None, None, None, None], - "b": cudf.Series( - [None, None, None, None], dtype="timedelta64[ns]" - ), - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["b", "a"]) - .index, - textwrap.dedent( - """ - MultiIndex([(NaT, ), - (NaT, ), - (NaT, ), - (NaT, )], - names=['b', 'a']) - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": [1, 2, None, 3, 5], - "b": [ - "abc", - "def, hi, bye", - None, - ", one, two, three, four", - None, - ], - "c": cudf.Series( - [0.3232, np.nan, 1, None, -0.34534], nan_as_null=False - ), - "d": [None, 100, 2000324, None, None], - } - ) - .set_index(["a", "b", "c", "d"]) - .index, - textwrap.dedent( - """ - MultiIndex([( 1, 'abc', 0.3232, ), - ( 2, 'def, hi, bye', nan, 100), - (, , 1.0, 2000324), - ( 3, ', one, two, three, four', , ), - ( 5, , -0.34534, )], - names=['a', 'b', 'c', 'd']) - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": [1, 2, None, 3, 5], - "b": [ - "abc", - "def, hi, bye", - None, - ", one, two, three, four", - None, - ], - "c": cudf.Series( - [0.3232, np.nan, 1, None, -0.34534], nan_as_null=False - ), - "d": [None, 100, 2000324, None, None], - } - ) - .set_index(["b", "a", "c", "d"]) - .index, - textwrap.dedent( - """ - MultiIndex([( 'abc', 1, 0.3232, ), - ( 'def, hi, bye', 2, nan, 100), - ( , , 1.0, 2000324), - (', one, two, three, four', 3, , ), - ( , 5, -0.34534, )], - names=['b', 'a', 'c', 'd']) - """ - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": ["(abc", "2", None, "3", "5"], - "b": [ - "abc", - "def, hi, bye", - None, - ", one, two, three, four", - None, - ], - "c": cudf.Series( - [0.3232, np.nan, 1, None, -0.34534], nan_as_null=False - ), - "d": [None, 100, 2000324, None, None], - } - ) - .set_index(["a", "b", "c", "d"]) - .index, - textwrap.dedent( - """ - MultiIndex([('(abc', 'abc', 0.3232, ), - ( '2', 'def, hi, bye', nan, 100), - ( , , 1.0, 2000324), - ( '3', ', one, two, three, four', , ), - ( '5', , -0.34534, )], - names=['a', 'b', 'c', 'd']) - """ - ), - ), - ], -) -def test_multiindex_null_repr(gdi, expected_repr): - actual_repr = repr(gdi()) - - assert actual_repr.split() == expected_repr.split() - - -def test_categorical_series_with_nan_repr(): - series = cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") - - expected_repr = textwrap.dedent( - """ - 0 1.0 - 1 2.0 - 2 NaN - 3 10.0 - 4 NaN - 5 - dtype: category - Categories (4, float64): [1.0, 2.0, 10.0, NaN] - """ - ) - - assert repr(series).split() == expected_repr.split() - - sliced_expected_repr = textwrap.dedent( - """ - 2 NaN - 3 10.0 - 4 NaN - 5 - dtype: category - Categories (4, float64): [1.0, 2.0, 10.0, NaN] - """ - ) - - assert repr(series[2:]).split() == sliced_expected_repr.split() - - -def test_categorical_dataframe_with_nan_repr(): - series = cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") - df = cudf.DataFrame({"a": series}) - expected_repr = textwrap.dedent( - """ - a - 0 1.0 - 1 2.0 - 2 NaN - 3 10.0 - 4 NaN - 5 - """ - ) - - assert repr(df).split() == expected_repr.split() - - -def test_categorical_index_with_nan_repr(): - cat_index = cudf.Index( - cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") - ) - - expected_repr = ( - "CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, ], " - "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')" - ) - - assert repr(cat_index) == expected_repr - - sliced_expected_repr = ( - "CategoricalIndex([NaN, 10.0, NaN, ], " - "categories=[1.0, 2.0, 10.0, 
NaN], ordered=False, dtype='category')" - ) - - assert repr(cat_index[2:]) == sliced_expected_repr - - -def test_empty_series_name(): - ps = pd.Series([], name="abc", dtype="int") - gs = cudf.from_pandas(ps) - - assert repr(ps) == repr(gs) - - -def test_repr_struct_after_concat(): - df = cudf.DataFrame( - { - "a": cudf.Series( - [ - {"sa": 2056831253}, - {"sa": -1463792165}, - {"sa": 1735783038}, - {"sa": 103774433}, - {"sa": -1413247520}, - ] - * 13 - ), - "b": cudf.Series( - [ - {"sa": {"ssa": 1140062029}}, - None, - {"sa": {"ssa": 1998862860}}, - {"sa": None}, - {"sa": {"ssa": -395088502}}, - ] - * 13 - ), - } - ) - pdf = df.to_pandas() - - assert repr(df) == repr(pdf) - - -def test_interval_index_repr(): - pi = pd.Index( - [ - np.nan, - pd.Interval(2.0, 3.0, closed="right"), - pd.Interval(3.0, 4.0, closed="right"), - ] - ) - gi = cudf.from_pandas(pi) - - assert repr(pi) == repr(gi) - - -def test_unique_categories_repr(): - pi = pd.CategoricalIndex(range(10_000)) - gi = cudf.CategoricalIndex(range(10_000)) - expected_repr = repr(pi) - actual_repr = repr(gi) - assert expected_repr == actual_repr - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_categorical_index_ordered(ordered): - pi = pd.CategoricalIndex(range(10), ordered=ordered) - gi = cudf.CategoricalIndex(range(10), ordered=ordered) - - assert repr(pi) == repr(gi) From 1bf6ba289a4229a924589a2efba83c8bd157dcff Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 13 Aug 2025 15:55:37 -0700 Subject: [PATCH 125/366] Run pylibcudf tests without its optional dependencies (#19657) This PR changes cudf CI to run pylibcudf tests twice, once with optionals installed and once without. I'd love to be able to test without pyarrow for our interop bits, but I don't think it's realistic given how heavily we rely on pyarrow in the test suite. Resolves #18201 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19657 --- ci/test_wheel_cudf.sh | 36 ++++++++++++++++++++++++++------- dependencies.yaml | 27 ++++++++++++++++++++++--- python/pylibcudf/pyproject.toml | 7 ++++++- 3 files changed, 59 insertions(+), 11 deletions(-) diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index e28ac0514a7..9cbd237511d 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -12,11 +12,38 @@ CUDF_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-do LIBCUDF_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp) PYLIBCUDF_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github python) -rapids-logger "Install cudf, pylibcudf, and test requirements" +rapids-logger "Install pylibcudf and its basic dependencies in a virtual environment" # generate constraints (possibly pinning to oldest support versions of dependencies) rapids-generate-pip-constraints py_test_cudf ./constraints.txt +RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ +mkdir -p "${RAPIDS_TESTS_DIR}" + +# To test pylibcudf without its optional dependencies, we create a virtual environment +python -m venv env +. 
env/bin/activate +rapids-pip-retry install \ + -v \ + --constraint ./constraints.txt \ + --constraint "${PIP_CONSTRAINT}" \ + "$(echo "${LIBCUDF_WHEELHOUSE}"/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ + "$(echo "${PYLIBCUDF_WHEELHOUSE}"/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" + +rapids-logger "pytest pylibcudf without optional dependencies" +pushd python/pylibcudf/tests +python -m pytest \ + --cache-clear \ + --numprocesses=8 \ + --dist=worksteal \ + . +popd + +deactivate + +rapids-logger "Install cudf, pylibcudf, and test requirements" + # notes: # # * echo to expand wildcard before adding `[test]` requires for pip @@ -29,12 +56,7 @@ rapids-pip-retry install \ --constraint "${PIP_CONSTRAINT}" \ "$(echo "${CUDF_WHEELHOUSE}"/cudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" \ "$(echo "${LIBCUDF_WHEELHOUSE}"/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ - "$(echo "${PYLIBCUDF_WHEELHOUSE}"/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" - -RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ -mkdir -p "${RAPIDS_TESTS_DIR}" - + "$(echo "${PYLIBCUDF_WHEELHOUSE}"/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test, pyarrow, numpy]" rapids-logger "pytest pylibcudf" pushd python/pylibcudf/tests diff --git a/dependencies.yaml b/dependencies.yaml index 504a2b81f96..37883b03b4f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -23,6 +23,7 @@ files: - docs - iwyu - notebooks + - numpy_run - py_version - pyarrow_run - rapids_build_skbuild @@ -69,6 +70,7 @@ files: - test_python_common - test_python_cudf_common - test_python_cudf + - numpy_run - test_python_pylibcudf - depends_on_cudf - depends_on_pylibcudf @@ -77,6 +79,7 @@ files: output: none includes: - cuda_version + - numpy_run - py_version - test_python_common - test_python_pylibcudf @@ -86,6 +89,7 @@ files: output: none includes: - cuda_version + - numpy_run - py_version - test_python_common - test_python_cudf_common @@ -169,6 +173,7 @@ files: extras: table: project includes: + - numpy_run - run_common - run_cudf - pyarrow_run @@ -246,6 +251,14 @@ files: - depends_on_libcudf - depends_on_rmm - run_pylibcudf + py_run_pylibcudf_numpy: + output: pyproject + pyproject_dir: python/pylibcudf + extras: + table: project.optional-dependencies + key: numpy + includes: + - numpy_run py_run_pylibcudf_pyarrow: output: pyproject pyproject_dir: python/pylibcudf @@ -261,6 +274,7 @@ files: table: project.optional-dependencies key: test includes: + - depends_on_cupy - pyarrow_run - test_python_common - test_python_cudf_common @@ -311,6 +325,7 @@ files: table: project.optional-dependencies key: test includes: + - numpy_run - test_python_common - test_python_cudf_polars py_build_dask_cudf: @@ -326,6 +341,7 @@ files: extras: table: project includes: + - numpy_run - run_common - run_dask_cudf - depends_on_cudf @@ -499,6 +515,11 @@ dependencies: - matrix: # Fallback for no matrix packages: - *numba_cuda_cu12 + numpy_run: + common: + - output_types: [conda, requirements, pyproject] + packages: + - numpy>=1.23,<3.0a0 pyarrow_run: common: - output_types: [conda] @@ -642,7 +663,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - &numpy numpy>=1.23,<3.0a0 - pandas>=2.0,<2.4.0dev0 run_pylibcudf: common: @@ -815,14 +835,16 @@ dependencies: - mmh3 - nanoarrow - hypothesis>=6.131.7 - - *numpy + - *numba - pandas - output_types: conda packages: - python-xxhash + - *numba_cuda - output_types: [pyproject, requirements] packages: - xxhash + - *numba_cuda_cu12 
test_python_cudf: common: - output_types: [conda, requirements, pyproject] @@ -888,7 +910,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - dask-cuda==25.10.*,>=0.0.0a0 - - *numpy - rich test_python_narwhals: common: diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 0c7f89111e3..081d4e54a4c 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -39,11 +39,13 @@ classifiers = [ [project.optional-dependencies] test = [ + "cupy-cuda12x>=12.0.0", "fastavro>=0.22.9", "hypothesis>=6.131.7", "mmh3", "nanoarrow", - "numpy>=1.23,<3.0a0", + "numba-cuda[cu12]>=0.18.0,<0.19.0a0", + "numba>=0.60.0,<0.62.0a0", "pandas", "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", @@ -56,6 +58,9 @@ pyarrow = [ "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +numpy = [ + "numpy>=1.23,<3.0a0", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cudf" From f9fcc122dbcd0057043c879b19ec65be5f4e6856 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:23:06 -0400 Subject: [PATCH 126/366] Support output-type for MEDIAN/QUANTILE aggregation in cudf::reduce (#19267) Adds support to return the aggregation result for MEDIAN and QUANTILE as specified in the `output_type` parameter in the `cudf::reduce` API. The underlying code calls `cudf::detail::quantile` which has an `exact` parameter that returns a `double` type if set to true. Otherwise, it will return a value using the input type. The code change here is to set `exact` to true only if the `output_type` for `cudf::reduce` is set to `FLOAT64` This means the current behavior can be achieved by always setting the `output_type` to `FLOAT64`. 
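
For reference, a minimal sketch of how the new behavior is exercised
(assumes an existing `cudf::column_view` named `input` of type INT32; the
variable names are illustrative only):

```cpp
#include <cudf/aggregation.hpp>
#include <cudf/reduction.hpp>
#include <cudf/types.hpp>

// Only a single quantile value is accepted by cudf::reduce.
auto agg = cudf::make_quantile_aggregation<cudf::reduce_aggregation>(
  {0.5}, cudf::interpolation::LINEAR);

// output_type FLOAT64 -> exact (interpolated double) quantile, the prior behavior
auto exact = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::FLOAT64});

// any other output_type -> non-exact quantile, cast to the requested type
auto inexact = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT32});
```

With `exact == false`, `cudf::detail::quantile` selects a value of the input
type, which is then cast to `output_type` instead of being routed through a
double.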
Authors: - David Wendt (https://github.com/davidwendt) - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19267 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/reduction.hpp | 80 +++++------ .../reduction/detail/reduction_functions.hpp | 22 +++ cpp/src/reductions/quantile.cu | 58 ++++++++ cpp/src/reductions/reductions.cpp | 45 ++---- cpp/tests/reductions/reduction_tests.cpp | 132 +++++++++++------- .../cudf_polars/tests/expressions/test_agg.py | 21 +-- 7 files changed, 218 insertions(+), 141 deletions(-) create mode 100644 cpp/src/reductions/quantile.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 50b96a9baf4..6b9601856c0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -638,6 +638,7 @@ add_library( src/reductions/nth_element.cu src/reductions/nunique.cu src/reductions/product.cu + src/reductions/quantile.cu src/reductions/reductions.cpp src/reductions/scan/rank_scan.cu src/reductions/scan/ewm.cu diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp index a96bde9cefd..e4150f4153a 100644 --- a/cpp/include/cudf/reduction.hpp +++ b/cpp/include/cudf/reduction.hpp @@ -35,13 +35,14 @@ namespace CUDF_EXPORT cudf { */ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE }; +// clang-format off /** * @brief Computes the reduction of the values in all rows of a column. * - * This function does not detect overflows in reductions. When `output_dtype` + * This function does not detect overflows in reductions. When `output_type` * does not match the `col.type()`, their values may be promoted to * `int64_t` or `double` for computing aggregations and then cast to - * `output_dtype` before returning. + * `output_type` before returning. * * Only `min` and `max` ops are supported for reduction of non-arithmetic * types (e.g. timestamp or string). @@ -49,46 +50,44 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE }; * Any null values are skipped for the operation. * If the reduction fails, the output scalar returns with `%is_valid()==false`. * - * For empty or all-null input, the result is generally a null scalar except for specific + * For empty or all-null input, the result is generally an invalid scalar except for specific * aggregations where the aggregation has a well-defined output. * - * If the input column is an arithmetic type, the `output_dtype` can be any arithmetic + * If the input column is an arithmetic type, the `output_type` can be any arithmetic * type. If the input column is a non-arithmetic type (e.g. timestamp or string) - * the `output_dtype` must match the `col.type()`. If the reduction type is `any` or - * `all`, the `output_dtype` must be type BOOL8. + * the `output_type` must match the `col.type()`. If the reduction type is `any` or + * `all`, the `output_type` must be type BOOL8. 
* * | Aggregation | Output Type | Init Value | Empty Input | Comments | - * | :---------: | ----------- | ---------- | ----------- | -------- | - * | SUM/PRODUCT | output_dtype | yes | NA | Input accumulated into output_dtype variable | - * | SUM_OF_SQUARES | output_dtype | no | NA | Input accumulated into output_dtype variable | - * | MIN/MAX | col.type | yes | NA | Supports arithmetic, timestamp, duration, string | + * | :---------: | ----------- | :--------: | ----------- | -------- | + * | SUM/PRODUCT | output_type | yes | NA | Input accumulated into output_type variable | + * | SUM_OF_SQUARES | output_type | no | NA | Input accumulated into output_type variable | + * | MIN/MAX | col.type | yes | NA | Supports arithmetic, timestamp, duration, string types only | * | ANY/ALL | BOOL8 | yes | True for ALL only | Checks for non-zero elements | - * | MEAN/VARIANCE/STD | FLOAT32/FLOAT64 | no | NA | output_dtype must be a float type | - * | MEDIAN/QUANTILE | FLOAT64 | no | NA | | - * | NUNIQUE | output_dtype | no | 1 if all-nulls | May process null rows | + * | MEAN/VARIANCE/STD | FLOAT32/FLOAT64 | no | NA | output_type must be a float type | + * | MEDIAN/QUANTILE | output_type | no | NA | Exact value if output_type is FLOAT64. See @ref cudf::quantile | + * | NUNIQUE | output_type | no | 1 if all-nulls | May process null rows | * | NTH_ELEMENT | col.type | no | NA | | * | BITWISE_AGG | col.type | no | NA | Supports only integral types | - * | HISTOGRAM/MERGE_HISTOGRAM | LIST of col.type | no | empty list | | - * | COLLECT_LIST/COLLECT_SET | LIST of col.type | no | empty list | | - * | TDIGEST/MERGE_TDIGEST | STRUCT | no | empty struct | tdigest scalar is returned | - * | HOST_UDF | output_dtype | yes | NA | Custom UDF could ignore output_dtype | + * | HISTOGRAM/MERGE_HISTOGRAM | LIST of col.type | no | empty list returned | | + * | COLLECT_LIST/COLLECT_SET | LIST of col.type | no | empty list returned | | + * | TDIGEST/MERGE_TDIGEST | STRUCT | no | empty struct returned | tdigest scalar is returned | + * | HOST_UDF | output_type | yes | NA | Custom UDF could ignore output_type | * * The NA in the table indicates an output scalar with `%is_valid()==false` * - * @throw cudf::logic_error if reduction is called for non-arithmetic output + * @throw std::invalid_argument if reduction is called for non-arithmetic output * type and operator other than `min` and `max`. - * @throw cudf::logic_error if input column data type is not convertible to - * `output_dtype`. - * @throw cudf::logic_error if `min` or `max` reduction is called and the + * @throw std::invalid_argument if input column data type is not convertible to `output_type`. + * @throw std::invalid_argument if `min` or `max` reduction is called and the * output type does not match the input column data type. - * @throw cudf::logic_error if `any` or `all` reduction is called and the - * output type is not BOOL8. - * @throw cudf::logic_error if `mean`, `var`, or `std` reduction is called and - * the `output_dtype` is not floating point. + * @throw std::invalid_argument if `any` or `all` reduction is called and the output type is not BOOL8. + * @throw std::invalid_argument if `mean`, `var`, or `std` reduction is called and + * the `output_type` is not floating point. 
* * @param col Input column view * @param agg Aggregation operator applied by the reduction - * @param output_dtype The output scalar type + * @param output_type The output scalar type * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned scalar's device memory * @returns Output scalar with reduce result @@ -96,9 +95,10 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE }; std::unique_ptr reduce( column_view const& col, reduce_aggregation const& agg, - data_type output_dtype, + data_type output_type, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +// clang-format on /** * @brief Computes the reduction of the values in all rows of a column with an initial value @@ -108,12 +108,12 @@ std::unique_ptr reduce( * @see cudf::reduce(column_view const&,reduce_aggregation * const&,data_type,rmm::cuda_stream_view,rmm::device_async_resource_ref) for more details * - * @throw cudf::logic_error if reduction is not `sum`, `product`, `min`, `max`, `any`, or `all` + * @throw std::invalid_argument if reduction is not `sum`, `product`, `min`, `max`, `any`, or `all` * and `init` is specified. * * @param col Input column view * @param agg Aggregation operator applied by the reduction - * @param output_dtype The output scalar type + * @param output_type The output scalar type * @param init The initial value of the reduction * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned scalar's device memory @@ -122,7 +122,7 @@ std::unique_ptr reduce( std::unique_ptr reduce( column_view const& col, reduce_aggregation const& agg, - data_type output_dtype, + data_type output_type, std::optional> init, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -130,10 +130,10 @@ std::unique_ptr reduce( /** * @brief Compute reduction of each segment in the input column * - * This function does not detect overflows in reductions. When `output_dtype` + * This function does not detect overflows in reductions. When `output_type` * does not match the `segmented_values.type()`, their values may be promoted to * `int64_t` or `double` for computing aggregations and then cast to - * `output_dtype` before returning. + * `output_type` before returning. * * Null values are treated as identities during reduction. * @@ -143,7 +143,7 @@ std::unique_ptr reduce( * If any index in `offsets` is out of bound of `segmented_values`, the behavior * is undefined. * - * If the input column has arithmetic type, `output_dtype` can be any arithmetic + * If the input column has arithmetic type, `output_type` can be any arithmetic * type. If the input column has non-arithmetic type, e.g. timestamp, the same * output type must be specified. * @@ -152,17 +152,17 @@ std::unique_ptr reduce( * @throw cudf::logic_error if reduction is called for non-arithmetic output * type and operator other than `min` and `max`. * @throw cudf::logic_error if input column data type is not convertible to - * `output_dtype` type. + * `output_type` type. * @throw cudf::logic_error if `min` or `max` reduction is called and the - * `output_dtype` does not match the input column data type. + * `output_type` does not match the input column data type. 
 * @throw cudf::logic_error if `any` or `all` reduction is called and the
- * `output_dtype` is not BOOL8.
+ * `output_type` is not BOOL8.
 *
 * @param segmented_values Column view of segmented inputs
 * @param offsets Each segment's offset of `segmented_values`. A list of offsets with size
 * `num_segments + 1`. The size of `i`th segment is `offsets[i+1] - offsets[i]`.
 * @param agg Aggregation operator applied by the reduction
- * @param output_dtype The output column type
+ * @param output_type The output column type
 * @param null_handling If `INCLUDE`, the reduction is valid if all elements in a segment are valid,
 * otherwise null. If `EXCLUDE`, the reduction is valid if any element in the segment is valid,
 * otherwise null.
@@ -174,7 +174,7 @@ std::unique_ptr<column> segmented_reduce(
   column_view const& segmented_values,
   device_span<size_type const> offsets,
   segmented_reduce_aggregation const& agg,
-  data_type output_dtype,
+  data_type output_type,
   null_policy null_handling,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
@@ -187,7 +187,7 @@ std::unique_ptr<column> segmented_reduce(
 * @param offsets Each segment's offset of `segmented_values`. A list of offsets with size
 * `num_segments + 1`. The size of `i`th segment is `offsets[i+1] - offsets[i]`.
 * @param agg Aggregation operator applied by the reduction
- * @param output_dtype The output column type
+ * @param output_type The output column type
 * @param null_handling If `INCLUDE`, the reduction is valid if all elements in a segment are valid,
 * otherwise null. If `EXCLUDE`, the reduction is valid if any element in the segment is valid,
 * otherwise null.
@@ -200,7 +200,7 @@ std::unique_ptr<column> segmented_reduce(
   column_view const& segmented_values,
   device_span<size_type const> offsets,
   segmented_reduce_aggregation const& agg,
-  data_type output_dtype,
+  data_type output_type,
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream = cudf::get_default_stream(),
diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
index a90f43a3b54..910e3b9c2e3 100644
--- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
@@ -367,6 +367,28 @@ std::unique_ptr<scalar> bitwise_reduction(bitwise_op bit_op,
                                           rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr);

+/**
+ * @brief Computes the quantile value of the elements in the input column
+ *
+ * @see cudf::quantile for additional details
+ *
+ * @throw std::invalid_argument if the input column type is not an arithmetic type
+ *
+ * @param col Input column to compute quantile
+ * @param quantile_value Quantile value in range [0,1]
+ * @param interpolation Interpolation method
+ * @param output_type Data type of the returned scalar
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned scalar's device memory
+ * @return Quantile as scalar of type `output_type`
+ */
+std::unique_ptr<scalar> quantile(column_view const& col,
+                                 double quantile_value,
+                                 cudf::interpolation interpolation,
+                                 cudf::data_type const output_type,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::device_async_resource_ref mr);
+
 /**
  * @brief Computes the number of unique elements in the input column
  *
diff --git a/cpp/src/reductions/quantile.cu b/cpp/src/reductions/quantile.cu
new file mode 100644
index 00000000000..946ed377fe7
--- /dev/null
+++ b/cpp/src/reductions/quantile.cu
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "simple.cuh"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace cudf {
+namespace reduction {
+namespace detail {
+
+std::unique_ptr<scalar> quantile(column_view const& col,
+                                 double quantile_value,
+                                 cudf::interpolation interpolation,
+                                 cudf::data_type const output_type,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::device_async_resource_ref mr)
+{
+  auto current_mr = cudf::get_current_device_resource_ref();
+  auto sorted_indices =
+    cudf::detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, current_mr);
+  auto valid_sorted_indices =
+    cudf::detail::split(*sorted_indices, {col.size() - col.null_count()}, stream)[0];
+  // only perform an exact quantile calculation for output-type FLOAT64
+  // @see cudf::quantile for more details on this parameter
+  auto exact   = output_type.id() == cudf::type_id::FLOAT64;
+  auto col_ptr = cudf::detail::quantile(
+    col, {quantile_value}, interpolation, valid_sorted_indices, exact, stream, current_mr);
+  auto result = cudf::detail::get_element(*col_ptr, 0, stream, mr);
+  if (result->type().id() == output_type.id()) { return result; }
+  return cudf::type_dispatcher(output_type,
+                               cudf::reduction::simple::detail::cast_numeric_scalar_fn<double>{},
+                               static_cast<numeric_scalar<double>*>(result.get()),
+                               stream,
+                               mr);
+}
+}  // namespace detail
+}  // namespace reduction
+}  // namespace cudf
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index 3dd0db63c0c..4c75cc312ba 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -67,33 +67,14 @@ std::unique_ptr<scalar> reduce_aggregate_impl(
       return standard_deviation(col, output_dtype, var_agg._ddof, stream, mr);
     }
     case aggregation::MEDIAN: {
-      auto current_mr = cudf::get_current_device_resource_ref();
-      auto sorted_indices =
-        cudf::detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, current_mr);
-      auto valid_sorted_indices =
-        cudf::detail::split(*sorted_indices, {col.size() - col.null_count()}, stream)[0];
-      auto col_ptr = cudf::detail::quantile(
-        col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream, current_mr);
-      return cudf::detail::get_element(*col_ptr, 0, stream, mr);
+      return quantile(col, 0.5, interpolation::LINEAR, output_dtype, stream, mr);
     }
     case aggregation::QUANTILE: {
-      auto quantile_agg = static_cast<cudf::detail::quantile_aggregation const&>(agg);
-      CUDF_EXPECTS(quantile_agg._quantiles.size() == 1,
-                   "Reduction quantile accepts only one quantile value");
-      auto current_mr = cudf::get_current_device_resource_ref();
-      auto sorted_indices =
-        cudf::detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, current_mr);
-      auto valid_sorted_indices =
-        cudf::detail::split(*sorted_indices, {col.size() - col.null_count()}, stream)[0];
-
-      auto col_ptr = cudf::detail::quantile(col,
-                                            quantile_agg._quantiles,
-                                            quantile_agg._interpolation,
-                                            valid_sorted_indices,
-                                            true,
-                                            stream,
-                                            current_mr);
-      return cudf::detail::get_element(*col_ptr, 0, stream, mr);
+      auto qagg = static_cast<cudf::detail::quantile_aggregation const&>(agg);
+      CUDF_EXPECTS(qagg._quantiles.size() == 1,
+                   "Reduction quantile accepts only one quantile value",
+                   std::invalid_argument);
+      return quantile(col, qagg._quantiles.front(), qagg._interpolation, output_dtype, stream, mr);
     }
     case aggregation::NUNIQUE: {
       auto nunique_agg = static_cast<cudf::detail::nunique_aggregation const&>(agg);
@@ -121,13 +102,15 @@
     }
     case aggregation::TDIGEST: {
       CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT,
-                   "Tdigest aggregations expect output type to be STRUCT");
+                   "Tdigest aggregations expect output type to be STRUCT",
+                   std::invalid_argument);
       auto td_agg = static_cast<cudf::detail::tdigest_aggregation const&>(agg);
       return tdigest::detail::reduce_tdigest(col, td_agg.max_centroids, stream, mr);
     }
     case aggregation::MERGE_TDIGEST: {
       CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT,
-                   "Tdigest aggregations expect output type to be STRUCT");
+                   "Tdigest aggregations expect output type to be STRUCT",
+                   std::invalid_argument);
       auto td_agg = static_cast<cudf::detail::merge_tdigest_aggregation const&>(agg);
       return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr);
     }
@@ -135,14 +118,15 @@
       auto const& udf_base_ptr = dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
       auto const udf_ptr       = dynamic_cast<cudf::reduce_host_udf const*>(udf_base_ptr.get());
-      CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for reduction.");
+      CUDF_EXPECTS(
+        udf_ptr != nullptr, "Invalid HOST_UDF instance for reduction.", std::invalid_argument);
       return (*udf_ptr)(col, output_dtype, init, stream, mr);
     }
     case aggregation::BITWISE_AGG: {
       auto const bitwise_agg = static_cast<cudf::detail::bitwise_aggregation const&>(agg);
       return bitwise_reduction(bitwise_agg.bit_op, col, stream, mr);
     }
-    default: CUDF_FAIL("Unsupported reduction operator");
+    default: CUDF_FAIL("Unsupported reduction operator", std::invalid_argument);
   }
 }
@@ -214,7 +198,8 @@ std::unique_ptr<scalar> reduce(column_view const& col,
         agg.kind == aggregation::HOST_UDF)) {
     CUDF_FAIL(
       "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF "
-      "aggregation types");
+      "aggregation types",
+      std::invalid_argument);
   }

   // Returns default scalar if input column is empty or all null
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 42ffcf7d45a..ff0806d6dfa 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1117,6 +1117,20 @@ TEST_F(ReductionEmptyTest, empty_column)
   result = cudf::reduce(col_nulls, *nunique_agg, size_data_type);
   EXPECT_EQ(result->is_valid(), true);
   EXPECT_EQ(dynamic_cast<cudf::numeric_scalar<cudf::size_type>*>(result.get())->value(), 1);
+
+  auto double_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto median_agg  = cudf::make_median_aggregation<cudf::reduce_aggregation>();
+  result           = cudf::reduce(col0, *median_agg, double_type);
+  EXPECT_EQ(result->is_valid(), false);
+  result = cudf::reduce(col_nulls, *median_agg, double_type);
+  EXPECT_EQ(result->is_valid(), false);
+
+  auto quantile_agg =
+    cudf::make_quantile_aggregation<cudf::reduce_aggregation>({0.0}, cudf::interpolation::LINEAR);
+  result = cudf::reduce(col0, *quantile_agg, double_type);
+  EXPECT_EQ(result->is_valid(), false);
+  result = cudf::reduce(col_nulls, *quantile_agg, double_type);
+  EXPECT_EQ(result->is_valid(), false);
 }

 TEST_F(ReductionEmptyTest, Errors)
@@ -1420,6 +1434,7 @@ TYPED_TEST(ReductionTest, Median)
   std::vector<int> int_values({6, -14, 13, 64, 0, -13, -20, 45});
   std::vector<bool> host_bools({true, true, true, false, true, true, true, true});
   std::vector<T> v = convert_values<T>(int_values);
+  auto output_type = cudf::data_type{cudf::type_to_id<T>()};
   // test without nulls
   cudf::test::fixed_width_column_wrapper<T> col(v.begin(), v.end());
   double expected_value = [] {
     if (std::is_signed_v<T>) return 3.0;
     return 13.5;
   }();
-  EXPECT_EQ(
-    this->template reduction_test<T, double>(col, *cudf::make_median_aggregation<cudf::reduce_aggregation>())
-      .first,
-    expected_value);
+  EXPECT_EQ(this
+              ->template reduction_test<T, double>(
+                col, *cudf::make_median_aggregation<cudf::reduce_aggregation>(), output_type)
+              .first,
+            expected_value);

   auto col_odd              = cudf::split(col, {1})[1];
   double expected_value_odd = [] {
     if (std::is_signed_v<T>) return 0.0;
     return 13.0;
   }();
   EXPECT_EQ(this
               ->template reduction_test<T, double>(
-                col_odd, *cudf::make_median_aggregation<cudf::reduce_aggregation>())
+                col_odd, *cudf::make_median_aggregation<cudf::reduce_aggregation>(), output_type)
               .first,
             expected_value_odd);

   // test with nulls
   cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(v, host_bools);
   double expected_null_value = [] {
     if (std::is_signed_v<T>) return 0.0;
     return 13.0;
   }();

   EXPECT_EQ(this
               ->template reduction_test<T, double>(
-                col_nulls, *cudf::make_median_aggregation<cudf::reduce_aggregation>())
+                col_nulls, *cudf::make_median_aggregation<cudf::reduce_aggregation>(), output_type)
               .first,
             expected_null_value);

   auto col_nulls_odd             = cudf::split(col_nulls, {1})[1];
   double expected_null_value_odd = [] {
     if (std::is_signed_v<T>) return -6.5;
     return 13.5;
   }();
   EXPECT_EQ(this
               ->template reduction_test<T, double>(
-                col_nulls_odd, *cudf::make_median_aggregation<cudf::reduce_aggregation>())
+                col_nulls_odd, *cudf::make_median_aggregation<cudf::reduce_aggregation>(), output_type)
               .first,
             expected_null_value_odd);
 }
@@ -1480,37 +1496,42 @@ TYPED_TEST(ReductionTest, Quantile)
   std::vector<bool> host_bools({true, true, true, false, true, true, true, true});
   std::vector<T> v = convert_values<T>(int_values);
   cudf::interpolation interp{cudf::interpolation::LINEAR};
+  auto output_type = cudf::data_type{cudf::type_to_id<T>()};

   // test without nulls
   cudf::test::fixed_width_column_wrapper<T> col(v.begin(), v.end());

   double expected_value0 = std::is_same_v<T, bool> || std::is_unsigned_v<T> ? v[4] : v[6];
-  EXPECT_EQ(this
-              ->template reduction_test<T, double>(
-                col, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({0.0}, interp))
-              .first,
-            expected_value0);
+  EXPECT_EQ(
+    this
+      ->template reduction_test<T, double>(
+        col, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({0.0}, interp), output_type)
+      .first,
+    expected_value0);
   double expected_value1 = v[3];
-  EXPECT_EQ(this
-              ->template reduction_test<T, double>(
-                col, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({1.0}, interp))
-              .first,
-            expected_value1);
+  EXPECT_EQ(
+    this
+      ->template reduction_test<T, double>(
+        col, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({1.0}, interp), output_type)
      .first,
+    expected_value1);

   // test with nulls
   cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(v, host_bools);
   double expected_null_value1                         = v[7];

-  EXPECT_EQ(this
-              ->template reduction_test<T, double>(
-                col_nulls, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({0}, interp))
-              .first,
-            expected_value0);
-  EXPECT_EQ(this
-              ->template reduction_test<T, double>(
-                col_nulls, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({1}, interp))
-              .first,
-            expected_null_value1);
+  EXPECT_EQ(
+    this
+      ->template reduction_test<T, double>(
+        col_nulls, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({0}, interp), output_type)
+      .first,
+    expected_value0);
+  EXPECT_EQ(
+    this
+      ->template reduction_test<T, double>(
+        col_nulls, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({1}, interp), output_type)
+      .first,
+    expected_null_value1);
 }

 TYPED_TEST(ReductionTest, UniqueCount)
@@ -2576,20 +2597,22 @@ TYPED_TEST(DictionaryReductionTest, Median)
   using T = TypeParam;
   std::vector<int> int_values({6, -14, 13, 64, 0, -13, -20, 45});
   std::vector<T> v = convert_values<T>(int_values);
+  auto output_type = cudf::data_type{cudf::type_to_id<T>()};

   // test without nulls
   cudf::test::dictionary_column_wrapper<T> col(v.begin(), v.end());
-  EXPECT_EQ(
-    this->template reduction_test<T, double>(col, *cudf::make_median_aggregation<cudf::reduce_aggregation>())
-      .first,
-    (std::is_signed_v<T>) ?
-    3.0 : 13.5);
+  EXPECT_EQ(this
+              ->template reduction_test<T, double>(
+                col, *cudf::make_median_aggregation<cudf::reduce_aggregation>(), output_type)
+              .first,
+            (std::is_signed_v<T>) ? 3.0 : 13.5);

   // test with nulls
   std::vector<bool> validity({true, true, true, false, true, true, true, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
   EXPECT_EQ(this
               ->template reduction_test<T, double>(
-                col_nulls, *cudf::make_median_aggregation<cudf::reduce_aggregation>())
+                col_nulls, *cudf::make_median_aggregation<cudf::reduce_aggregation>(), output_type)
               .first,
             (std::is_signed_v<T>) ? 0.0 : 13.0);
 }
@@ -2600,35 +2623,40 @@ TYPED_TEST(DictionaryReductionTest, Quantile)
   std::vector<int> int_values({6, -14, 13, 64, 0, -13, -20, 45});
   std::vector<T> v = convert_values<T>(int_values);
   cudf::interpolation interp{cudf::interpolation::LINEAR};
+  auto output_type = cudf::data_type{cudf::type_to_id<T>()};

   // test without nulls
   cudf::test::dictionary_column_wrapper<T> col(v.begin(), v.end());

   double expected_value = std::is_same_v<T, bool> || std::is_unsigned_v<T> ? 0.0 : -20.0;
-  EXPECT_EQ(this
-              ->template reduction_test<T, double>(
-                col, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({0.0}, interp))
-              .first,
-            expected_value);
-  EXPECT_EQ(this
-              ->template reduction_test<T, double>(
-                col, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({1.0}, interp))
-              .first,
-            64.0);
+  EXPECT_EQ(
+    this
+      ->template reduction_test<T, double>(
+        col, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({0.0}, interp), output_type)
+      .first,
+    expected_value);
+  EXPECT_EQ(
+    this
+      ->template reduction_test<T, double>(
+        col, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({1.0}, interp), output_type)
+      .first,
+    64.0);

   // test with nulls
   std::vector<bool> validity({true, true, true, false, true, true, true, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());

-  EXPECT_EQ(this
-              ->template reduction_test<T, double>(
-                col_nulls, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({0}, interp))
-              .first,
-            expected_value);
-  EXPECT_EQ(this
-              ->template reduction_test<T, double>(
-                col_nulls, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({1}, interp))
-              .first,
-            45.0);
+  EXPECT_EQ(
+    this
+      ->template reduction_test<T, double>(
+        col_nulls, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({0}, interp), output_type)
+      .first,
+    expected_value);
+  EXPECT_EQ(
+    this
+      ->template reduction_test<T, double>(
+        col_nulls, *cudf::make_quantile_aggregation<cudf::reduce_aggregation>({1}, interp), output_type)
+      .first,
+    45.0);
 }

 struct ListReductionTest : public cudf::test::BaseFixture {
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index f3dacb57508..9bcd4bd5c4b 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -8,7 +8,6 @@
 from cudf_polars.dsl import expr
 from cudf_polars.testing.asserts import (
-    DEFAULT_BLOCKSIZE_MODE,
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
@@ -72,17 +71,7 @@ def df(dtype, with_nulls, is_sorted):
 def test_agg(df, agg):
     expr = getattr(pl.col("a"), agg)()
     q = df.select(expr)
-
-    # https://github.com/rapidsai/cudf/issues/15852
-    check_dtypes = agg not in {"median"}
-    if (
-        not check_dtypes
-        and q.collect_schema()["a"] != pl.Float64
-        and DEFAULT_BLOCKSIZE_MODE == "default"
-    ):
-        with pytest.raises(AssertionError):
-            assert_gpu_result_equal(q)
-    assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False)
+    assert_gpu_result_equal(q, check_exact=False)


 def test_bool_agg(agg, request):
@@ -109,13 +98,7 @@ def test_cum_agg_reverse_unsupported(cum_agg):
 def test_quantile(df, q, interp):
     expr = pl.col("a").quantile(q, interp)
     q = df.select(expr)
-
-    # https://github.com/rapidsai/cudf/issues/15852
q.collect_schema()["a"] == pl.Float64 - if not check_dtypes: - with pytest.raises(AssertionError): - assert_gpu_result_equal(q) - assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) + assert_gpu_result_equal(q, check_exact=False) def test_quantile_invalid_q(df): From 2063844cedfdf542d4e7b5e6ce083cefb4bd5ac6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Aug 2025 10:47:25 -0700 Subject: [PATCH 127/366] Avoid cudf.pandas fallback for `pandas.array.NumpyExtensionArray` of strings (#19558) This case was falling back when _sometimes_ this case can be faithfully represented in cuDF. The exception to _sometimes_ is when the data has missing values. pandas strings, pre- and post- pandas 3.0, can have a varied amount of missing value sentinels (`None`, `np.nan`, `pd.NA`) depending if the pandas string type is `object` or `pd.StringDtype`. I don't anticipate it being practical to preserve which missing value sentinel(s) were specified in pandas while round-tripping though cuDF. Currently cuDF always `to_pandas` string data as `object` type with `None` as the missing value sentinel which isn't always correct (per the additions to `conftest-patch.py`). Discussed offline, it probably isn't worth falling back if strings with missing value could never be accelerated. Additionally, defines `DataFrame.__iter__` on the proxy object to short circuit to returning `pandas.DataFrame.__iter__` to avoid the fallback logic Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19558 --- python/cudf/cudf/core/column/column.py | 6 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 1 + .../cudf/pandas/scripts/conftest-patch.py | 93 +++++++++++++++++++ .../cudf/tests/series/test_constructors.py | 15 +++ 4 files changed, 113 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a2c21071bbe..c462278d2d4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -3046,8 +3046,10 @@ def as_column( if ( cudf.get_option("mode.pandas_compatible") and inferred_dtype == "mixed" - and not isinstance( - pyarrow_array.type, (pa.ListType, pa.StructType) + and not ( + pa.types.is_list(pyarrow_array.type) + or pa.types.is_struct(pyarrow_array.type) + or pa.types.is_string(pyarrow_array.type) ) ): raise MixedTypeError("Cannot create column with mixed types") diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 492334c9416..95f559f4b6f 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -298,6 +298,7 @@ def _DataFrame__dtypes(self): "_accessors": set(), "_ipython_canary_method_should_not_exist_": ignore_ipython_canary_check, "dtypes": property(_DataFrame__dtypes), + "__iter__": custom_iter, }, ) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index c9795a643a9..3869116496e 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -104,6 +104,9 @@ def pytest_unconfigure(config): "tests/apply/test_frame_apply.py::test_apply_empty_infer_type[python-1-True-mean-index]", "tests/apply/test_frame_apply.py::test_apply_function_runs_once", 
"tests/apply/test_frame_apply.py::test_apply_raw_function_runs_once[python]", + "tests/apply/test_frame_apply.py::test_mixed_column_raises[max-df0]", + "tests/apply/test_frame_apply.py::test_mixed_column_raises[min-df0]", + "tests/apply/test_frame_apply.py::test_mixed_column_raises[sum-df0]", "tests/apply/test_frame_apply.py::test_nuiscance_columns", "tests/apply/test_frame_apply.py::test_nunique_empty", "tests/apply/test_frame_transform.py::test_transform_listlike[axis='columns'-ops0-names0]", @@ -829,6 +832,22 @@ def pytest_unconfigure(config): "tests/arrays/integer/test_arithmetic.py::test_arithmetic_conversion[__rtruediv__-other1]", "tests/arrays/integer/test_arithmetic.py::test_arithmetic_conversion[__truediv__-1.0]", "tests/arrays/integer/test_arithmetic.py::test_arithmetic_conversion[__truediv__-other1]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int16Dtype-__mul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int16Dtype-__rmul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int32Dtype-__mul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int32Dtype-__rmul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int64Dtype-__mul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int64Dtype-__rmul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int8Dtype-__mul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int8Dtype-__rmul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt16Dtype-__mul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt16Dtype-__rmul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt32Dtype-__mul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt32Dtype-__rmul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt64Dtype-__mul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt64Dtype-__rmul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt8Dtype-__mul__]", + "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt8Dtype-__rmul__]", "tests/arrays/integer/test_arithmetic.py::test_values_multiplying_large_series_by_NA", "tests/arrays/integer/test_comparison.py::TestComparisonOps::test_ufunc_with_out[Int16Dtype]", "tests/arrays/integer/test_comparison.py::TestComparisonOps::test_ufunc_with_out[Int32Dtype]", @@ -5382,6 +5401,12 @@ def pytest_unconfigure(config): "tests/frame/methods/test_tz_localize.py::TestTZLocalize::test_tz_localize_copy_inplace_mutate[Series-True]", "tests/frame/methods/test_update.py::TestDataFrameUpdate::test_update_modify_view", "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_dropna_false[NoneType]", + "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_dropna_false[Decimal]", + "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_dropna_false[NaTType]", + "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_dropna_false[NAType]", + "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_subset[Decimal-columns1]", + "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_subset[NaTType-columns1]", + "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_subset[NAType-columns1]", 
"tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_empty", "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_empty_normalize", "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_subset[NoneType-columns1]", @@ -6021,6 +6046,8 @@ def pytest_unconfigure(config): "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_multiple_out_of_bounds[True]", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_nan_in_multiindex_columns[False]", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_nan_in_multiindex_columns[True]", + "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_nan_level[False]", + "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_nan_level[True]", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_nullable_dtype[False]", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_nullable_dtype[True]", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_order_with_unsorted_levels_multi_row_2[False]", @@ -6956,6 +6983,10 @@ def pytest_unconfigure(config): "tests/groupby/methods/test_nth.py::test_nth_after_selection[any-selection2]", "tests/groupby/methods/test_nth.py::test_nth_column_order", "tests/groupby/methods/test_nth.py::test_nth_indexed", + "tests/groupby/methods/test_nth.py::test_groupby_last_first_nth_with_none[Decimal-nth]", + "tests/groupby/methods/test_nth.py::test_groupby_last_first_nth_with_none[NaTType-nth]", + "tests/groupby/methods/test_nth.py::test_groupby_last_first_nth_with_none[NAType-nth]", + "tests/groupby/methods/test_nth.py::test_groupby_last_first_nth_with_none[NoneType-nth]", "tests/groupby/methods/test_nth.py::test_nth_multi_grouper", "tests/groupby/methods/test_nth.py::test_nth_multi_index_as_expected", "tests/groupby/methods/test_nth.py::test_nth_with_na_object[NoneType--1]", @@ -7126,6 +7157,12 @@ def pytest_unconfigure(config): "tests/groupby/methods/test_value_counts.py::test_compound[string=string[python]-True-True-expected_rows2-expected_count2-expected_group_size2-True]", "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NoneType-False-count-False-expected_data1-expected_index1]", "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NoneType-True-proportion-False-expected_data1-expected_index1]", + "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[Decimal-False-count-False-expected_data1-expected_index1]", + "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[Decimal-True-proportion-False-expected_data1-expected_index1]", + "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NaTType-False-count-False-expected_data1-expected_index1]", + "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NaTType-True-proportion-False-expected_data1-expected_index1]", + "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NAType-False-count-False-expected_data1-expected_index1]", + "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NAType-True-proportion-False-expected_data1-expected_index1]", "tests/groupby/methods/test_value_counts.py::test_dropna_combinations[True-False-expected_rows2-expected_values2]", "tests/groupby/methods/test_value_counts.py::test_value_counts_sort[False-False-False]", 
"tests/groupby/methods/test_value_counts.py::test_value_counts_sort[False-True-False]", @@ -7424,6 +7461,24 @@ def pytest_unconfigure(config): "tests/groupby/test_groupby_dropna.py::test_groupby_apply_with_dropna_for_multi_index[dropna_true_no_nan]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_agg[False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_one_group[NoneType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_one_group[Decimal-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_one_group[NaTType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_one_group[NAType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[Decimal-Decimal-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[Decimal-NaTType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[Decimal-NAType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[Decimal-NoneType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NaTType-Decimal-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NaTType-NaTType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NaTType-NAType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NaTType-NoneType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NAType-Decimal-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NAType-NaTType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NAType-NAType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NAType-NoneType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NoneType-Decimal-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NoneType-NaTType-False-tuples1-outputs1]", + "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NoneType-NAType-False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NoneType-NoneType-False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_grouper_dropna_propagation[False]", "tests/groupby/test_groupby_dropna.py::test_grouper_dropna_propagation[True]", @@ -8196,6 +8251,11 @@ def pytest_unconfigure(config): "tests/indexes/numeric/test_numeric.py::TestIntNumericIndex::test_constructor[int16]", 
"tests/indexes/numeric/test_numeric.py::TestIntNumericIndex::test_constructor[int32]", "tests/indexes/numeric/test_numeric.py::TestIntNumericIndex::test_constructor[int64]", + "tests/indexes/object/test_indexing.py::TestGetIndexerNonUnique::test_get_indexer_non_unique_nas[Decimal]", + "tests/indexes/object/test_indexing.py::TestGetIndexerNonUnique::test_get_indexer_non_unique_nas[NaTType]", + "tests/indexes/object/test_indexing.py::TestGetIndexerNonUnique::test_get_indexer_non_unique_nas[NAType]", + "tests/indexes/object/test_indexing.py::TestGetIndexer::test_get_indexer_with_NA_values[None-unique_nulls_fixture22]", + "tests/indexes/object/test_indexing.py::TestGetIndexer::test_get_indexer_with_NA_values[unique_nulls_fixture2-None]", "tests/indexes/object/test_indexing.py::TestSliceLocs::test_slice_locs_negative_step[in_slice13--object]", "tests/indexes/object/test_indexing.py::TestSliceLocs::test_slice_locs_negative_step[in_slice13--string[pyarrow_numpy]]", "tests/indexes/period/test_constructors.py::TestPeriodIndex::test_constructor_fromarraylike", @@ -8268,6 +8328,10 @@ def pytest_unconfigure(config): "tests/indexes/string/test_indexing.py::TestGetIndexer::test_get_indexer_strings_raises[string=str[python]]", "tests/indexes/string/test_indexing.py::TestGetIndexer::test_get_indexer_strings_raises[string=string[pyarrow]]", "tests/indexes/string/test_indexing.py::TestGetIndexer::test_get_indexer_strings_raises[string=string[python]]", + "tests/indexes/string/test_indexing.py::TestGetIndexerNonUnique::test_get_indexer_non_unique_nas[string=object-null3]", + "tests/indexes/string/test_indexing.py::TestGetLoc::test_get_loc_missing[string=object-Decimal]", + "tests/indexes/string/test_indexing.py::TestGetLoc::test_get_loc_missing[string=object-NaTType]", + "tests/indexes/string/test_indexing.py::TestGetLoc::test_get_loc_missing[string=object-NAType]", "tests/indexes/string/test_indexing.py::TestSliceLocs::test_slice_locs_negative_step[string=object-in_slice13-]", "tests/indexes/string/test_indexing.py::TestSliceLocs::test_slice_locs_negative_step[string=str[pyarrow]-in_slice13-]", "tests/indexes/string/test_indexing.py::TestSliceLocs::test_slice_locs_negative_step[string=str[python]-in_slice13-]", @@ -8319,6 +8383,9 @@ def pytest_unconfigure(config): "tests/indexes/test_base.py::TestIndex::test_empty_fancy_raises[uint32]", "tests/indexes/test_base.py::TestIndex::test_empty_fancy_raises[uint64]", "tests/indexes/test_base.py::TestIndex::test_equals_op_mismatched_multiindex_raises[index0]", + "tests/indexes/test_base.py::TestIndex::test_format_missing[Decimal-vals1]", + "tests/indexes/test_base.py::TestIndex::test_format_missing[NaTType-vals1]", + "tests/indexes/test_base.py::TestIndex::test_format_missing[NAType-vals1]", "tests/indexes/test_base.py::TestIndex::test_is_", "tests/indexes/test_base.py::TestIndex::test_isin_level_kwarg_bad_label_raises[bool-dtype-nan]", "tests/indexes/test_base.py::TestIndex::test_isin_level_kwarg_bad_label_raises[categorical-nan]", @@ -8375,6 +8442,12 @@ def pytest_unconfigure(config): "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_float64[NoneType-float32]", "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_float64[NoneType-float64]", "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_float64[NoneType-float]", + "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[Decimal-Decimal]", + "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[Decimal-NoneType]", + 
"tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[NaTType-NaTType]", + "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[NaTType-NoneType]", + "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[NAType-NAType]", + "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[NAType-NoneType]", "tests/indexes/test_base.py::TestIndex::test_map_defaultdict", "tests/indexes/test_base.py::TestIndex::test_str_attribute_raises[index2]", "tests/indexes/test_base.py::TestIndex::test_str_bool_return", @@ -10456,6 +10529,8 @@ def pytest_unconfigure(config): "tests/io/test_fsspec.py::test_json_options[zip]", "tests/io/test_fsspec.py::test_json_options[zstd]", "tests/io/test_fsspec.py::test_non_fsspec_options", + "tests/io/test_html.py::TestReadHtml::test_extract_links[bs4-header]", + "tests/io/test_html.py::TestReadHtml::test_extract_links[lxml-header]", "tests/io/test_orc.py::test_orc_reader_basic", "tests/io/test_orc.py::test_orc_reader_date_high", "tests/io/test_orc.py::test_orc_reader_date_low", @@ -11231,6 +11306,18 @@ def pytest_unconfigure(config): "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_join_multi_dtypes[float32-int320]", "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_join_multi_dtypes[float32-int321]", "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_bool_dtype[right-expected_data3]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[inner-str0]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[inner-str1]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[inner-U]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[left-str0]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[left-str1]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[left-U]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[outer-str0]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[outer-str1]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[outer-U]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[right-str0]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[right-str1]", + "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_ea_with_string[right-U]", "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_empty[False-right-empty]", "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_empty[True-right-right]", "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_incompat_dtypes_are_ok[df1_vals3-df2_vals3]", @@ -12408,6 +12495,9 @@ def pytest_unconfigure(config): "tests/series/indexing/test_setitem.py::TestSetitemTimedelta64IntoNumeric::test_mask_key[int-loc]", "tests/series/indexing/test_setitem.py::TestSetitemTimedelta64IntoNumeric::test_mask_key[int-setitem]", "tests/series/indexing/test_setitem.py::TestSetitemTimedelta64IntoNumeric::test_series_where[int]", + "tests/series/indexing/test_setitem.py::TestSetitemWithExpansion::test_setitem_enlargement_object_none[Decimal]", + "tests/series/indexing/test_setitem.py::TestSetitemWithExpansion::test_setitem_enlargement_object_none[NaTType]", + "tests/series/indexing/test_setitem.py::TestSetitemWithExpansion::test_setitem_enlargement_object_none[NAType]", 
"tests/series/indexing/test_setitem.py::TestSmallIntegerSetitemUpcast::test_int_key[iloc-4611686018427387904]", "tests/series/indexing/test_setitem.py::TestSmallIntegerSetitemUpcast::test_int_key[iloc-8589934593.0]", "tests/series/indexing/test_setitem.py::TestSmallIntegerSetitemUpcast::test_int_key[iloc-8589934593.1]", @@ -12466,6 +12556,8 @@ def pytest_unconfigure(config): "tests/series/methods/test_astype.py::TestAstype::test_astype_str_cast_td64", "tests/series/methods/test_astype.py::TestAstype::test_dt64_series_astype_object", "tests/series/methods/test_astype.py::TestAstype::test_td64_series_astype_object", + "tests/series/methods/test_astype.py::TestAstype::test_astype_to_str_preserves_na[None-None]", + "tests/series/methods/test_astype.py::TestAstype::test_astype_to_str_preserves_na[value2-]", "tests/series/methods/test_astype.py::TestAstypeCategorical::test_astype_categorical_to_categorical[False-True-None]", "tests/series/methods/test_astype.py::TestAstypeCategorical::test_astype_categorical_to_categorical[False-True-foo]", "tests/series/methods/test_astype.py::TestAstypeCategorical::test_astype_categorical_to_categorical[True-False-None]", @@ -13017,6 +13109,7 @@ def pytest_unconfigure(config): "tests/series/methods/test_reindex.py::test_reindexing_with_float64_NA_log", "tests/series/methods/test_rename.py::TestRename::test_rename_copy_false", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_categorical_single", + "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_change_dtype_series", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_datetime64", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_dtype[Float64-input_data4-to_replace4-expected_data4]", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_dtype[Int64-input_data2-to_replace2-expected_data2]", diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index e7bdf6d415f..df83ab51043 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -11,6 +11,7 @@ import cudf from cudf.core._compat import ( PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_210, PANDAS_VERSION, ) from cudf.core.column.column import as_column @@ -112,6 +113,20 @@ def test_series_unitness_np_datetimelike_units(): pd.Series(data) +def test_from_numpyextensionarray_string_object_pandas_compat_mode(): + NumpyExtensionArray = ( + pd.arrays.NumpyExtensionArray + if PANDAS_GE_210 + else pd.arrays.PandasArray + ) + + data = NumpyExtensionArray(np.array(["a", None], dtype=object)) + with cudf.option_context("mode.pandas_compatible", True): + result = cudf.Series(data) + expected = pd.Series(data) + assert_eq(result, expected) + + def test_list_category_like_maintains_dtype(): dtype = cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) data = [1, 2, 3] From 641e0910851c4799c537b505fc56fa766264aca6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:01:54 -0700 Subject: [PATCH 128/366] Move test_reshape.py to new cudf classic directory strucutre, remove reshape._merge_sorted (#19614) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Also removed `reshape._merge_sorted` and its associated tests which were very slow. 
It appears this was once used by `dask_cudf`, xref https://github.com/rapidsai/cudf/pull/10713, but I cannot find usage of this anymore (possibly removed with migration to dask expressions?) Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19614 --- python/cudf/cudf/core/reshape.py | 111 --- .../methods/test_interleave_columns.py | 48 + .../cudf/cudf/tests/reshape/test_crosstab.py | 61 ++ python/cudf/cudf/tests/reshape/test_melt.py | 124 +++ python/cudf/cudf/tests/reshape/test_pivot.py | 113 +++ .../cudf/tests/reshape/test_pivot_table.py | 95 ++ python/cudf/cudf/tests/reshape/test_stack.py | 182 ++++ python/cudf/cudf/tests/reshape/test_tile.py | 42 + .../cudf/cudf/tests/reshape/test_unstack.py | 121 +++ python/cudf/cudf/tests/test_reshape.py | 888 ------------------ 10 files changed, 786 insertions(+), 999 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_interleave_columns.py create mode 100644 python/cudf/cudf/tests/reshape/test_crosstab.py create mode 100644 python/cudf/cudf/tests/reshape/test_melt.py create mode 100644 python/cudf/cudf/tests/reshape/test_pivot.py create mode 100644 python/cudf/cudf/tests/reshape/test_pivot_table.py create mode 100644 python/cudf/cudf/tests/reshape/test_stack.py create mode 100644 python/cudf/cudf/tests/reshape/test_tile.py create mode 100644 python/cudf/cudf/tests/reshape/test_unstack.py delete mode 100644 python/cudf/cudf/tests/test_reshape.py diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 74337fd5284..e9ee79f35aa 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -8,8 +8,6 @@ import numpy as np import pandas as pd -import pylibcudf as plc - import cudf from cudf.api.extensions import no_default from cudf.api.types import is_list_like, is_scalar @@ -878,115 +876,6 @@ def get_dummies( return cudf.DataFrame._from_data(data, index=ser.index) -def _merge_sorted( - objs, - keys=None, - by_index=False, - ignore_index=False, - ascending=True, - na_position="last", -): - """Merge a list of sorted DataFrame or Series objects. - - Dataframes/Series in objs list MUST be pre-sorted by columns - listed in `keys`, or by the index (if `by_index=True`). - - Parameters - ---------- - objs : list of DataFrame or Series - keys : list, default None - List of Column names to sort by. If None, all columns used - (Ignored if `by_index=True`) - by_index : bool, default False - Use index for sorting. `keys` input will be ignored if True - ignore_index : bool, default False - Drop and ignore index during merge. Default range index will - be used in the output dataframe. - ascending : bool, default True - Sorting is in ascending order, otherwise it is descending - na_position : {'first', 'last'}, default 'last' - 'first' nulls at the beginning, 'last' nulls at the end - - Returns - ------- - A new, lexicographically sorted, DataFrame/Series. 
- """ - if is_scalar(objs): - raise TypeError("objs must be a list-like of Frame-like objects") - - if len(objs) < 1: - raise ValueError("objs must be non-empty") - - if not all( - isinstance(table, (cudf.DataFrame, cudf.Series)) for table in objs - ): - raise TypeError("Elements of objs must be Frame-like") - - if len(objs) == 1: - return objs[0] - - if by_index and ignore_index: - raise ValueError("`by_index` and `ignore_index` cannot both be True") - - if by_index: - key_columns_indices = list(range(0, objs[0].index.nlevels)) - else: - if keys is None: - key_columns_indices = list(range(0, objs[0]._num_columns)) - else: - key_columns_indices = [ - objs[0]._column_names.index(key) for key in keys - ] - if not ignore_index: - key_columns_indices = [ - idx + objs[0].index.nlevels for idx in key_columns_indices - ] - - columns = ( - itertools.chain(obj.index._columns, obj._columns) - if not ignore_index - else obj._columns - for obj in objs - ) - - input_tables = [ - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) - for source_columns in columns - ] - - num_keys = len(key_columns_indices) - - column_order = ( - plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING - ) - - if not ascending: - na_position = "last" if na_position == "first" else "first" - - null_precedence = ( - plc.types.NullOrder.BEFORE - if na_position == "first" - else plc.types.NullOrder.AFTER - ) - - plc_table = plc.merge.merge( - input_tables, - key_columns_indices, - [column_order] * num_keys, - [null_precedence] * num_keys, - ) - - result_columns = [ - ColumnBase.from_pylibcudf(col) for col in plc_table.columns() - ] - - return objs[0]._from_columns_like_self( - result_columns, - column_names=objs[0]._column_names, - index_names=None if ignore_index else objs[0]._index_names, - ) - - def _pivot( col_accessor: ColumnAccessor, index: Index | MultiIndex, diff --git a/python/cudf/cudf/tests/dataframe/methods/test_interleave_columns.py b/python/cudf/cudf/tests/dataframe/methods/test_interleave_columns.py new file mode 100644 index 00000000000..855619f31f4 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_interleave_columns.py @@ -0,0 +1,48 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("nulls", ["none", "some"]) +def test_interleave_columns(nulls, all_supported_types_as_str): + if ( + all_supported_types_as_str not in ["float32", "float64"] + and nulls == "some" + ): + pytest.skip( + reason=f"nulls not supported in {all_supported_types_as_str}" + ) + + num_rows = 10 + num_cols = 2 + pdf = pd.DataFrame(dtype=all_supported_types_as_str) + rng = np.random.default_rng(seed=0) + for i in range(num_cols): + colname = str(i) + data = pd.Series(rng.integers(0, 26, num_rows)).astype( + all_supported_types_as_str + ) + + if nulls == "some": + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) + data[idx] = np.nan + pdf[colname] = data + + gdf = cudf.from_pandas(pdf) + + if all_supported_types_as_str == "category": + with pytest.raises(ValueError): + assert gdf.interleave_columns() + else: + got = gdf.interleave_columns() + + expect = pd.Series(np.vstack(pdf.to_numpy()).reshape((-1,))).astype( + all_supported_types_as_str + ) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/reshape/test_crosstab.py b/python/cudf/cudf/tests/reshape/test_crosstab.py new file mode 100644 index 00000000000..fac485d9028 --- /dev/null +++ b/python/cudf/cudf/tests/reshape/test_crosstab.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_crosstab_simple(): + a = np.array( + [ + "foo", + "foo", + "foo", + "foo", + "bar", + "bar", + "bar", + "bar", + "foo", + "foo", + "foo", + ], + dtype=object, + ) + b = np.array( + [ + "one", + "one", + "one", + "two", + "one", + "one", + "one", + "two", + "two", + "two", + "one", + ], + dtype=object, + ) + c = np.array( + [ + "dull", + "dull", + "shiny", + "dull", + "dull", + "shiny", + "shiny", + "dull", + "shiny", + "shiny", + "shiny", + ], + dtype=object, + ) + expected = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) + actual = cudf.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) + assert_eq(expected, actual, check_dtype=False) diff --git a/python/cudf/cudf/tests/reshape/test_melt.py b/python/cudf/cudf/tests/reshape/test_melt.py new file mode 100644 index 00000000000..3583c12b544 --- /dev/null +++ b/python/cudf/cudf/tests/reshape/test_melt.py @@ -0,0 +1,124 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("num_id_vars", [0, 2]) +@pytest.mark.parametrize("num_value_vars", [0, 2]) +@pytest.mark.parametrize("nulls", ["none", "some", "all"]) +def test_melt( + nulls, + num_id_vars, + num_value_vars, + numeric_and_temporal_types_as_str, + ignore_index, +): + if numeric_and_temporal_types_as_str not in [ + "float32", + "float64", + ] and nulls in ["some", "all"]: + pytest.skip( + reason=f"nulls not supported in {numeric_and_temporal_types_as_str}" + ) + + num_rows = 10 + pdf = pd.DataFrame() + id_vars = [] + rng = np.random.default_rng(seed=0) + for i in range(num_id_vars): + colname = "id" + str(i) + data = rng.integers(0, 26, num_rows).astype( + numeric_and_temporal_types_as_str + ) + if nulls == "some": + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) + data[idx] = np.nan + elif nulls == "all": + data[:] = np.nan + pdf[colname] = data + id_vars.append(colname) + + value_vars = [] + for i in range(num_value_vars): + colname = "val" + str(i) + data = rng.integers(0, 26, num_rows).astype( + numeric_and_temporal_types_as_str + ) + if nulls == "some": + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) + data[idx] = np.nan + elif nulls == "all": + data[:] = np.nan + pdf[colname] = data + value_vars.append(colname) + + gdf = cudf.from_pandas(pdf) + + got = cudf.melt( + frame=gdf, + id_vars=id_vars, + value_vars=value_vars, + ignore_index=ignore_index, + ) + got_from_melt_method = gdf.melt( + id_vars=id_vars, value_vars=value_vars, ignore_index=ignore_index + ) + + expect = pd.melt( + frame=pdf, + id_vars=id_vars, + value_vars=value_vars, + ignore_index=ignore_index, + ) + + assert_eq(expect, got) + + assert_eq(expect, got_from_melt_method) + + +def test_melt_more_than_255_columns(): + mydict = {"id": ["foobar"]} + for i in range(1, 260): + mydict[f"d_{i}"] = i + + df = pd.DataFrame(mydict) + grid_df = pd.melt(df, id_vars=["id"], var_name="d", value_name="sales") + + df_d = cudf.DataFrame(mydict) + grid_df_d = cudf.melt( + df_d, id_vars=["id"], var_name="d", value_name="sales" + ) + grid_df_d["d"] = grid_df_d["d"] + + assert_eq(grid_df, grid_df_d) + + +def test_melt_str_scalar_id_var(): + data = {"index": [1, 2], "id": [1, 2], "d0": [10, 20], "d1": [30, 40]} + result = cudf.melt( + cudf.DataFrame(data), + id_vars="index", + var_name="column", + value_name="value", + ) + expected = pd.melt( + pd.DataFrame(data), + id_vars="index", + var_name="column", + value_name="value", + ) + assert_eq(result, expected) + + +def test_melt_falsy_var_name(): + df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) + result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") + expected = pd.melt( + df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" + ) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/reshape/test_pivot.py b/python/cudf/cudf/tests/reshape/test_pivot.py new file mode 100644 index 00000000000..73398d52bad --- /dev/null +++ b/python/cudf/cudf/tests/reshape/test_pivot.py @@ -0,0 +1,113 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "index, column, data", + [ + ([], [], []), + ([0], [0], [0]), + ([0, 0], [0, 1], [1, 2.0]), + ([0, 1], [0, 0], [1, 2.0]), + ([0, 1], [0, 1], [1, 2.0]), + (["a", "a", "b", "b"], ["c", "d", "c", "d"], [1, 2, 3, 4]), + ( + ["a", "a", "b", "b", "a"], + ["c", "d", "c", "d", "e"], + [1, 2, 3, 4, 5], + ), + ], +) +def test_pivot_simple(index, column, data): + pdf = pd.DataFrame({"index": index, "column": column, "data": data}) + gdf = cudf.from_pandas(pdf) + + expect = pdf.pivot(columns="column", index="index") + got = gdf.pivot(columns="column", index="index") + + check_index_and_columns = expect.shape != (0, 0) + assert_eq( + expect, + got, + check_dtype=False, + check_index_type=check_index_and_columns, + check_column_type=check_index_and_columns, + ) + + +def test_pivot_multi_values(): + # from Pandas docs: + # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html + pdf = pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": ["A", "B", "C", "A", "B", "C"], + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) + gdf = cudf.from_pandas(pdf) + assert_eq( + pdf.pivot(index="foo", columns="bar", values=["baz", "zoo"]), + gdf.pivot(index="foo", columns="bar", values=["baz", "zoo"]), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "values", ["z", "z123", ["z123"], ["z", "z123", "123z"]] +) +def test_pivot_values(values): + data = [ + ["A", "a", 0, 0, 0], + ["A", "b", 1, 1, 1], + ["A", "c", 2, 2, 2], + ["B", "a", 0, 0, 0], + ["B", "b", 1, 1, 1], + ["B", "c", 2, 2, 2], + ["C", "a", 0, 0, 0], + ["C", "b", 1, 1, 1], + ["C", "c", 2, 2, 2], + ] + columns = ["x", "y", "z", "z123", "123z"] + pdf = pd.DataFrame(data, columns=columns) + cdf = cudf.DataFrame(data, columns=columns) + expected = pd.pivot(pdf, index="x", columns="y", values=values) + actual = cudf.pivot(cdf, index="x", columns="y", values=values) + assert_eq( + expected, + actual, + check_dtype=False, + ) + + +def test_pivot_duplicate_error(): + gdf = cudf.DataFrame( + {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]} + ) + with pytest.raises(ValueError): + gdf.pivot(index="a", columns="b") + with pytest.raises(ValueError): + gdf.pivot(index="b", columns="a") + + +@pytest.mark.parametrize("index", [["ix"], ["ix", "foo"]]) +@pytest.mark.parametrize("columns", [["col"], ["col", "baz"]]) +def test_pivot_list_like_index_columns(index, columns): + data = { + "bar": ["x", "y", "z", "w"], + "col": ["a", "b", "a", "b"], + "foo": [1, 2, 3, 4], + "ix": [1, 1, 2, 2], + "baz": [0, 0, 0, 0], + } + pd_df = pd.DataFrame(data) + cudf_df = cudf.DataFrame(data) + result = cudf_df.pivot(columns=columns, index=index) + expected = pd_df.pivot(columns=columns, index=index) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/reshape/test_pivot_table.py b/python/cudf/cudf/tests/reshape/test_pivot_table.py new file mode 100644 index 00000000000..c7d0eb47411 --- /dev/null +++ b/python/cudf/cudf/tests/reshape/test_pivot_table.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +def test_pivot_table_simple(aggfunc): + rng = np.random.default_rng(seed=0) + fill_value = 0 + pdf = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), + } + ) + expected = pd.pivot_table( + pdf, + values=["D", "E"], + index=["A", "B"], + columns=["C"], + aggfunc=aggfunc, + fill_value=fill_value, + ) + cdf = cudf.DataFrame.from_pandas(pdf) + actual = cudf.pivot_table( + cdf, + values=["D", "E"], + index=["A", "B"], + columns=["C"], + aggfunc=aggfunc, + fill_value=fill_value, + ) + assert_eq(expected, actual, check_dtype=False) + + +@pytest.mark.parametrize( + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +def test_dataframe_pivot_table_simple(aggfunc): + rng = np.random.default_rng(seed=0) + fill_value = 0 + pdf = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), + } + ) + expected = pdf.pivot_table( + values=["D", "E"], + index=["A", "B"], + columns=["C"], + aggfunc=aggfunc, + fill_value=fill_value, + ) + cdf = cudf.DataFrame.from_pandas(pdf) + actual = cdf.pivot_table( + values=["D", "E"], + index=["A", "B"], + columns=["C"], + aggfunc=aggfunc, + fill_value=fill_value, + ) + assert_eq(expected, actual, check_dtype=False) + + +@pytest.mark.parametrize("index", ["A", ["A"]]) +@pytest.mark.parametrize("columns", ["C", ["C"]]) +def test_pivot_table_scalar_index_columns(index, columns): + data = { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": range(24), + "E": range(24), + } + result = cudf.DataFrame(data).pivot_table( + values="D", index=index, columns=columns, aggfunc="sum" + ) + expected = pd.DataFrame(data).pivot_table( + values="D", index=index, columns=columns, aggfunc="sum" + ) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/reshape/test_stack.py b/python/cudf/cudf/tests/reshape/test_stack.py new file mode 100644 index 00000000000..64b976f6be0 --- /dev/null +++ b/python/cudf/cudf/tests/reshape/test_stack.py @@ -0,0 +1,182 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import ( + expect_warning_if, +) + + +@pytest.mark.parametrize("nulls", ["none", "some"]) +def test_df_stack(nulls, all_supported_types_as_str): + if ( + all_supported_types_as_str not in ["float32", "float64"] + and nulls == "some" + ): + pytest.skip( + reason=f"nulls not supported in {all_supported_types_as_str}" + ) + elif all_supported_types_as_str == "category": + pytest.skip(reason="category not applicable for test") + + num_cols = 2 + num_rows = 10 + pdf = pd.DataFrame() + rng = np.random.default_rng(seed=0) + for i in range(num_cols): + colname = str(i) + data = rng.integers(0, 26, num_rows).astype(all_supported_types_as_str) + if nulls == "some": + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) + data[idx] = np.nan + pdf[colname] = data + + gdf = cudf.from_pandas(pdf) + + got = gdf.stack() + expect = pdf.stack() + + assert_eq(expect, got) + + +def test_df_stack_reset_index(): + df = cudf.DataFrame( + { + "a": [1, 2, 3, 4], + "b": [10, 11, 12, 13], + "c": ["ab", "cd", None, "gh"], + } + ) + df = df.set_index(["a", "b"]) + pdf = df.to_pandas() + + expected = pdf.stack() + actual = df.stack() + + assert_eq(expected, actual) + + expected = expected.reset_index() + actual = actual.reset_index() + + assert_eq(expected, actual) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Need pandas-2.1.0+ to match `stack` api", +) +@pytest.mark.parametrize( + "tuples", + [ + [("A", "cat"), ("A", "dog"), ("B", "cat"), ("B", "dog")], + [("A", "cat"), ("B", "bird"), ("A", "dog"), ("B", "dog")], + ], +) +@pytest.mark.parametrize( + "level", + [ + -1, + 0, + 1, + "letter", + "animal", + [0, 1], + [1, 0], + ["letter", "animal"], + ["animal", "letter"], + ], +) +@pytest.mark.parametrize( + "index", + [ + pd.RangeIndex(2, name="range"), + pd.Index([9, 8], name="myindex"), + pd.MultiIndex.from_arrays( + [ + ["A", "B"], + [101, 102], + ], + names=["first", "second"], + ), + ], +) +def test_df_stack_multiindex_column_axis(tuples, index, level, dropna): + if isinstance(level, list) and len(level) > 1 and not dropna: + pytest.skip( + "Stacking multiple levels with dropna==False is unsupported." 
+ ) + columns = pd.MultiIndex.from_tuples(tuples, names=["letter", "animal"]) + + pdf = pd.DataFrame( + data=[[1, 2, 3, 4], [2, 4, 6, 8]], columns=columns, index=index + ) + gdf = cudf.from_pandas(pdf) + + with pytest.warns(FutureWarning): + got = gdf.stack(level=level, dropna=dropna, future_stack=False) + with expect_warning_if(PANDAS_GE_220, FutureWarning): + expect = pdf.stack(level=level, dropna=dropna, future_stack=False) + + assert_eq(expect, got, check_dtype=False) + + got = gdf.stack(level=level, future_stack=True) + expect = pdf.stack(level=level, future_stack=True) + + assert_eq(expect, got, check_dtype=False) + + +def test_df_stack_mixed_dtypes(): + pdf = pd.DataFrame( + { + "A": pd.Series([1, 2, 3], dtype="f4"), + "B": pd.Series([4, 5, 6], dtype="f8"), + } + ) + + gdf = cudf.from_pandas(pdf) + + got = gdf.stack() + expect = pdf.stack() + + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Need pandas-2.1.0+ to match `stack` api", +) +@pytest.mark.parametrize("level", [["animal", "hair_length"], [1, 2]]) +def test_df_stack_multiindex_column_axis_pd_example(level): + columns = pd.MultiIndex.from_tuples( + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], + ) + rng = np.random.default_rng(seed=0) + df = pd.DataFrame(rng.standard_normal(size=(4, 4)), columns=columns) + + with expect_warning_if(PANDAS_GE_220, FutureWarning): + expect = df.stack(level=level, future_stack=False) + gdf = cudf.from_pandas(df) + with pytest.warns(FutureWarning): + got = gdf.stack(level=level, future_stack=False) + + assert_eq(expect, got) + + expect = df.stack(level=level, future_stack=True) + got = gdf.stack(level=level, future_stack=True) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/reshape/test_tile.py b/python/cudf/cudf/tests/reshape/test_tile.py new file mode 100644 index 00000000000..2f071829d13 --- /dev/null +++ b/python/cudf/cudf/tests/reshape/test_tile.py @@ -0,0 +1,42 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("count", [1, 10]) +@pytest.mark.parametrize("nulls", ["none", "some"]) +def test_tile(nulls, all_supported_types_as_str, count): + if ( + all_supported_types_as_str not in ["float32", "float64"] + and nulls == "some" + ): + pytest.skip( + reason=f"nulls not supported in {all_supported_types_as_str}" + ) + + num_cols = 2 + num_rows = 10 + pdf = pd.DataFrame(dtype=all_supported_types_as_str) + rng = np.random.default_rng(seed=0) + for i in range(num_cols): + colname = str(i) + data = pd.Series(rng.integers(num_cols, 26, num_rows)).astype( + all_supported_types_as_str + ) + + if nulls == "some": + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) + data[idx] = np.nan + pdf[colname] = data + + gdf = cudf.from_pandas(pdf) + + got = gdf.tile(count) + expect = pd.DataFrame(pd.concat([pdf] * count)) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/reshape/test_unstack.py b/python/cudf/cudf/tests/reshape/test_unstack.py new file mode 100644 index 00000000000..6cb38537809 --- /dev/null +++ b/python/cudf/cudf/tests/reshape/test_unstack.py @@ -0,0 +1,121 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
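[Reviewer annotation, not part of the patch: unstack() is the inverse of
stack(); it moves an index level (the innermost by default) into the
columns, which is what the relocated tests below exercise for plain,
categorical, and MultiIndex inputs. A minimal sketch, with names invented
for illustration:

    import cudf

    df = cudf.DataFrame(
        {"k1": ["a", "a", "b"], "k2": ["x", "y", "x"], "v": [1, 2, 3]}
    ).set_index(["k1", "k2"])
    wide = df.unstack()
    # columns become the MultiIndex ("v", "x"), ("v", "y"); the missing
    # ("b", "y") combination is filled with <NA>
]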
+ +import re + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "level", + [ + 0, + pytest.param( + 1, + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), + 2, + "foo", + pytest.param( + "bar", + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), + "baz", + [], + pytest.param( + [0, 1], + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), + ["foo"], + pytest.param( + ["foo", "bar"], + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), + pytest.param( + [0, 1, 2], + marks=pytest.mark.xfail(reason="Pandas behaviour unclear"), + ), + pytest.param( + ["foo", "bar", "baz"], + marks=pytest.mark.xfail(reason="Pandas behaviour unclear"), + ), + ], +) +def test_unstack_multiindex(level): + pdf = pd.DataFrame( + { + "foo": ["one", "one", "one", "two", "two", "two"], + "bar": pd.Categorical(["A", "B", "C", "A", "B", "C"]), + "baz": [1, 2, 3, 4, 5, 6], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ).set_index(["foo", "bar", "baz"]) + gdf = cudf.from_pandas(pdf) + assert_eq( + pdf.unstack(level=level), + gdf.unstack(level=level), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "index", + [ + pd.Index(range(0, 5), name=None), + pd.Index(range(0, 5), name="row_index"), + pytest.param( + pd.CategoricalIndex(["d", "e", "f", "g", "h"]), + marks=pytest.mark.xfail( + reason="Categorical column indexes not supported" + ), + ), + ], +) +@pytest.mark.parametrize( + "col_idx", + [ + pd.Index(["a", "b"], name=None), + pd.Index(["a", "b"], name="col_index"), + pd.MultiIndex.from_tuples( + [("c", 1), ("c", 2)], names=["col_index1", None] + ), + ], +) +def test_unstack_index(index, col_idx): + data = { + "A": [1.0, 2.0, 3.0, 4.0, 5.0], + "B": [11.0, 12.0, 13.0, 14.0, 15.0], + } + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + pdf.index = index + pdf.columns = col_idx + + gdf.index = cudf.from_pandas(index) + gdf.columns = cudf.from_pandas(col_idx) + + assert_eq(pdf.unstack(), gdf.unstack()) + + +def test_unstack_index_invalid(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + with pytest.raises( + ValueError, + match=re.escape( + "Calling unstack() on single index dataframe with " + "different column datatype is not supported." + ), + ): + gdf.unstack() diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py deleted file mode 100644 index 84cf6136255..00000000000 --- a/python/cudf/cudf/tests/test_reshape.py +++ /dev/null @@ -1,888 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. - -import re -from itertools import chain - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.core.buffer.spill_manager import get_global_manager -from cudf.testing import assert_eq -from cudf.testing._utils import ( - ALL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - expect_warning_if, -) - -pytest_xfail = pytest.mark.xfail -pytestmark = pytest.mark.spilling - -# If spilling is enabled globally, we skip many test permutations -# to reduce running time. 
-if get_global_manager() is not None: - ALL_TYPES = ["float32"] - DATETIME_TYPES = ["datetime64[ms]"] - NUMERIC_TYPES = ["float32"] - # To save time, we skip tests marked "pytest.mark.xfail" - pytest_xfail = pytest.mark.skipif - - -@pytest.mark.parametrize("num_id_vars", [0, 1, 2]) -@pytest.mark.parametrize("num_value_vars", [0, 1, 2]) -@pytest.mark.parametrize("num_rows", [1, 2, 100]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) -@pytest.mark.parametrize("nulls", ["none", "some", "all"]) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_melt( - nulls, num_id_vars, num_value_vars, num_rows, dtype, ignore_index -): - if dtype not in ["float32", "float64"] and nulls in ["some", "all"]: - pytest.skip(reason="nulls not supported in dtype: " + dtype) - - pdf = pd.DataFrame() - id_vars = [] - rng = np.random.default_rng(seed=0) - for i in range(num_id_vars): - colname = "id" + str(i) - data = rng.integers(0, 26, num_rows).astype(dtype) - if nulls == "some": - idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) - data[idx] = np.nan - elif nulls == "all": - data[:] = np.nan - pdf[colname] = data - id_vars.append(colname) - - value_vars = [] - for i in range(num_value_vars): - colname = "val" + str(i) - data = rng.integers(0, 26, num_rows).astype(dtype) - if nulls == "some": - idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) - data[idx] = np.nan - elif nulls == "all": - data[:] = np.nan - pdf[colname] = data - value_vars.append(colname) - - gdf = cudf.from_pandas(pdf) - - got = cudf.melt( - frame=gdf, - id_vars=id_vars, - value_vars=value_vars, - ignore_index=ignore_index, - ) - got_from_melt_method = gdf.melt( - id_vars=id_vars, value_vars=value_vars, ignore_index=ignore_index - ) - - expect = pd.melt( - frame=pdf, - id_vars=id_vars, - value_vars=value_vars, - ignore_index=ignore_index, - ) - - assert_eq(expect, got) - - assert_eq(expect, got_from_melt_method) - - -def test_melt_more_than_255_columns(): - mydict = {"id": ["foobar"]} - for i in range(1, 260): - mydict[f"d_{i}"] = i - - df = pd.DataFrame(mydict) - grid_df = pd.melt(df, id_vars=["id"], var_name="d", value_name="sales") - - df_d = cudf.DataFrame(mydict) - grid_df_d = cudf.melt( - df_d, id_vars=["id"], var_name="d", value_name="sales" - ) - grid_df_d["d"] = grid_df_d["d"] - - assert_eq(grid_df, grid_df_d) - - -def test_melt_str_scalar_id_var(): - data = {"index": [1, 2], "id": [1, 2], "d0": [10, 20], "d1": [30, 40]} - result = cudf.melt( - cudf.DataFrame(data), - id_vars="index", - var_name="column", - value_name="value", - ) - expected = pd.melt( - pd.DataFrame(data), - id_vars="index", - var_name="column", - value_name="value", - ) - assert_eq(result, expected) - - -def test_melt_falsy_var_name(): - df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) - result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") - expected = pd.melt( - df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" - ) - assert_eq(result, expected) - - -@pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 1000]) -@pytest.mark.parametrize( - "dtype", list(chain(NUMERIC_TYPES, DATETIME_TYPES, ["str"])) -) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_df_stack(nulls, num_cols, num_rows, dtype): - if dtype not in ["float32", "float64"] and nulls in ["some"]: - pytest.skip(reason="nulls not supported in dtype: " + dtype) - - pdf = pd.DataFrame() - rng = np.random.default_rng(seed=0) - for i in 
range(num_cols): - colname = str(i) - data = rng.integers(0, 26, num_rows).astype(dtype) - if nulls == "some": - idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) - data[idx] = np.nan - pdf[colname] = data - - gdf = cudf.from_pandas(pdf) - - got = gdf.stack() - expect = pdf.stack() - - assert_eq(expect, got) - - -def test_df_stack_reset_index(): - df = cudf.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [10, 11, 12, 13], - "c": ["ab", "cd", None, "gh"], - } - ) - df = df.set_index(["a", "b"]) - pdf = df.to_pandas() - - expected = pdf.stack() - actual = df.stack() - - assert_eq(expected, actual) - - expected = expected.reset_index() - actual = actual.reset_index() - - assert_eq(expected, actual) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Need pandas-2.1.0+ to match `stack` api", -) -@pytest.mark.parametrize( - "columns", - [ - pd.MultiIndex.from_tuples( - [("A", "cat"), ("A", "dog"), ("B", "cat"), ("B", "dog")], - names=["letter", "animal"], - ), - pd.MultiIndex.from_tuples( - [("A", "cat"), ("B", "bird"), ("A", "dog"), ("B", "dog")], - names=["letter", "animal"], - ), - ], -) -@pytest.mark.parametrize( - "level", - [ - -1, - 0, - 1, - "letter", - "animal", - [0, 1], - [1, 0], - ["letter", "animal"], - ["animal", "letter"], - ], -) -@pytest.mark.parametrize( - "index", - [ - pd.RangeIndex(2, name="range"), - pd.Index([9, 8], name="myindex"), - pd.MultiIndex.from_arrays( - [ - ["A", "B"], - [101, 102], - ], - names=["first", "second"], - ), - ], -) -@pytest.mark.parametrize("dropna", [True, False]) -def test_df_stack_multiindex_column_axis(columns, index, level, dropna): - if isinstance(level, list) and len(level) > 1 and not dropna: - pytest.skip( - "Stacking multiple levels with dropna==False is unsupported." 
- ) - - pdf = pd.DataFrame( - data=[[1, 2, 3, 4], [2, 4, 6, 8]], columns=columns, index=index - ) - gdf = cudf.from_pandas(pdf) - - with pytest.warns(FutureWarning): - got = gdf.stack(level=level, dropna=dropna, future_stack=False) - with expect_warning_if(PANDAS_GE_220, FutureWarning): - expect = pdf.stack(level=level, dropna=dropna, future_stack=False) - - assert_eq(expect, got, check_dtype=False) - - got = gdf.stack(level=level, future_stack=True) - expect = pdf.stack(level=level, future_stack=True) - - assert_eq(expect, got, check_dtype=False) - - -def test_df_stack_mixed_dtypes(): - pdf = pd.DataFrame( - { - "A": pd.Series([1, 2, 3], dtype="f4"), - "B": pd.Series([4, 5, 6], dtype="f8"), - } - ) - - gdf = cudf.from_pandas(pdf) - - got = gdf.stack() - expect = pdf.stack() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Need pandas-2.1.0+ to match `stack` api", -) -@pytest.mark.parametrize("level", [["animal", "hair_length"], [1, 2]]) -def test_df_stack_multiindex_column_axis_pd_example(level): - columns = pd.MultiIndex.from_tuples( - [ - ("A", "cat", "long"), - ("B", "cat", "long"), - ("A", "dog", "short"), - ("B", "dog", "short"), - ], - names=["exp", "animal", "hair_length"], - ) - rng = np.random.default_rng(seed=0) - df = pd.DataFrame(rng.standard_normal(size=(4, 4)), columns=columns) - - with expect_warning_if(PANDAS_GE_220, FutureWarning): - expect = df.stack(level=level, future_stack=False) - gdf = cudf.from_pandas(df) - with pytest.warns(FutureWarning): - got = gdf.stack(level=level, future_stack=False) - - assert_eq(expect, got) - - expect = df.stack(level=level, future_stack=True) - got = gdf.stack(level=level, future_stack=True) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("num_rows", [1, 2, 10, 1000]) -@pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["category"] -) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_interleave_columns(nulls, num_cols, num_rows, dtype): - if dtype not in ["float32", "float64"] and nulls in ["some"]: - pytest.skip(reason="nulls not supported in dtype: " + dtype) - - pdf = pd.DataFrame(dtype=dtype) - rng = np.random.default_rng(seed=0) - for i in range(num_cols): - colname = str(i) - data = pd.Series(rng.integers(0, 26, num_rows)).astype(dtype) - - if nulls == "some": - idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) - data[idx] = np.nan - pdf[colname] = data - - gdf = cudf.from_pandas(pdf) - - if dtype == "category": - with pytest.raises(ValueError): - assert gdf.interleave_columns() - else: - got = gdf.interleave_columns() - - expect = pd.Series(np.vstack(pdf.to_numpy()).reshape((-1,))).astype( - dtype - ) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 1000]) -@pytest.mark.parametrize("count", [1, 2, 10]) -@pytest.mark.parametrize("dtype", ALL_TYPES) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_tile(nulls, num_cols, num_rows, dtype, count): - if dtype not in ["float32", "float64"] and nulls in ["some"]: - pytest.skip(reason="nulls not supported in dtype: " + dtype) - - pdf = pd.DataFrame(dtype=dtype) - rng = np.random.default_rng(seed=0) - for i in range(num_cols): - colname = str(i) - data = pd.Series(rng.integers(num_cols, 26, num_rows)).astype(dtype) - - if nulls == "some": - idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) - 
data[idx] = np.nan - pdf[colname] = data - - gdf = cudf.from_pandas(pdf) - - got = gdf.tile(count) - expect = pd.DataFrame(pd.concat([pdf] * count)) - - assert_eq(expect, got) - - -def _prepare_merge_sorted_test( - size, - nparts, - keys, - add_null=False, - na_position="last", - ascending=True, - series=False, - index=False, -): - if index: - df = ( - cudf.datasets.timeseries()[:size] - .reset_index(drop=False) - .set_index(keys, drop=True) - ) - else: - df = cudf.datasets.timeseries()[:size].reset_index(drop=False) - if add_null: - df.iloc[1, df.columns.get_loc(keys[0])] = None - chunk = int(size / nparts) - indices = [i * chunk for i in range(0, nparts)] + [size] - if index: - dfs = [ - df.iloc[indices[i] : indices[i + 1]] - .copy() - .sort_index(ascending=ascending) - for i in range(nparts) - ] - elif series: - df = df[keys[0]] - dfs = [ - df.iloc[indices[i] : indices[i + 1]] - .copy() - .sort_values(na_position=na_position, ascending=ascending) - for i in range(nparts) - ] - else: - dfs = [ - df.iloc[indices[i] : indices[i + 1]] - .copy() - .sort_values(keys, na_position=na_position, ascending=ascending) - for i in range(nparts) - ] - return df, dfs - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.parametrize("keys", [None, ["id"], ["name", "timestamp"]]) -@pytest.mark.parametrize("nparts", [2, 10]) -def test_df_merge_sorted(nparts, keys, na_position, ascending): - size = 100 - keys_1 = keys or ["timestamp"] - # Null values NOT currently supported with Categorical data - # or when `ascending=False` - add_null = keys_1[0] not in ("name") - df, dfs = _prepare_merge_sorted_test( - size, - nparts, - keys_1, - add_null=add_null, - na_position=na_position, - ascending=ascending, - ) - - expect = df.sort_values( - keys_1, na_position=na_position, ascending=ascending - ) - result = cudf.core.reshape._merge_sorted( - dfs, keys=keys, na_position=na_position, ascending=ascending - ) - if keys: - expect = expect[keys] - result = result[keys] - - assert expect.index.dtype == result.index.dtype - assert_eq(expect.reset_index(drop=True), result.reset_index(drop=True)) - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("index", ["id", "x"]) -@pytest.mark.parametrize("nparts", [2, 10]) -def test_df_merge_sorted_index(nparts, index, ascending): - size = 100 - df, dfs = _prepare_merge_sorted_test( - size, nparts, index, ascending=ascending, index=True - ) - - expect = df.sort_index(ascending=ascending) - result = cudf.core.reshape._merge_sorted( - dfs, by_index=True, ascending=ascending - ) - - assert_eq(expect.index, result.index) - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.parametrize("keys", [None, ["name", "timestamp"]]) -def test_df_merge_sorted_ignore_index(keys, na_position, ascending): - size = 100 - nparts = 3 - keys_1 = keys or ["timestamp"] - # Null values NOT currently supported with Categorical data - # or when `ascending=False` - add_null = keys_1[0] not in ("name") - df, dfs = _prepare_merge_sorted_test( - size, - nparts, - keys_1, - add_null=add_null, - na_position=na_position, - ascending=ascending, - ) - - expect = df.sort_values( - keys_1, na_position=na_position, ascending=ascending - ) - result = cudf.core.reshape._merge_sorted( - dfs, - keys=keys, - na_position=na_position, - ascending=ascending, - ignore_index=True, - ) - if keys: - expect = expect[keys] - result = result[keys] - 
- assert_eq(expect.reset_index(drop=True), result) - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.parametrize("key", ["id", "name", "timestamp"]) -@pytest.mark.parametrize("nparts", [2, 10]) -def test_series_merge_sorted(nparts, key, na_position, ascending): - size = 100 - df, dfs = _prepare_merge_sorted_test( - size, - nparts, - [key], - na_position=na_position, - ascending=ascending, - series=True, - ) - - expect = df.sort_values(na_position=na_position, ascending=ascending) - result = cudf.core.reshape._merge_sorted( - dfs, na_position=na_position, ascending=ascending - ) - - assert_eq(expect.reset_index(drop=True), result.reset_index(drop=True)) - - -@pytest.mark.parametrize( - "index, column, data", - [ - ([], [], []), - ([0], [0], [0]), - ([0, 0], [0, 1], [1, 2.0]), - ([0, 1], [0, 0], [1, 2.0]), - ([0, 1], [0, 1], [1, 2.0]), - (["a", "a", "b", "b"], ["c", "d", "c", "d"], [1, 2, 3, 4]), - ( - ["a", "a", "b", "b", "a"], - ["c", "d", "c", "d", "e"], - [1, 2, 3, 4, 5], - ), - ], -) -def test_pivot_simple(index, column, data): - pdf = pd.DataFrame({"index": index, "column": column, "data": data}) - gdf = cudf.from_pandas(pdf) - - expect = pdf.pivot(columns="column", index="index") - got = gdf.pivot(columns="column", index="index") - - check_index_and_columns = expect.shape != (0, 0) - assert_eq( - expect, - got, - check_dtype=False, - check_index_type=check_index_and_columns, - check_column_type=check_index_and_columns, - ) - - -def test_pivot_multi_values(): - # from Pandas docs: - # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html - pdf = pd.DataFrame( - { - "foo": ["one", "one", "one", "two", "two", "two"], - "bar": ["A", "B", "C", "A", "B", "C"], - "baz": [1, 2, 3, 4, 5, 6], - "zoo": ["x", "y", "z", "q", "w", "t"], - } - ) - gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.pivot(index="foo", columns="bar", values=["baz", "zoo"]), - gdf.pivot(index="foo", columns="bar", values=["baz", "zoo"]), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "values", ["z", "z123", ["z123"], ["z", "z123", "123z"]] -) -def test_pivot_values(values): - data = [ - ["A", "a", 0, 0, 0], - ["A", "b", 1, 1, 1], - ["A", "c", 2, 2, 2], - ["B", "a", 0, 0, 0], - ["B", "b", 1, 1, 1], - ["B", "c", 2, 2, 2], - ["C", "a", 0, 0, 0], - ["C", "b", 1, 1, 1], - ["C", "c", 2, 2, 2], - ] - columns = ["x", "y", "z", "z123", "123z"] - pdf = pd.DataFrame(data, columns=columns) - cdf = cudf.DataFrame(data, columns=columns) - expected = pd.pivot(pdf, index="x", columns="y", values=values) - actual = cudf.pivot(cdf, index="x", columns="y", values=values) - assert_eq( - expected, - actual, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "level", - [ - 0, - pytest.param( - 1, - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - 2, - "foo", - pytest.param( - "bar", - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - "baz", - [], - pytest.param( - [0, 1], - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - ["foo"], - pytest.param( - ["foo", "bar"], - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - pytest.param( - [0, 1, 2], - marks=pytest_xfail(reason="Pandas behaviour unclear"), - ), - pytest.param( - ["foo", "bar", "baz"], - marks=pytest_xfail(reason="Pandas behaviour unclear"), - ), - ], -) -def test_unstack_multiindex(level): - pdf = pd.DataFrame( - { - 
"foo": ["one", "one", "one", "two", "two", "two"], - "bar": pd.Categorical(["A", "B", "C", "A", "B", "C"]), - "baz": [1, 2, 3, 4, 5, 6], - "zoo": ["x", "y", "z", "q", "w", "t"], - } - ).set_index(["foo", "bar", "baz"]) - gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.unstack(level=level), - gdf.unstack(level=level), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "index", - [ - pd.Index(range(0, 5), name=None), - pd.Index(range(0, 5), name="row_index"), - pytest.param( - pd.CategoricalIndex(["d", "e", "f", "g", "h"]), - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - ], -) -@pytest.mark.parametrize( - "col_idx", - [ - pd.Index(["a", "b"], name=None), - pd.Index(["a", "b"], name="col_index"), - pd.MultiIndex.from_tuples([("c", 1), ("c", 2)], names=[None, None]), - pd.MultiIndex.from_tuples( - [("c", 1), ("c", 2)], names=["col_index1", "col_index2"] - ), - ], -) -def test_unstack_index(index, col_idx): - data = { - "A": [1.0, 2.0, 3.0, 4.0, 5.0], - "B": [11.0, 12.0, 13.0, 14.0, 15.0], - } - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - pdf.index = index - pdf.columns = col_idx - - gdf.index = cudf.from_pandas(index) - gdf.columns = cudf.from_pandas(col_idx) - - assert_eq(pdf.unstack(), gdf.unstack()) - - -def test_unstack_index_invalid(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - with pytest.raises( - ValueError, - match=re.escape( - "Calling unstack() on single index dataframe with " - "different column datatype is not supported." - ), - ): - gdf.unstack() - - -def test_pivot_duplicate_error(): - gdf = cudf.DataFrame( - {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]} - ) - with pytest.raises(ValueError): - gdf.pivot(index="a", columns="b") - with pytest.raises(ValueError): - gdf.pivot(index="b", columns="a") - - -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -def test_pivot_table_simple(aggfunc): - rng = np.random.default_rng(seed=0) - fill_value = 0 - pdf = pd.DataFrame( - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": rng.standard_normal(size=24), - "E": rng.standard_normal(size=24), - } - ) - expected = pd.pivot_table( - pdf, - values=["D", "E"], - index=["A", "B"], - columns=["C"], - aggfunc=aggfunc, - fill_value=fill_value, - ) - cdf = cudf.DataFrame.from_pandas(pdf) - actual = cudf.pivot_table( - cdf, - values=["D", "E"], - index=["A", "B"], - columns=["C"], - aggfunc=aggfunc, - fill_value=fill_value, - ) - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -def test_dataframe_pivot_table_simple(aggfunc): - rng = np.random.default_rng(seed=0) - fill_value = 0 - pdf = pd.DataFrame( - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": rng.standard_normal(size=24), - "E": rng.standard_normal(size=24), - } - ) - expected = pdf.pivot_table( - values=["D", "E"], - index=["A", "B"], - columns=["C"], - aggfunc=aggfunc, - fill_value=fill_value, - ) - cdf = cudf.DataFrame.from_pandas(pdf) - actual = cdf.pivot_table( - values=["D", "E"], - index=["A", "B"], - columns=["C"], - aggfunc=aggfunc, - fill_value=fill_value, - ) - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize("index", ["A", ["A"]]) -@pytest.mark.parametrize("columns", ["C", ["C"]]) -def 
test_pivot_table_scalar_index_columns(index, columns): - data = { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": range(24), - "E": range(24), - } - result = cudf.DataFrame(data).pivot_table( - values="D", index=index, columns=columns, aggfunc="sum" - ) - expected = pd.DataFrame(data).pivot_table( - values="D", index=index, columns=columns, aggfunc="sum" - ) - assert_eq(result, expected) - - -def test_crosstab_simple(): - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - "bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) - expected = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - actual = cudf.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize("index", [["ix"], ["ix", "foo"]]) -@pytest.mark.parametrize("columns", [["col"], ["col", "baz"]]) -def test_pivot_list_like_index_columns(index, columns): - data = { - "bar": ["x", "y", "z", "w"], - "col": ["a", "b", "a", "b"], - "foo": [1, 2, 3, 4], - "ix": [1, 1, 2, 2], - "baz": [0, 0, 0, 0], - } - pd_df = pd.DataFrame(data) - cudf_df = cudf.DataFrame(data) - result = cudf_df.pivot(columns=columns, index=index) - expected = pd_df.pivot(columns=columns, index=index) - assert_eq(result, expected) From f19b7ed4647df6cc6814b444cfe2f34f2d2be04d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:28:17 -0700 Subject: [PATCH 129/366] Move test_csv/feather/json.py to new cudf classic test directory structure (#19639) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19639 --- .../cudf/cudf/tests/input_output/test_csv.py | 2254 ++++++++++++++++- .../cudf/tests/input_output/test_feather.py | 66 +- .../cudf/cudf/tests/input_output/test_json.py | 1447 ++++++++++- python/cudf/cudf/tests/test_csv.py | 2249 ---------------- python/cudf/cudf/tests/test_feather.py | 78 - python/cudf/cudf/tests/test_json.py | 1467 ----------- 6 files changed, 3764 insertions(+), 3797 deletions(-) delete mode 100644 python/cudf/cudf/tests/test_csv.py delete mode 100644 python/cudf/cudf/tests/test_feather.py delete mode 100644 python/cudf/cudf/tests/test_json.py diff --git a/python/cudf/cudf/tests/input_output/test_csv.py b/python/cudf/cudf/tests/input_output/test_csv.py index 06777c8e6af..9ab62b11c7b 100644 --- a/python/cudf/cudf/tests/input_output/test_csv.py +++ b/python/cudf/cudf/tests/input_output/test_csv.py @@ -1 +1,2253 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
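[Reviewer annotation, not part of the patch: most of the relocated CSV tests
below follow one pattern: run cudf.read_csv and pandas.read_csv over the same
buffer while varying a single reader option (dtype, names, parse_dates,
compression, byte_range, and so on), then compare the resulting frames.
Sketched with invented data:

    from io import StringIO

    import pandas as pd

    import cudf
    from cudf.testing import assert_eq

    buf = "a,b\n1,2.5\n3,4.0"
    expect = pd.read_csv(StringIO(buf), dtype={"a": "int32"})
    got = cudf.read_csv(StringIO(buf), dtype={"a": "int32"})
    assert_eq(expect, got)
]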
+ +import codecs +import gzip +import os +import re +import shutil +from io import BytesIO, StringIO + +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf import read_csv +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal, expect_warning_if + + +@pytest.fixture +def pd_mixed_dataframe(): + return pd.DataFrame( + { + "Integer": [2345, 11987, 9027, 9027], + "Date": ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"], + "Float": [9.001, 8.343, 6, 2.781], + "Integer2": [2345, 106, 2088, 789277], + "Category": ["M", "F", "F", "F"], + "String": ["Alpha", "Beta", "Gamma", "Delta"], + "Boolean": [True, False, True, False], + } + ) + + +@pytest.fixture +def cudf_mixed_dataframe(pd_mixed_dataframe): + return cudf.from_pandas(pd_mixed_dataframe) + + +@pytest.fixture +def gdf_np_dtypes(): + gdf_dtypes = [ + "float", + "float32", + "double", + "float64", + "int8", + "short", + "int16", + "int", + "int32", + "long", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + ] + + np_dtypes = [ + np.float32, + np.float32, + np.float64, + np.float64, + np.int8, + np.int16, + np.int16, + np.int32, + np.int32, + np.int64, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + ] + return dict(zip(gdf_dtypes, np_dtypes, strict=True)) + + +@pytest.fixture +def numeric_extremes_dataframe(gdf_np_dtypes): + data = {} + for typ, np_type in gdf_np_dtypes.items(): + if np.dtype(np_type).kind in "iu": + itype = np.iinfo(np_type) + extremes = [0, +1, -1, itype.min, itype.max] + data[typ] = np.array(extremes * 4).astype(np_type)[:20] + else: + ftype = np.finfo(np_type) + extremes = [ + 0.0, + -0.0, + +1, + -1, + np.nan, + -np.nan, + # ftype.min, # TODO enable after fixing truncation issue #6235 + # ftype.max, # TODO enable after fixing truncation issue #6235 + np_type(np.inf), + -np_type(np.inf), + ftype.eps, + ftype.epsneg, + ftype.tiny, + -ftype.eps, + -ftype.epsneg, + -ftype.tiny, + ] + data[typ] = np.array(extremes * 4, dtype=np_type)[:20] + return pd.DataFrame(data) + + +def test_csv_reader_numeric_data(numeric_types_as_str, tmp_path): + fname = tmp_path / "tmp_csvreader_file1.csv" + + df = pd.DataFrame( + {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + ).astype(numeric_types_as_str) + df.to_csv(fname, index=False, header=False) + + dtypes = [df[k].dtype for k in df.columns] + out = read_csv(str(fname), names=list(df.columns.values), dtype=dtypes) + + assert len(out.columns) == len(df.columns) + assert_eq(df, out) + + +@pytest.mark.parametrize("parse_dates", [["date2"], [0], ["date1", 1, "bad"]]) +def test_csv_reader_datetime(parse_dates): + df = pd.DataFrame( + { + "col1": [ + "31/10/2010", + "05/03/2001", + "20/10/1994", + "18/10/1990", + "1/1/1970", + "2016-04-30T01:02:03.000", + "2038-01-19 03:14:07", + ], + "col2": [ + "18/04/1995", + "14 / 07 / 1994", + "07/06/2006", + "16/09/2005", + "2/2/1970", + "2007-4-30 1:6:40.000PM", + "2038-01-19 03:14:08", + ], + "col3": [ + "1 Jan", + "2 January 1994", + "Feb 2002", + "31-01-2000", + "1-1-1996", + "15-May-2009", + "21-Dec-3262", + ], + } + ) + buffer = df.to_csv(index=False, header=False) + + gdf = read_csv( + StringIO(buffer), + names=["date1", "date2", "bad"], + parse_dates=parse_dates, + dayfirst=True, + ) + # Need to used `date_format='mixed'`, + # https://github.com/pandas-dev/pandas/issues/53355 + pdf = pd.read_csv( + StringIO(buffer), + names=["date1", 
"date2", "bad"], + parse_dates=parse_dates, + dayfirst=True, + date_format="mixed", + ) + + assert_eq(gdf, pdf) + + +@pytest.mark.parametrize("p_arg", ["delimiter", "sep"]) +@pytest.mark.parametrize("c_arg", ["sep", "delimiter"]) +def test_csv_reader_mixed_data_delimiter_sep( + tmp_path, p_arg, c_arg, pd_mixed_dataframe +): + pandas_arg = {p_arg: "|"} + cudf_arg = {c_arg: "|"} + fname = tmp_path / "tmp_csvreader_file3.csv" + + pd_mixed_dataframe.to_csv(fname, sep="|", index=False, header=False) + + gdf1 = read_csv( + str(fname), + names=["1", "2", "3", "4", "5", "6", "7"], + dtype=[ + "int64", + "datetime64[ns]", + "float64", + "int64", + "category", + "str", + "bool", + ], + dayfirst=True, + **cudf_arg, + ) + gdf2 = read_csv( + str(fname), + names=["1", "2", "3", "4", "5", "6", "7"], + dtype=[ + "int64", + "datetime64[ns]", + "float64", + "int64", + "category", + "str", + "bool", + ], + dayfirst=True, + **pandas_arg, + ) + + pdf = pd.read_csv( + fname, + names=["1", "2", "3", "4", "5", "6", "7"], + parse_dates=[1], + dayfirst=True, + **pandas_arg, + ) + + assert len(gdf1.columns) == len(pdf.columns) + assert len(gdf2.columns) == len(pdf.columns) + assert_eq(gdf1, gdf2) + + +@pytest.mark.parametrize("use_list", [False, True]) +def test_csv_reader_dtype_list(use_list): + df = pd.DataFrame( + {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + ).astype(np.float32) + buffer = df.to_csv(index=False, header=False) + + # PANDAS doesn't list but cudf does (treated as implied ordered dict) + # Select first column's dtype if non-list; expect the same dtype for all + if use_list: + dtypes = [df[k].dtype for k in df.columns] + else: + dtypes = df[df.columns[0]].dtype + + gdf = read_csv(StringIO(buffer), dtype=dtypes, names=df.columns) + + assert_eq(gdf, df) + + +@pytest.mark.parametrize("use_names", [False, True]) +def test_csv_reader_dtype_dict(use_names, gdf_np_dtypes): + # Save with the column header if not explicitly specifying a list of names + df = pd.DataFrame( + { + typ: np.zeros(3, dtype=np_type) + for typ, np_type in gdf_np_dtypes.items() + } + ) + buffer = df.to_csv(index=False, header=not use_names) + dtypes = df.dtypes.to_dict() + names = list(gdf_np_dtypes.keys()) if use_names else None + gdf = read_csv(StringIO(buffer), dtype=dtypes, names=names) + pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=names) + + assert_eq(gdf, pdf) + + +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") +@pytest.mark.parametrize("use_names", [True, False]) +def test_csv_reader_dtype_extremes(use_names, numeric_extremes_dataframe): + # Save with the column header if not explicitly specifying a list of names + df = numeric_extremes_dataframe + buffer = df.to_csv(index=False, header=not use_names) + dtypes = df.dtypes.to_dict() + names = df.columns.to_list() if use_names else None + + gdf = read_csv(StringIO(buffer), dtype=dtypes, names=names) + pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=names) + + assert_eq(gdf, pdf) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/52449", +) +def test_csv_reader_skiprows_skipfooter(tmp_path, pd_mixed_dataframe): + fname = tmp_path / "tmp_csvreader_file5.csv" + + pd_mixed_dataframe.to_csv( + fname, columns=["Integer", "Date", "Float"], index=False, header=False + ) + + # Using engine='python' to eliminate pandas warning of using python engine. 
+ df_out = pd.read_csv( + fname, + names=["1", "2", "3"], + parse_dates=[1], + dayfirst=True, + skiprows=1, + skipfooter=1, + engine="python", + ) + out = read_csv( + str(fname), + names=["1", "2", "3"], + dtype=["int64", "datetime64[ns]", "float64"], + skiprows=1, + skipfooter=1, + dayfirst=True, + ) + + assert len(out.columns) == len(df_out.columns) + assert len(out) == len(df_out) + + assert_eq(df_out, out, check_dtype=False) + + +def test_csv_reader_negative_vals(tmp_path): + fname = tmp_path / "tmp_csvreader_file6.csv" + + names = ["0", "1", "2"] + dtypes = ["float32", "float32", "float32"] + lines = [ + ",".join(names), + "-181.5060,-185.37000,-3", + "-127.6300,-230.54600,-9", + ] + + with open(str(fname), "w") as fp: + fp.write("\n".join(lines)) + + zero = [-181.5060, -127.6300] + one = [-185.370, -230.54600] + two = [-3, -9] + + df = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) + + np.testing.assert_allclose(zero, df["0"].to_numpy()) + np.testing.assert_allclose(one, df["1"].to_numpy()) + np.testing.assert_allclose(two, df["2"].to_numpy()) + + +def test_csv_reader_strings(tmp_path): + fname = tmp_path / "tmp_csvreader_file7.csv" + + names = ["text", "int"] + dtypes = ["str", "int"] + lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"] + + with open(str(fname), "w") as fp: + fp.write("\n".join(lines)) + + df = read_csv( + str(fname), + names=names, + dtype=dtypes, + skiprows=1, + decimal=".", + thousands="'", + ) + + assert len(df.columns) == 2 + assert df["text"].dtype == np.dtype("object") + assert df["int"].dtype == np.dtype("int64") + assert df["text"][0] == "a" + assert df["text"][1] == "b" + assert df["text"][2] == "c" + assert df["text"][3] == "d" + + +def test_csv_reader_strings_quotechars(tmp_path): + fname = tmp_path / "tmp_csvreader_file8.csv" + + names = ["text", "int"] + dtypes = ["str", "int"] + lines = [",".join(names), '"a,\n",0', '"b ""c"" d",0', "e,0", '"f,,!.,",0'] + + with open(str(fname), "w") as fp: + fp.write("\n".join(lines)) + + df = read_csv( + str(fname), + names=names, + dtype=dtypes, + skiprows=1, + quotechar='"', + quoting=1, + ) + + assert len(df.columns) == 2 + assert df["text"].dtype == np.dtype("object") + assert df["int"].dtype == np.dtype("int64") + assert df["text"][0] == "a,\n" + assert df["text"][1] == 'b "c" d' + assert df["text"][2] == "e" + assert df["text"][3] == "f,,!.," + + +def test_csv_reader_usecols_int_char(tmp_path, pd_mixed_dataframe): + fname = tmp_path / "tmp_csvreader_file10.csv" + pd_mixed_dataframe.to_csv( + fname, + columns=["Integer", "Date", "Float", "Integer2"], + index=False, + header=False, + ) + + df_out = pd.read_csv(fname, usecols=[0, 1, 3]) + out = read_csv(fname, usecols=[0, 1, 3]) + + assert len(out.columns) == len(df_out.columns) + assert len(out) == len(df_out) + assert_eq(df_out, out, check_names=False) + + +@pytest.mark.parametrize( + "buffer", + [ + "abc,ABC,abc,abcd,abc\n1,2,3,4,5\n", + "A,A,A.1,A,A.2,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a", + "A,A,A.1,,Unnamed: 4,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a", + ], +) +@pytest.mark.parametrize("mangle_dupe_cols", [True, False]) +def test_csv_reader_mangle_dupe_cols(buffer, mangle_dupe_cols): + # Default: mangle_dupe_cols=True + cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=mangle_dupe_cols) + if mangle_dupe_cols: + pd_df = pd.read_csv(StringIO(buffer)) + else: + # Pandas does not support mangle_dupe_cols=False + head = buffer.split("\n")[0].split(",") + first_cols = np.unique(head, return_index=True)[1] + pd_df = pd.read_csv(StringIO(buffer), 
usecols=first_cols) + assert_eq(cu_df, pd_df) + + +def test_csv_reader_float_decimal(tmp_path): + fname = tmp_path / "tmp_csvreader_file12.csv" + + names = ["basic_32", "basic_64", "round", "decimal_only", "precision"] + dtypes = ["float32", "float64", "float64", "float32", "float64"] + lines = [ + ";".join(names), + "1,2;1234,5678;12345;0,123;-73,98007199999998", + "3,4;3456,7890;67890;,456;1,7976931348623157e+307", + "5,6e0;0,5679e2;1,2e10;0,07e-001;0,0", + ] + + with open(str(fname), "w") as fp: + fp.write("\n".join(lines)) + + basic_32_ref = [1.2, 3.4, 5.6] + basic_64_ref = [1234.5678, 3456.7890, 56.79] + round_ref = [12345, 67890, 12000000000] + decimal_only_ref = [0.123, 0.456, 0.007] + precision_ref = [-73.98007199999998, 1.7976931348623157e307, 0.0] + + df = read_csv( + str(fname), + names=names, + dtype=dtypes, + skiprows=1, + delimiter=";", + decimal=",", + ) + + np.testing.assert_allclose(basic_32_ref, df["basic_32"].to_numpy()) + np.testing.assert_allclose(basic_64_ref, df["basic_64"].to_numpy()) + np.testing.assert_allclose(round_ref, df["round"].to_numpy()) + np.testing.assert_allclose(decimal_only_ref, df["decimal_only"].to_numpy()) + np.testing.assert_allclose(precision_ref, df["precision"].to_numpy()) + + +def test_csv_reader_NaN_values(): + names = dtypes = ["float32"] + empty_cells = '\n""\n' + default_na_cells = ( + "#N/A\n#N/A N/A\n#NA\n-1.#IND\n" + "-1.#QNAN\n-NaN\n-nan\n1.#IND\n" + "1.#QNAN\nN/A\n\nNA\nNULL\n" + "NaN\nn/a\nnan\nnull\n" + ) + custom_na_cells = "NV_NAN\nNotANumber\n" + all_cells = empty_cells + default_na_cells + custom_na_cells + custom_na_values = ["NV_NAN", "NotANumber"] + + # test default NA values. empty cells should also yield NaNs + gdf = read_csv( + StringIO(default_na_cells + empty_cells), names=names, dtype=dtypes + ) + pdf = pd.read_csv( + StringIO(default_na_cells + empty_cells), names=names, dtype=np.float32 + ) + assert_eq(pdf, gdf) + + # custom NA values + gdf = read_csv( + StringIO(all_cells), + names=names, + dtype=dtypes, + na_values=custom_na_values, + ) + pdf = pd.read_csv( + StringIO(all_cells), + names=names, + dtype=np.float32, + na_values=custom_na_values, + ) + assert_eq(pdf, gdf) + + # custom NA values + gdf = read_csv( + StringIO(empty_cells + default_na_cells + "_NAA_\n"), + names=names, + dtype=dtypes, + na_values="_NAA_", + ) + pdf = pd.read_csv( + StringIO(empty_cells + default_na_cells + "_NAA_\n"), + names=names, + dtype=np.float32, + na_values="_NAA_", + ) + assert_eq(pdf, gdf) + + # data type detection should evaluate the column to int8 (all nulls) + gdf = read_csv( + StringIO(all_cells), + header=None, + na_values=custom_na_values, + ) + assert gdf.dtypes.iloc[0] == "int8" + assert all(gdf["0"][idx] is cudf.NA for idx in range(len(gdf["0"]))) + + # data type detection should evaluate the column to object if some nulls + gdf = read_csv(StringIO(all_cells), header=None) + assert gdf.dtypes.iloc[0] == np.dtype("object") + + +def test_csv_reader_thousands(tmp_path): + fname = tmp_path / "tmp_csvreader_file13.csv" + + names = dtypes = [ + "float32", + "float64", + "int32", + "int64", + "uint32", + "uint64", + ] + lines = [ + ",".join(names), + "1'234.5, 1'234.567, 1'234'567, 1'234'567'890,\ + 1'234'567, 1'234'567'890", + "12'345.6, 123'456.7, 12'345, 123'456'789, 12'345, 123'456'789", + ] + + with open(str(fname), "w") as fp: + fp.write("\n".join(lines)) + + f32_ref = [1234.5, 12345.6] + f64_ref = [1234.567, 123456.7] + int32_ref = [1234567, 12345] + int64_ref = [1234567890, 123456789] + uint32_ref = [1234567, 12345] + 
uint64_ref = [1234567890, 123456789] + + df = read_csv( + str(fname), names=names, dtype=dtypes, skiprows=1, thousands="'" + ) + + np.testing.assert_allclose(f32_ref, df["float32"].to_numpy()) + np.testing.assert_allclose(f64_ref, df["float64"].to_numpy()) + np.testing.assert_allclose(int32_ref, df["int32"].to_numpy()) + np.testing.assert_allclose(int64_ref, df["int64"].to_numpy()) + np.testing.assert_allclose(uint32_ref, df["uint32"].to_numpy()) + np.testing.assert_allclose(uint64_ref, df["uint64"].to_numpy()) + + +def test_csv_reader_buffer_strings(): + names = ["text", "int"] + dtypes = ["str", "int"] + lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"] + + buffer = "\n".join(lines) + + df = read_csv(StringIO(buffer), names=names, dtype=dtypes, skiprows=1) + assert len(df.columns) == 2 + assert df["text"].dtype == np.dtype("object") + assert df["int"].dtype == np.dtype("int64") + assert df["text"][0] == "a" + assert df["text"][1] == "b" + assert df["text"][2] == "c" + assert df["text"][3] == "d" + + df2 = read_csv( + BytesIO(str.encode(buffer)), names=names, dtype=dtypes, skiprows=1 + ) + assert len(df2.columns) == 2 + assert df2["text"].dtype == np.dtype("object") + assert df2["int"].dtype == np.dtype("int64") + assert df2["text"][0] == "a" + assert df2["text"][1] == "b" + assert df2["text"][2] == "c" + assert df2["text"][3] == "d" + + +@pytest.mark.parametrize( + "ext, out_comp, in_comp", + [ + (".geez", "gzip", "gzip"), + (".beez", "bz2", "bz2"), + (".gz", "gzip", "infer"), + (".bz2", "bz2", "infer"), + (".beez", "bz2", np.str_("bz2")), + (".data", None, "infer"), + (".txt", None, None), + ("", None, None), + ], +) +def test_csv_reader_compression( + tmp_path, ext, out_comp, in_comp, pd_mixed_dataframe +): + fname = tmp_path / f"tmp_csvreader_compression.{ext}" + + df = pd_mixed_dataframe + df.to_csv(fname, index=False, header=False, compression=out_comp) + + gdf = read_csv(fname, names=list(df.columns.values), compression=in_comp) + pdf = pd.read_csv( + fname, names=list(df.columns.values), compression=in_comp + ) + + assert_eq(gdf, pdf) + + +@pytest.mark.parametrize( + "names, dtypes, data, trues, falses", + [ + ( + ["A", "B"], + ["bool", "bool"], + "True,True\nFalse,False\nTrue,False", + None, + None, + ), + ( + ["A", "B"], + ["int32", "int32"], + "True,1\nFalse,2\nTrue,3", + None, + None, + ), + ( + ["A", "B"], + ["int32", "int32"], + "YES,1\nno,2\nyes,3\nNo,4\nYes,5", + ["yes", "Yes", "YES"], + ["no", "NO", "No"], + ), + (["A", "B"], ["int32", "int32"], "foo,bar\nbar,foo", ["foo"], ["bar"]), + (["x", "y"], None, "True,1\nFalse,0", None, None), + ], +) +def test_csv_reader_bools(tmp_path, names, dtypes, data, trues, falses): + fname = tmp_path / "tmp_csvreader_file11.csv" + + lines = [",".join(names), data] + + with open(str(fname), "w") as fp: + fp.write("\n".join(lines)) + + # Usage of true_values and false_values makes that column into bool type + df_out = pd.read_csv( + fname, + names=names, + skiprows=1, + dtype=(dtypes[0] if dtypes else None), + true_values=trues, + false_values=falses, + ) + + out = read_csv( + fname, + names=names, + dtype=dtypes, + skiprows=1, + true_values=trues, + false_values=falses, + ) + + assert_eq(df_out, out) + + +def test_csv_reader_bools_custom(): + names = ["text", "bool"] + dtypes = {"text": "str", "bool": "bool"} + trues = ["foo", "1"] + falses = ["bar", "0"] + lines = [ + ",".join(names), + "true,true", + "false,false", + "foo,foo", + "bar,bar", + "0,0", + "1,1", + ] + buffer = "\n".join(lines) + + df = read_csv( + StringIO(buffer), + 
names=names, + dtype=dtypes, + skiprows=1, + true_values=trues, + false_values=falses, + ) + + # Note: bool literals give parsing errors as int + # "0" and "1" give parsing errors as bool in pandas + expected = pd.read_csv( + StringIO(buffer), + names=names, + dtype=dtypes, + skiprows=1, + true_values=trues, + false_values=falses, + ) + assert_eq(df, expected, check_dtype=True) + + +def test_csv_reader_bools_NA(): + names = ["text", "int"] + dtypes = ["str", "int"] + trues = ["foo"] + falses = ["bar"] + lines = [ + ",".join(names), + "true,true", + "false,false", + "foo,foo", + "bar,bar", + "qux,qux", + ] + + buffer = "\n".join(lines) + + df = read_csv( + StringIO(buffer), + names=names, + dtype=dtypes, + skiprows=1, + true_values=trues, + false_values=falses, + ) + assert len(df.columns) == 2 + assert df["text"].dtype == np.dtype("object") + assert df["int"].dtype == np.dtype("int64") + expected = pd.DataFrame( + { + "text": ["true", "false", "foo", "bar", "qux"], + "int": [1.0, 0.0, 1.0, 0.0, np.nan], + } + ) + assert_eq(df, expected) + + +def test_csv_quotednumbers(tmp_path): + fname = tmp_path / "tmp_csvreader_file12.csv" + + names = ["integer", "decimal"] + dtypes = ["int32", "float32"] + lines = [ + ",".join(names), + '1,"3.14"', + '"2","300"', + '"3",10101.0101', + '4,"6.28318"', + ] + + with open(str(fname), "w") as fp: + fp.write("\n".join(lines)) + + integer_ref = [1, 2, 3, 4] + decimal_ref = [3.14, 300, 10101.0101, 6.28318] + + df1 = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) + df2 = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) + + assert len(df2.columns) == 2 + np.testing.assert_allclose(integer_ref, df1["integer"].to_numpy()) + np.testing.assert_allclose(decimal_ref, df1["decimal"].to_numpy()) + np.testing.assert_allclose(integer_ref, df2["integer"].to_numpy()) + np.testing.assert_allclose(decimal_ref, df2["decimal"].to_numpy()) + + +def test_csv_reader_nrows(tmp_path): + fname = tmp_path / "tmp_csvreader_file14.csv" + + names = ["int1", "int2"] + dtypes = ["int32", "int32"] + + rows = 4000 + read_rows = (rows * 3) // 4 + skip_rows = (rows - read_rows) // 2 + sample_skip = 100 + + with open(str(fname), "w") as fp: + fp.write(",".join(names) + "\n") + for i in range(rows): + fp.write(str(i) + ", " + str(2 * i) + " \n") + + # with specified names + df = read_csv( + str(fname), + names=names, + dtype=dtypes, + skiprows=skip_rows + 1, + nrows=read_rows, + ) + assert df.shape == (read_rows, 2) + for row in range(0, read_rows // sample_skip, sample_skip): + assert df["int1"][row] == row + skip_rows + assert df["int2"][row] == 2 * (row + skip_rows) + assert df["int2"][read_rows - 1] == 2 * (read_rows - 1 + skip_rows) + + # with column name inference + df = read_csv( + str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows + ) + assert df.shape == (read_rows, 2) + assert str(skip_rows) in next(iter(df)) + assert str(2 * skip_rows) in list(df)[1] + for row in range(0, read_rows // sample_skip, sample_skip): + assert df[next(iter(df))][row] == row + skip_rows + 1 + assert df[list(df)[1]][row] == 2 * (row + skip_rows + 1) + assert df[list(df)[1]][read_rows - 1] == 2 * (read_rows + skip_rows) + + # nrows larger than the file + df = read_csv(str(fname), dtype=dtypes, nrows=rows * 2) + assert df.shape == (rows, 2) + for row in range(0, rows // sample_skip, sample_skip): + assert df["int1"][row] == row + assert df["int2"][row] == 2 * row + assert df["int2"][rows - 1] == 2 * (rows - 1) + + # nrows + skiprows larger than the file + df = read_csv( + 
str(fname), dtype=dtypes, nrows=read_rows, skiprows=read_rows + ) + assert df.shape == (rows - read_rows, 2) + + # nrows equal to zero + df = read_csv(str(fname), dtype=dtypes, nrows=0) + assert df.shape == (0, 2) + + # with both skipfooter and nrows - should throw + with pytest.raises(ValueError): + read_csv(str(fname), nrows=read_rows, skipfooter=1) + + +def test_csv_reader_gzip_compression_strings(tmp_path): + fname = tmp_path / "tmp_csvreader_file15.csv" + fnamez = tmp_path / "tmp_csvreader_file15.csv.gz" + + names = ["text", "int"] + dtypes = ["str", "int"] + lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"] + + with open(str(fname), "w") as fp: + fp.write("\n".join(lines)) + + with open(str(fname), "rb") as f_in, gzip.open(str(fnamez), "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + + df = read_csv( + str(fnamez), + names=names, + dtype=dtypes, + skiprows=1, + decimal=".", + thousands="'", + compression="gzip", + ) + + assert len(df.columns) == 2 + assert df["text"].dtype == np.dtype("object") + assert df["int"].dtype == np.dtype("int64") + assert df["text"][0] == "a" + assert df["text"][1] == "b" + assert df["text"][2] == "c" + assert df["text"][3] == "d" + + +@pytest.mark.parametrize("skip_rows", [0, 4]) +@pytest.mark.parametrize("header_row", [0, 2]) +def test_csv_reader_skiprows_header(skip_rows, header_row): + names = ["float_point", "integer"] + dtypes = ["float64", "int64"] + lines = [ + ",".join(names), + "1.2, 1", + "2.3, 2", + "3.4, 3", + "4.5, 4", + "5.6, 5", + "6.7, 6", + ] + buffer = "\n".join(lines) + + cu_df = read_csv( + StringIO(buffer), dtype=dtypes, skiprows=skip_rows, header=header_row + ) + pd_df = pd.read_csv( + StringIO(buffer), skiprows=skip_rows, header=header_row + ) + + assert cu_df.shape == pd_df.shape + assert list(cu_df.columns.values) == list(pd_df.columns.values) + + +def test_csv_reader_dtype_inference(): + names = ["float_point", "integer"] + lines = [ + ",".join(names), + "1.2,1", + "2.3,2", + "3.4,3", + "4.5,4", + "5.6,5", + "6.7,6", + ] + buffer = "\n".join(lines) + cu_df = read_csv(StringIO(buffer)) + pd_df = pd.read_csv(StringIO(buffer)) + + assert cu_df.shape == pd_df.shape + assert list(cu_df.columns.values) == list(pd_df.columns.values) + + +def test_csv_reader_dtype_inference_whitespace(): + names = ["float_point", "integer"] + lines = [ + ",".join(names), + " 1.2, 1", + "2.3,2 ", + " 3.4, 3", + " 4.5,4", + "5.6, 5", + " 6.7,6 ", + ] + buffer = "\n".join(lines) + cu_df = read_csv(StringIO(buffer)) + pd_df = pd.read_csv(StringIO(buffer)) + + assert cu_df.shape == pd_df.shape + assert list(cu_df.columns.values) == list(pd_df.columns.values) + + +def test_csv_reader_empty_dataframe(): + dtypes = ["float64", "int64"] + buffer = "float_point, integer" + + # should work fine with dtypes + df = read_csv(StringIO(buffer), dtype=dtypes) + assert df.shape == (0, 2) + assert all(df.dtypes == ["float64", "int64"]) + + # should default to string columns without dtypes + df = read_csv(StringIO(buffer)) + assert df.shape == (0, 2) + assert all(df.dtypes == ["object", "object"]) + + +def test_csv_reader_filenotfound(tmp_path): + fname = str(tmp_path / "non-existing-filename.csv") + with pytest.raises(FileNotFoundError): + read_csv(fname) + + dir_name = tmp_path / "gdf_csv" + dir_name.mkdir() + with pytest.raises(FileNotFoundError): + read_csv(str(dir_name)) + + +@pytest.mark.parametrize( + "src", + [ + lambda path: str(path), + lambda path: path, + lambda path: BytesIO(path.read_bytes()), + lambda path: StringIO(path.read_text()), + lambda path: 
path.as_uri(), + ], + ids=["filepath", "pathlib.Path", "ByteIO", "StringIO", "url"], +) +def test_csv_reader_filepath_or_buffer(tmp_path, src): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=np.int32) + csv_path = tmp_path / "tmp.csv" + df.to_csv(csv_path, index=False, header=False) + expect = pd.read_csv(csv_path) + got = cudf.read_csv(src(csv_path)) + + assert_eq(expect, got) + + +def test_small_zip(tmp_path): + df = pd.DataFrame( + { + "a": [1997] * 2, + "b": ["Ford"] * 2, + "c": ["Super, luxurious truck"] * 2, + } + ) + + fname = tmp_path / "small_zip_file.zip" + df.to_csv(fname, index=False) + + got = cudf.read_csv(fname) + assert_eq(df, got) + + +def test_csv_reader_carriage_return(): + rows = 100 + names = ["int_row", "int_double_row"] + buffer = ",".join(names) + "\r\n" + for row in range(rows): + buffer += str(row) + ", " + str(2 * row) + "\r\n" + + df = read_csv(StringIO(buffer)) + expect = cudf.DataFrame( + {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2} + ) + + assert len(df) == rows + assert_eq(expect, df) + + +def test_csv_reader_tabs(): + names = ["float_point", "integer", "date"] + lines = [ + ",".join(names), + "1.2,\t12, \t11/22/1995", + "3.4\t,\t34\t,\t 01/01/2001", + "\t 5.6,56 \t, 12/12/1970", + "\t7.8 , 78\t,06/15/2018 \t", + ] + buffer = "\n".join(lines) + + df = read_csv(StringIO(buffer), parse_dates=["date"]) + + assert df.shape == (4, 3) + + floats = [1.2, 3.4, 5.6, 7.8] + ints = [12, 34, 56, 78] + dates = [ + "1995-11-22T00:00:00.000000000", + "2001-01-01T00:00:00.000000000", + "1970-12-12T00:00:00.000000000", + "2018-06-15T00:00:00.000000000", + ] + np.testing.assert_allclose(floats, df["float_point"].to_numpy()) + np.testing.assert_allclose(ints, df["integer"].to_numpy()) + for row in range(4): + assert str(df["date"][row]) == dates[row] + + +@pytest.mark.parametrize("segment_bytes", [10000, 19999, 30001, 36000]) +def test_csv_reader_byte_range(tmp_path, segment_bytes): + fname = tmp_path / "tmp_csvreader_file16.csv" + + names = ["int1", "int2"] + + rows = 10000 + with open(str(fname), "w") as fp: + for i in range(rows): + fp.write(str(i) + ", " + str(2 * i) + " \n") + file_size = os.stat(str(fname)).st_size + + ref_df = read_csv(str(fname), names=names).to_pandas() + + dfs = [] + for segment in range((file_size + segment_bytes - 1) // segment_bytes): + dfs.append( + read_csv( + str(fname), + names=names, + byte_range=(segment * segment_bytes, segment_bytes), + ) + ) + df = cudf.concat(dfs).to_pandas() + + assert list(df["int1"]) == list(ref_df["int1"]) + assert list(df["int2"]) == list(ref_df["int2"]) + + +def test_csv_reader_byte_range_type_corner_case(tmp_path): + fname = tmp_path / "tmp_csvreader_file17.csv" + + cudf.datasets.timeseries( + start="2000-01-01", + end="2000-01-02", + dtypes={"name": str, "id": int, "x": float, "y": float}, + ).to_csv(fname, chunksize=100000) + + byte_range = (2_147_483_648, 0) + with pytest.raises(ValueError, match="Invalid byte range offset"): + cudf.read_csv(fname, byte_range=byte_range, header=None) + + +@pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) +def test_csv_reader_byte_range_strings(segment_bytes): + names = ["strings"] + buffer = "\n".join('"' + str(x) + '"' for x in range(1, 100)) + file_size = len(buffer) + + ref_df = read_csv(StringIO(buffer), names=names).to_pandas() + + dfs = [] + for segment in range((file_size + segment_bytes - 1) // segment_bytes): + dfs.append( + read_csv( + StringIO(buffer), + names=names, + byte_range=(segment * segment_bytes, segment_bytes), 
+ ) + ) + df = cudf.concat(dfs).to_pandas() + + assert list(df["strings"]) == list(ref_df["strings"]) + + +@pytest.mark.parametrize( + "header_row, skip_rows, skip_blanks", + [ + (1, 0, True), + ("infer", 2, True), + (1, 4, True), + (3, 0, False), + ("infer", 5, False), + ], +) +@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) +def test_csv_reader_blanks_and_comments( + skip_rows, header_row, skip_blanks, lineterminator +): + lines = [ + "# first comment line", + lineterminator, + "# third comment line", + "1,2,3", + "4,5,6", + "7,8,9", + lineterminator, + "# last comment line", + lineterminator, + "1,1,1", + ] + buffer = lineterminator.join(lines) + + cu_df = read_csv( + StringIO(buffer), + comment="#", + header=header_row, + skiprows=skip_rows, + skip_blank_lines=skip_blanks, + ) + pd_df = pd.read_csv( + StringIO(buffer), + comment="#", + header=header_row, + skiprows=skip_rows, + skip_blank_lines=skip_blanks, + ) + + assert cu_df.shape == pd_df.shape + assert list(cu_df.columns.values) == list(pd_df.columns.values) + + +def test_csv_reader_prefix(): + lines = ["1, 1, 1, 1"] + buffer = "\n".join(lines) + + prefix_str = "a_prefix" + df = read_csv(StringIO(buffer), header=None, prefix=prefix_str) + + column_names = list(df.columns.values) + for col in range(len(column_names)): + assert column_names[col] == prefix_str + str(col) + + +def test_csv_reader_delim_whitespace(): + buffer = "1 2 3\n4 5 6" + + # with header row + with pytest.warns(FutureWarning): + cu_df = read_csv(StringIO(buffer), delim_whitespace=True) + with expect_warning_if(PANDAS_GE_220): + pd_df = pd.read_csv(StringIO(buffer), delim_whitespace=True) + assert_eq(pd_df, cu_df) + + # without header row + with pytest.warns(FutureWarning): + cu_df = read_csv(StringIO(buffer), delim_whitespace=True, header=None) + with expect_warning_if(PANDAS_GE_220): + pd_df = pd.read_csv( + StringIO(buffer), delim_whitespace=True, header=None + ) + assert pd_df.shape == cu_df.shape + + # should raise an error if used with delimiter or sep + with pytest.raises(ValueError): + with pytest.warns(FutureWarning): + read_csv(StringIO(buffer), delim_whitespace=True, delimiter=" ") + with pytest.raises(ValueError): + with pytest.warns(FutureWarning): + read_csv(StringIO(buffer), delim_whitespace=True, sep=" ") + + +def test_csv_reader_unnamed_cols(): + # first and last columns are unnamed + buffer = ",1,2,3,\n4,5,6,7,8" + + cu_df = read_csv(StringIO(buffer)) + pd_df = pd.read_csv(StringIO(buffer)) + + assert all(pd_df.columns == cu_df.columns) + assert pd_df.shape == cu_df.shape + + +def test_csv_reader_header_quotation(): + buffer = '"1,,1","2,\n,2",3\n+4,+5,+6' + + cu_df = read_csv(StringIO(buffer)) + pd_df = pd.read_csv(StringIO(buffer)) + assert cu_df.shape == (1, 3) + assert_eq(pd_df, cu_df) + + # test cases that fail with pandas + buffer_pd_fail = '"1,one," , ",2,two" ,3\n4,5,6' + cu_df = read_csv(StringIO(buffer_pd_fail)) + assert cu_df.shape == (1, 3) + + +def test_csv_reader_oversized_byte_range(): + buffer = "a,b,c,d,e\n4,5,6,7,8" + + cu_df = read_csv(StringIO(buffer), byte_range=(0, 1024)) + pd_df = pd.read_csv(StringIO(buffer)) + + assert all(pd_df.columns == cu_df.columns) + assert pd_df.shape == cu_df.shape + + +def test_csv_reader_index_col(): + buffer = "0,1,2\n3,4,5\n6,7,8" + names = ["int1", "int2", "int3"] + + # using a column name + cu_df = read_csv(StringIO(buffer), names=names, index_col="int1") + pd_df = pd.read_csv(StringIO(buffer), names=names, index_col="int1") + assert_eq(pd_df, cu_df) + + # using a column index + 
cu_df = read_csv(StringIO(buffer), header=None, index_col=0) + pd_df = pd.read_csv(StringIO(buffer), header=None, index_col=0) + assert_eq(cu_df.index, pd_df.index) + + # using a column index with names + cu_df = read_csv(StringIO(buffer), header=None, index_col=0, names=names) + pd_df = pd.read_csv( + StringIO(buffer), header=None, index_col=0, names=names + ) + assert_eq(cu_df.index, pd_df.index) + + # passing False to avoid using a column as index (no-op in cuDF) + cu_df = read_csv(StringIO(buffer), header=None, index_col=False) + pd_df = pd.read_csv(StringIO(buffer), header=None, index_col=False) + assert_eq(cu_df.index, pd_df.index) + + +@pytest.mark.parametrize("index_name", [None, "custom name", 124]) +@pytest.mark.parametrize("index_col", [None, 0, "a"]) +def test_csv_reader_index_names(index_name, index_col): + pdf = pd.DataFrame( + {"a": [1, 2, 3], "b": [10, 11, 12]}, index=["AB", "CD", "EF"] + ) + pdf.index.name = index_name + + buffer = pdf.to_csv() + actual = cudf.read_csv(StringIO(buffer), index_col=index_col) + expected = pd.read_csv(StringIO(buffer), index_col=index_col) + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "names", [["a", "b", "c"], [416, 905, 647], range(3), None] +) +def test_csv_reader_column_names(names): + buffer = "0,1,2\n3,4,5\n6,7,8" + + df = read_csv(StringIO(buffer), names=names) + if names is None: + assert list(df) == ["0", "1", "2"] + else: + assert list(df) == list(names) + + +def test_csv_reader_repeated_column_name(): + buffer = """A,A,A.1,A,A.2,A,A.4,A,A + 1,2,3.1,4,a.2,a,a.4,a,a + 2,4,6.1,8,b.2,b,b.4,b,b""" + + # pandas and cudf to have same repeated column names + pdf = pd.read_csv(StringIO(buffer)) + gdf = cudf.read_csv(StringIO(buffer)) + assert_eq(pdf.columns, gdf.columns) + + +def test_csv_reader_bools_false_positives(): + # values that are equal to ["True", "TRUE", "False", "FALSE"] + # when using ints to detect bool values + items = [3977, 4329, 24015, 27567] + + buffer = "\n".join(str(i) for i in items) + + df = read_csv(StringIO(buffer), header=None, dtype=["int32"]) + + np.testing.assert_array_equal(items, df["0"].to_numpy()) + + +def test_csv_reader_aligned_byte_range(tmp_path): + fname = tmp_path / "tmp_csvreader_file19.csv" + nelem = 1000 + + input_df = pd.DataFrame( + {"key": np.arange(0, nelem), "zeros": np.zeros(nelem)} + ) + input_df.to_csv(fname) + + df = cudf.read_csv(str(fname), byte_range=(0, 4096)) + # read_csv call above used to crash; the assert below is not crucial + assert np.count_nonzero(df["zeros"].to_pandas().values) == 0 + + +@pytest.mark.parametrize( + "pdf_dtype, gdf_dtype", + [(None, None), ("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], +) +def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): + lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF"] + values = [int(hex_int, 16) for hex_int in lines] + + buffer = "\n".join(lines) + + if gdf_dtype is not None: + # require explicit `hex` dtype to parse hexadecimals + pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"]) + gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) + np.testing.assert_array_equal( + pdf["hex_int"], gdf["hex_int"].to_numpy() + ) + else: + # otherwise, dtype inference returns as object (string) + pdf = pd.read_csv(StringIO(buffer), names=["hex_int"]) + gdf = read_csv(StringIO(buffer), names=["hex_int"]) + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "np_dtype, gdf_dtype", + [("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], +) +def 
test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype): + # This tests values which cause an overflow warning that will become an + # error in pandas. NumPy wraps the overflow silently up to the bounds of a + # signed int64. + lines = [ + "0x0", + "-0x1000", + "0xfedcba", + "0xABCDEF", + "0xaBcDeF", + "0x9512c20b", + "0x7fffffff", + "0x7fffffffffffffff", + "-0x8000000000000000", + ] + values = [int(hex_int, 16) for hex_int in lines] + buffer = "\n".join(lines) + + gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) + + expected = np.array(values).astype(np_dtype) + actual = gdf["hex_int"].to_numpy() + np.testing.assert_array_equal(expected, actual) + + +@pytest.mark.parametrize("quoting", [0, 1, 2, 3]) +def test_csv_reader_pd_consistent_quotes(quoting): + names = ["text"] + dtypes = ["str"] + lines = ['"a"', '"b ""c"" d"', '"f!\n."'] + + buffer = "\n".join(lines) + + gd_df = read_csv( + StringIO(buffer), names=names, dtype=dtypes, quoting=quoting + ) + pd_df = pd.read_csv(StringIO(buffer), names=names, quoting=quoting) + + assert_eq(pd_df, gd_df) + + +def test_read_csv_names_header_combination(): + pdf = pd.DataFrame( + { + "firstname": ["Emma", "Ava", "Sophia"], + "lastname": ["Olivia", "Isabella", "Charlotte"], + "gender": ["F", "F", "F"], + } + ) + buffer = pdf.to_csv(header=True, index=False) + names = pdf.columns + + gdf = read_csv(StringIO(buffer), names=names, header=0) + assert_eq(pdf, gdf) + + gdf = read_csv(StringIO(buffer), header=0) + assert_eq(pdf, gdf) + + gdf = read_csv(StringIO(buffer)) + assert_eq(pdf, gdf) + + +def test_csv_reader_scientific_type_detection(): + buffer = """1.,1.1,-1.1,1E1,1e1,-1e1,-1e-1,1e-1,1.1e1,1.1e-1,-1.1e-1,-1.1e1 + +1.1,1E+1,1e+1,+1e1,+1e-1,1e-1,+1.1e1,1.1e+1,+1.1e+1,+1.1e1""" + expected = [ + 1.0, + 1.1, + -1.1, + 10.0, + 10.0, + -10, + -0.1, + 0.1, + 11, + 0.11, + -0.11, + -11, + 1.1, + 10.0, + 10.0, + 10, + 0.1, + 0.1, + 11, + 11, + 11, + 11, + ] + + df = read_csv(StringIO(buffer), header=None) + + for dt in df.dtypes: + assert dt == "float64" + for col in df: + assert np.isclose(df[col][0], expected[int(col)]) + + +@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) +def test_csv_blank_first_row(lineterminator): + lines = ["colA,colB", "", "1, 1.1", "2, 2.2"] + buffer = lineterminator.join(lines) + + cu_df = read_csv(StringIO(buffer)) + + assert cu_df.shape == (2, 2) + assert all(cu_df.columns == ["colA", "colB"]) + + +@pytest.mark.parametrize("contents", ["", "\n"]) +def test_csv_empty_file(tmp_path, contents): + fname = tmp_path / "test_csv_empty_file.csv" + with open(fname, "w") as f: + f.write(contents) + + col_names = ["col1", "col2", "col3", "col4"] + in_dtypes = ["int", "str", "float", "short"] + out_dtypes = ["int64", "object", "float64", "int16"] + + # Empty dataframe if no columns names specified or inferred + df = read_csv(str(fname)) + assert len(df.columns) == 0 + + # No row dataframe if columns names are specified or inferred + df = read_csv(str(fname), dtype=in_dtypes, names=col_names) + assert all(df.columns == col_names) + assert list(df.dtypes) == out_dtypes + + +@pytest.mark.parametrize("contents", ["", "\n"]) +def test_csv_empty_buffer(contents): + col_names = ["col1", "col2", "col3", "col4"] + in_dtypes = ["int", "str", "float", "short"] + out_dtypes = ["int64", "object", "float64", "int16"] + + # Empty dataframe if no columns names specified or inferred + df = read_csv(StringIO(contents)) + assert len(df.columns) == 0 + + # No row dataframe if columns names are specified or inferred + df = 
read_csv(StringIO(contents), dtype=in_dtypes, names=col_names) + assert all(df.columns == col_names) + assert list(df.dtypes) == out_dtypes + + +@pytest.mark.parametrize( + "dtype", [["short", "float", "int"], {"A": "short", "C": "int"}] +) +def test_csv_reader_partial_dtype(dtype): + names_df = read_csv( + StringIO("0,1,2"), + names=["A", "B", "C"], + dtype=dtype, + usecols=["A", "C"], + ) + header_df = read_csv( + StringIO('"A","B","C"\n0,1,2'), dtype=dtype, usecols=["A", "C"] + ) + + assert_eq(names_df, header_df) + assert all(names_df.dtypes == ["int16", "int64"]) + + +def test_csv_writer_file_handle(tmp_path): + df = pd.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) + gdf = cudf.from_pandas(df) + + gdf_df_fname = tmp_path / "gdf_df_1.csv" + with open(gdf_df_fname, "w") as f: + gdf.to_csv(path_or_buf=f, index=False) + assert os.path.exists(gdf_df_fname) + + gdf2 = pd.read_csv(gdf_df_fname) + assert_eq(gdf, gdf2) + + +def test_csv_writer_file_append(tmp_path): + gdf1 = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) + gdf2 = cudf.DataFrame({"a": [4, 5, 6], "b": ["foo", "bar", "baz"]}) + + gdf_df_fname = tmp_path / "gdf_df_append.csv" + with open(gdf_df_fname, "w") as f: + gdf1.to_csv(f, index=False) + with open(gdf_df_fname, "a") as f: + gdf2.to_csv(f, header=False, index=False) + + result = cudf.read_csv(gdf_df_fname) + expected = cudf.concat([gdf1, gdf2], ignore_index=True) + assert_eq(result, expected, check_index_type=True) + + +def test_csv_writer_buffer(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) + + buffer = BytesIO() + gdf.to_csv(buffer, index=False) + + result = cudf.read_csv(buffer) + assert_eq(result, gdf) + + +def test_csv_writer_numeric_data(numeric_types_as_str, tmp_path): + pdf_df_fname = tmp_path / "pdf_df_1.csv" + gdf_df_fname = tmp_path / "gdf_df_1.csv" + + df = pd.DataFrame( + {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + ).astype(numeric_types_as_str) + gdf = cudf.from_pandas(df) + df.to_csv(path_or_buf=pdf_df_fname, index=False, lineterminator="\n") + gdf.to_csv(path_or_buf=gdf_df_fname, index=False) + + assert os.path.exists(pdf_df_fname) + assert os.path.exists(gdf_df_fname) + + expect = pd.read_csv(pdf_df_fname) + got = pd.read_csv(gdf_df_fname) + assert_eq(expect, got) + + +def test_csv_writer_datetime_data(tmp_path): + pdf_df_fname = tmp_path / "pdf_df_2.csv" + gdf_df_fname = tmp_path / "gdf_df_2.csv" + + df = pd.DataFrame( + { + "col1": [ + "31/10/2010", + "05/03/2001", + "20/10/1994", + "18/10/1990", + "1/1/1970", + "2016-04-30T01:02:03.000", + "2038-01-19 03:14:07", + ], + "col2": [ + "18/04/1995", + "14 / 07 / 1994", + "07/06/2006", + "16/09/2005", + "2/2/1970", + "2007-4-30 1:6:40.000PM", + "2038-01-19 03:14:08", + ], + "col3": [ + "1 Jan", + "2 January 1994", + "Feb 2002", + "31-01-2000", + "1-1-1996", + "15-May-2009", + "21-Dec-3262", + ], + } + ) + gdf = cudf.from_pandas(df) + df.to_csv(path_or_buf=pdf_df_fname, index=False, lineterminator="\n") + gdf.to_csv(path_or_buf=gdf_df_fname, index=False) + + assert os.path.exists(pdf_df_fname) + assert os.path.exists(gdf_df_fname) + + expect = pd.read_csv(pdf_df_fname) + got = pd.read_csv(gdf_df_fname) + assert_eq(expect, got) + + +@pytest.mark.parametrize("lineterminator", ["\r", "\n", "\t", np.str_("\n")]) +@pytest.mark.parametrize("sep", [",", "/", np.str_(",")]) +def test_csv_writer_terminator_sep(lineterminator, sep, cudf_mixed_dataframe): + df = cudf_mixed_dataframe + + buffer = BytesIO() + df.to_csv(buffer, lineterminator=lineterminator, sep=sep, 
index=False) + + got = read_csv(buffer, lineterminator=lineterminator, sep=sep) + assert_eq(df, got) + + +@pytest.mark.parametrize( + "lineterminator", ["\r\n", "ABC", "\t\t", np.str_("\r\n")] +) +def test_csv_writer_multichar_terminator(lineterminator, cudf_mixed_dataframe): + df = cudf_mixed_dataframe + + default_terminator_csv = StringIO() + df.to_csv(default_terminator_csv) + + # Need to check manually since readers don't support + # multicharacter line terminators + expected = default_terminator_csv.getvalue().replace("\n", lineterminator) + + buffer = StringIO() + df.to_csv(buffer, lineterminator=lineterminator) + got = buffer.getvalue() + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "columns", + [ + ["Date", "Float"], + ["Integer2", "Float", "Date", "Integer", "String", "Boolean"], + None, + ], +) +@pytest.mark.parametrize("header", [True, False]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("bool_box", [bool, np.bool_]) +def test_csv_writer_column_and_header_options( + columns, header, index, bool_box, pd_mixed_dataframe +): + header = bool_box(header) + index = bool_box(index) + pdf = pd_mixed_dataframe + df = cudf.from_pandas(pdf) + + cudf_buffer = BytesIO() + df.to_csv(cudf_buffer, columns=columns, header=header, index=index) + pd_buffer = BytesIO() + pdf.to_csv(pd_buffer, columns=columns, header=header, index=index) + + expected = cudf.read_csv(pd_buffer, header=0 if header else None) + got = cudf.read_csv(cudf_buffer, header=0 if header else None) + + expected_column_cnt = (1 if index else 0) + ( + len(columns) if columns else pdf.shape[1] + ) + assert_eq(expected_column_cnt, got.shape[1]) + assert_eq(expected, got) + + +def test_csv_writer_empty_columns_parameter(cudf_mixed_dataframe): + write_str = cudf_mixed_dataframe.to_csv(columns=[], index=False) + assert_eq(write_str, "\n") + + +def test_csv_writer_multiindex(tmp_path): + pdf_df_fname = tmp_path / "pdf_df_3.csv" + gdf_df_fname = tmp_path / "gdf_df_3.csv" + + rng = np.random.default_rng(seed=0) + gdf = cudf.DataFrame( + { + "a": rng.integers(0, 5, 20), + "b": rng.integers(0, 5, 20), + "c": range(20), + "d": rng.random(20), + } + ) + gdg = gdf.groupby(["a", "b"]).mean() + pdg = gdg.to_pandas() + pdg.to_csv(pdf_df_fname) + gdg.to_csv(gdf_df_fname) + + assert os.path.exists(pdf_df_fname) + assert os.path.exists(gdf_df_fname) + + expect = pd.read_csv(pdf_df_fname) + got = pd.read_csv(gdf_df_fname) + assert_eq(expect, got) + + +@pytest.mark.parametrize("chunksize", [None, 2, 1000]) +def test_csv_writer_chunksize(chunksize, numeric_types_as_str): + cu_df = cudf.from_pandas( + pd.DataFrame( + {"col1": [1, 2, 3], "col2": [4, 5, 6]}, + ).astype(numeric_types_as_str) + ) + + buffer = BytesIO() + cu_df.to_csv(buffer, chunksize=chunksize, index=False) + + got = cudf.read_csv(buffer, dtype=numeric_types_as_str) + assert_eq(cu_df, got) + + +@pytest.mark.parametrize( + "data", + [ + {"vals": [1, 2, 3]}, + {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]}, + {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}, + ], +) +def test_to_csv_empty_filename(data): + df = cudf.DataFrame(data) + pdf = df.to_pandas() + + actual = df.to_csv() + expected = pdf.to_csv() + + assert actual == expected + + +@pytest.mark.parametrize( + "data", + [ + {"vals": [1, 2, 3]}, + {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]}, + {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}, + ], +) +def test_to_csv_StringIO(data): + df = cudf.DataFrame(data) + cudf_io = StringIO() + 
pandas_io = StringIO() + + pdf = df.to_pandas() + + df.to_csv(cudf_io) + pdf.to_csv(pandas_io) + + cudf_io.seek(0) + pandas_io.seek(0) + + assert cudf_io.read() == pandas_io.read() + + +def test_csv_writer_empty_dataframe(tmp_path): + df_fname = tmp_path / "gdf_df_5.csv" + gdf = cudf.DataFrame({"float_point": [], "integer": []}) + gdf["float_point"] = gdf["float_point"].astype("float") + gdf["integer"] = gdf["integer"].astype("int") + + gdf.to_csv(df_fname, index=False) + + df = cudf.read_csv(df_fname) + + assert df.shape == (0, 2) + assert all(df.dtypes == ["object", "object"]) + + +def test_csv_write_chunksize_corner_case(tmp_path): + # With this num of rows and chunksize + # libcudf splits table such a way that it + # will end up creating an empty table slice + # which caused the issue 5588. + df_fname = tmp_path / "gdf_df_17.csv" + df = cudf.DataFrame({"a": np.arange(10_000)}) + df.to_csv(df_fname, chunksize=1000, index=False) + got = cudf.read_csv(df_fname) + + assert_eq(df, got) + + +def test_csv_write_no_caller_manipulation(): + df = cudf.DataFrame({"a": [1, 2, 3]}) + df_copy = df.copy(deep=True) + _ = df.to_csv(index=True) + assert_eq(df, df_copy) + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame({"a": [1, 2, 3], "": [10, 20, 40]}), + pd.DataFrame({"": [10, 20, 40], "a": [1, 2, 3]}), + pd.DataFrame( + {"a": [1, 2, 3], "": [10, 20, 40]}, + index=pd.Index(["a", "z", "v"], name="custom name"), + ), + ], +) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("columns", [["a"], [""], None]) +def test_csv_write_empty_column_name(pdf, index, columns): + df = cudf.DataFrame.from_pandas(pdf) + expected = pdf.to_csv(index=index, columns=columns) + actual = df.to_csv(index=index, columns=columns) + + assert expected == actual + + +@pytest.mark.parametrize("idx", [None, pd.Index([], name="index name")]) +@pytest.mark.parametrize("index", [True, False]) +def test_csv_write_empty_dataframe(idx, index): + df = cudf.DataFrame(index=idx) + pdf = df.to_pandas() + + expected = pdf.to_csv(index=index) + actual = df.to_csv(index=index) + + assert expected == actual + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame( + { + "a": [1, 2, 3, None], + "": ["a", "v", None, None], + None: [12, 12, 32, 44], + } + ), + pd.DataFrame( + { + np.nan: [1, 2, 3, None], + "": ["a", "v", None, None], + None: [12, 12, 32, 44], + } + ), + pd.DataFrame({"": [1, None, 3, 4]}), + pd.DataFrame({None: [1, None, 3, 4]}), + pd.DataFrame(columns=[None, "", "a", "b"]), + pd.DataFrame(columns=[None]), + pd.DataFrame(columns=[""]), + ], +) +@pytest.mark.parametrize( + "na_rep", ["", "_NA_", "---", "_____CUSTOM_NA_REP______"] +) +def test_csv_write_dataframe_na_rep(df, na_rep): + gdf = cudf.from_pandas(df) + + expected = df.to_csv(na_rep=na_rep) + actual = gdf.to_csv(na_rep=na_rep) + + assert expected == actual + + +@pytest.mark.parametrize( + "dtype", + [ + "int", + "str", + "float", + np.int32, + np.dtype("float32"), + {"a": "int32", "b": "float64", "c": "uint8"}, + int, + str, + object, + ], +) +def test_csv_reader_dtypes(dtype): + buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n4,13,114\n" + + expected = pd.read_csv(StringIO(buf), dtype=dtype) + actual = cudf.read_csv(StringIO(buf), dtype=dtype) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "dtype", ["Int64", "UInt32", {"a": "UInt64", "b": "Float64", "c": "Int32"}] +) +def test_csv_reader_nullable_dtypes(dtype): + buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n4,13,114\n" + + expected = pd.read_csv(StringIO(buf), dtype=dtype) + 
actual = cudf.read_csv(StringIO(buf), dtype=dtype) + + assert_eq(expected, actual.to_pandas(nullable=True)) + + +def test_csv_reader_temporal_dtypes(temporal_types_as_str): + buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n43432423,13342,13243214\n" + + expected = pd.read_csv(StringIO(buf)).astype(temporal_types_as_str) + actual = cudf.read_csv(StringIO(buf), dtype=temporal_types_as_str) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "df", + [ + lambda: cudf.DataFrame( + { + "a": cudf.Series([1, 2, 3, 1, 2], dtype="category"), + "b": cudf.Series(["a", "c", "a", "b", "a"], dtype="category"), + } + ), + lambda: cudf.DataFrame( + { + "a": cudf.Series([1.1, 2, 3, 1.1, 2], dtype="category"), + "b": cudf.Series( + [None, "c", None, "b", "a"], dtype="category" + ), + } + ), + lambda: cudf.DataFrame( + { + "b": cudf.Series( + [1.1, 2, 3, 1.1, 2], + dtype="category", + index=cudf.CategoricalIndex( + ["abc", "def", "ghi", "jkl", "xyz"] + ), + ) + } + ), + ], +) +def test_csv_writer_category(df): + df = df() + pdf = df.to_pandas() + + expected = pdf.to_csv() + actual = df.to_csv() + + assert expected == actual + + +@pytest.mark.parametrize( + "dtype", + [ + "category", + {"a": "category", "b": "str"}, + {"b": "category"}, + {"a": "category"}, + {"a": pd.CategoricalDtype([1, 2])}, + {"b": pd.CategoricalDtype([1, 2, 3])}, + {"b": pd.CategoricalDtype(["b", "a"]), "a": "str"}, + pd.CategoricalDtype(["a", "b"]), + ], +) +def test_csv_reader_category(dtype): + df = cudf.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", None, "c"]}) + csv_buf = df.to_csv() + + actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype) + expected = pd.read_csv(StringIO(csv_buf), dtype=dtype) + + assert_eq(expected, actual, check_dtype=True) + + +def test_csv_writer_datetime_sep(): + df = cudf.DataFrame( + {"a": cudf.Series([22343, 2323423, 234324234], dtype="datetime64[ns]")} + ) + df["a"] = df["a"].astype("datetime64[s]") + expected = df.to_pandas().to_csv(date_format="%Y-%m-%dT%H:%M:%SZ", sep="-") + actual = df.to_csv(sep="-") + assert expected == actual + + +def test_na_filter_empty_fields(): + test_na = "TEST_NAN" + df = pd.DataFrame({"col0": ["valid", None, "also_valid", "", test_na]}) + buffer = df.to_csv(index=False) + + pdf = pd.read_csv(StringIO(buffer), na_filter=False) + gdf = cudf.read_csv(StringIO(buffer), na_filter=False) + assert_eq(pdf, gdf) + + pdf = pd.read_csv(StringIO(buffer), keep_default_na=False) + gdf = cudf.read_csv(StringIO(buffer), keep_default_na=False) + assert_eq(pdf, gdf) + + pdf = pd.read_csv( + StringIO(buffer), keep_default_na=False, na_values=test_na + ) + gdf = cudf.read_csv( + StringIO(buffer), keep_default_na=False, na_values=test_na + ) + assert_eq(pdf, gdf) + + +def test_csv_sep_error(): + pdf = pd.DataFrame({"a": [1, 2, 3]}) + gdf = cudf.DataFrame({"a": [1, 2, 3]}) + assert_exceptions_equal( + lfunc=pdf.to_csv, + rfunc=gdf.to_csv, + lfunc_args_and_kwargs=([], {"sep": "abc"}), + rfunc_args_and_kwargs=([], {"sep": "abc"}), + ) + + assert_exceptions_equal( + lfunc=pdf.to_csv, + rfunc=gdf.to_csv, + lfunc_args_and_kwargs=([], {"sep": 1}), + rfunc_args_and_kwargs=([], {"sep": 1}), + ) + + +def test_to_csv_encoding_error(): + # TODO: Remove this test once following + # issue is fixed: https://github.com/rapidsai/cudf/issues/2957 + df = cudf.DataFrame({"a": ["你好", "test"]}) + encoding = "utf-8-sig" + error_message = ( + f"Encoding {encoding} is not supported. " + + "Currently, only utf-8 encoding is supported." 
+ ) + with pytest.raises(NotImplementedError, match=re.escape(error_message)): + df.to_csv("test.csv", encoding=encoding) + + +def test_to_csv_compression_error(): + df = cudf.DataFrame({"a": ["test"]}) + compression = "snappy" + error_message = "Writing compressed csv is not currently supported in cudf" + with pytest.raises(NotImplementedError, match=re.escape(error_message)): + df.to_csv("test.csv", compression=compression) + + +def test_empty_df_no_index(): + actual = cudf.DataFrame({}) + buffer = BytesIO() + actual.to_csv(buffer, index=False) + + result = cudf.read_csv(buffer) + + assert_eq(actual, result) + + +def test_default_integer_bitwidth( + cudf_mixed_dataframe, default_integer_bitwidth +): + # Test that integer columns in csv are _inferred_ as user specified + # bitwidth + buf = BytesIO() + cudf_mixed_dataframe.to_csv(buf) + buf.seek(0) + read = cudf.read_csv(buf) + assert read["Integer"].dtype == np.dtype( + f"i{default_integer_bitwidth // 8}" + ) + assert read["Integer2"].dtype == np.dtype( + f"i{default_integer_bitwidth // 8}" + ) + + +def test_default_integer_bitwidth_partial( + cudf_mixed_dataframe, default_integer_bitwidth +): + # Test that integer columns in csv are _inferred_ as user specified + # bitwidth + buf = BytesIO() + cudf_mixed_dataframe.to_csv(buf) + buf.seek(0) + read = cudf.read_csv(buf, dtype={"Integer": "int64"}) + assert read["Integer"].dtype == np.dtype("i8") + assert read["Integer2"].dtype == np.dtype( + f"i{default_integer_bitwidth // 8}" + ) + + +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") +def test_default_integer_bitwidth_extremes( + numeric_extremes_dataframe, default_integer_bitwidth +): + # Test that integer columns in csv are _inferred_ as user specified + # bitwidth + buf = BytesIO() + cudf.DataFrame.from_pandas(numeric_extremes_dataframe).to_csv(buf) + buf.seek(0) + read = cudf.read_csv(buf) + + assert read["int64"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") + assert read["long"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") + assert read["uint64"].dtype == np.dtype( + f"u{default_integer_bitwidth // 8}" + ) + + +def test_default_float_bitwidth(cudf_mixed_dataframe, default_float_bitwidth): + # Test that float columns in csv are _inferred_ as user specified + # bitwidth + buf = BytesIO() + cudf_mixed_dataframe.to_csv(buf) + buf.seek(0) + read = cudf.read_csv(buf) + assert read["Float"].dtype == np.dtype(f"f{default_float_bitwidth // 8}") + + +def test_default_float_bitwidth_partial(default_float_bitwidth): + # Test that float columns in csv are _inferred_ as user specified + # bitwidth + read = cudf.read_csv( + StringIO("float1,float2\n1.0,2.0\n3.0,4.0"), + dtype={"float2": "float64"}, + ) + assert read["float1"].dtype == np.dtype(f"f{default_float_bitwidth // 8}") + assert read["float2"].dtype == np.dtype("f8") + + +@pytest.mark.parametrize( + "usecols,names", + [ + # selection using indices; only names of selected columns are specified + ([1, 2], ["b", "c"]), + # selection using indices; names of all columns are specified + ([1, 2], ["a", "b", "c"]), + # selection using indices; duplicates + ([2, 2], ["a", "b", "c"]), + # selection using indices; out of order + ([2, 1], ["a", "b", "c"]), + # selection using names + (["b"], ["a", "b", "c"]), + # selection using names; multiple columns + (["b", "c"], ["a", "b", "c"]), + # selection using names; duplicates + (["c", "c"], ["a", "b", "c"]), + # selection using names; out of order + (["c", "b"], ["a", "b", "c"]), + ], +) +def 
test_column_selection_plus_column_names(usecols, names): + lines = [ + "num,datetime,text", + "123,2018-11-13T12:00:00,abc", + "456,2018-11-14T12:35:01,def", + "789,2018-11-15T18:02:59,ghi", + ] + + buffer = "\n".join(lines) + "\n" + + assert_eq( + pd.read_csv(StringIO(buffer), usecols=usecols, names=names), + cudf.read_csv(StringIO(buffer), usecols=usecols, names=names), + ) + + +def test_read_compressed_BOM(tmp_path): + buffer = 'int, string\n1, "a"\n2, "b"\n3, "c"\n' + + fname = tmp_path / "tmp_csvreader_file20.gz" + with gzip.open(fname, "wt", encoding="utf-8") as f: + f.write(codecs.BOM_UTF8.decode("utf-8")) + f.write(buffer) + + assert_eq(pd.read_csv(fname), cudf.read_csv(fname)) + + +def test_read_header_none_pandas_compat_column_type(): + data = "1\n2\n" + with cudf.option_context("mode.pandas_compatible", True): + result = cudf.read_csv(StringIO(data), header=None).columns + expected = pd.read_csv(StringIO(data), header=None).columns + pd.testing.assert_index_equal(result, expected, exact=True) + + +@pytest.mark.parametrize("buffer", ["1", '"one"']) +def test_read_single_unterminated_row(buffer): + gdf = cudf.read_csv(StringIO(buffer), header=None) + assert_eq(gdf.shape, (1, 1)) + + +@pytest.mark.parametrize("buffer", ["\n", "\r\n"]) +def test_read_empty_only_row(buffer): + gdf = cudf.read_csv(StringIO(buffer), header=None) + assert_eq(gdf.shape, (0, 0)) + + +def test_read_empty_only_row_custom_terminator(): + gdf = cudf.read_csv(StringIO("*"), header=None, lineterminator="*") + assert_eq(gdf.shape, (0, 0)) + + +def test_empty_file_pandas_compat_raises(tmp_path): + empty_file = tmp_path / "empty.csv" + empty_file.touch() + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(pd.errors.EmptyDataError): + cudf.read_csv(StringIO()) + with pytest.raises(pd.errors.EmptyDataError): + cudf.read_csv(empty_file) + with pytest.raises(pd.errors.EmptyDataError): + cudf.read_csv(str(empty_file)) diff --git a/python/cudf/cudf/tests/input_output/test_feather.py b/python/cudf/cudf/tests/input_output/test_feather.py index 06777c8e6af..e32edcd4bc5 100644 --- a/python/cudf/cudf/tests/input_output/test_feather.py +++ b/python/cudf/cudf/tests/input_output/test_feather.py @@ -1 +1,65 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
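+# Round-trip tests for cudf.read_feather / DataFrame.to_feather, validated
+# against pyarrow's feather reader.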
+
+from string import ascii_letters
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES
+
+
+@pytest.fixture(params=[0, 10])
+def pdf(request):
+    rng = np.random.default_rng(seed=0)
+    types = [*NUMERIC_TYPES, "bool"]
+    nrows = request.param
+
+    # Create a pandas dataframe with random data of mixed types
+    test_pdf = pd.DataFrame(
+        {
+            f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ)
+            for typ in types
+        }
+    )
+    # Create non-numeric categorical data; otherwise it may get typecast
+    data = [ascii_letters[rng.integers(0, 52)] for _ in range(nrows)]
+    test_pdf["col_category"] = pd.Series(data, dtype="category")
+    return test_pdf
+
+
+@pytest.mark.filterwarnings("ignore:Using CPU")
+@pytest.mark.filterwarnings("ignore:Strings are not yet supported")
+@pytest.mark.parametrize(
+    "columns",
+    [["col_int8"], ["col_category"], ["col_int32", "col_float32"], None],
+)
+def test_feather_reader(pdf, columns, tmp_path):
+    feather_file = tmp_path / "test.feather"
+    pdf.to_feather(feather_file)
+    expect = pa.feather.read_table(feather_file, columns=columns).to_pandas()
+    got = (
+        cudf.read_feather(feather_file, columns=columns)
+        .to_arrow(preserve_index=False)
+        .to_pandas()
+    )
+
+    assert_eq(expect, got, check_categorical=False)
+
+
+@pytest.mark.filterwarnings("ignore:Using CPU")
+def test_feather_writer(tmp_path, pdf):
+    gdf = cudf.DataFrame.from_pandas(pdf)
+    pdf_fname = tmp_path / "pdf.feather"
+    gdf_fname = tmp_path / "gdf.feather"
+
+    pdf.to_feather(pdf_fname)
+    gdf.to_feather(gdf_fname)
+
+    expect = pa.feather.read_table(pdf_fname)
+    got = pa.feather.read_table(gdf_fname)
+
+    assert pa.Table.equals(expect, got)
diff --git a/python/cudf/cudf/tests/input_output/test_json.py b/python/cudf/cudf/tests/input_output/test_json.py
index 06777c8e6af..14f6d6d43b1 100644
--- a/python/cudf/cudf/tests/input_output/test_json.py
+++ b/python/cudf/cudf/tests/input_output/test_json.py
@@ -1 +1,1446 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
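+# Tests for cudf.read_json / DataFrame.to_json, validated against pandas
+# and pyarrow.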
+
+import copy
+import gzip
+import os
+from io import BytesIO, StringIO
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pytest
+
+import cudf
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.testing import assert_eq
+from cudf.testing._utils import (
+    DATETIME_TYPES,
+    NUMERIC_TYPES,
+    TIMEDELTA_TYPES,
+    expect_warning_if,
+)
+
+
+@pytest.fixture(params=["auto", "cudf", "pandas"])
+def engine(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def lines(request):
+    return request.param
+
+
+@pytest.fixture(params=[0, 10])
+def pdf(request):
+    rng = np.random.default_rng(seed=0)
+    types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"]
+    nrows = request.param
+
+    # Create a pandas dataframe with random data of mixed types
+    test_pdf = pd.DataFrame(
+        {
+            f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ)
+            for typ in types
+        }
+    )
+    test_pdf.index.name = "test_index"
+    return test_pdf
+
+
+@pytest.fixture
+def gdf(pdf):
+    return cudf.DataFrame.from_pandas(pdf)
+
+
+@pytest.fixture(params=[0, 10])
+def gdf_writer_types(request):
+    # datetime64[us], datetime64[ns] are unsupported due to a bug in the parser
+    types = [
+        *NUMERIC_TYPES,
+        "datetime64[s]",
+        "datetime64[ms]",
+        *TIMEDELTA_TYPES,
+        "bool",
+        "str",
+    ]
+    typer = {"col_" + val: val for val in types}
+    ncols = len(types)
+    nrows = request.param
+
+    # Create a cudf dataframe with random data of mixed types
+    test_pdf = cudf.DataFrame(
+        np.ones((nrows, ncols)),
+        columns=pd.Index(typer.keys()),
+    )
+
+    # Cast the columns to their intended dtypes
+    test_pdf = test_pdf.astype(typer)
+
+    return test_pdf
+
+
+@pytest.mark.filterwarnings("ignore:Strings are not yet supported")
+@pytest.mark.filterwarnings("ignore:Using CPU")
+@pytest.mark.parametrize("index", [True, False])
+# tests limited to compression formats supported by both pandas and cudf: bz2, gzip, zip, zstd
+@pytest.mark.parametrize("compression", ["bz2", "gzip", "zip", "zstd", None])
+@pytest.mark.parametrize("orient", ["columns", "records", "table", "split"])
+def test_json_reader(index, compression, orient, pdf, tmp_path):
+    skip_reason = f"{index=} is not valid with {orient=}"
+    if index is False and orient != "split":
+        pytest.skip(skip_reason)
+    if index is True and orient not in ("split", "table", "index", "columns"):
+        pytest.skip(skip_reason)
+    path_df = tmp_path / "test_df.json"
+    path_series = tmp_path / "test_series.json"
+    pdf.to_json(path_df, index=index, compression=compression, orient=orient)
+    pdf["col_int32"].to_json(
+        path_series, index=index, compression=compression, orient=orient
+    )
+    expect_df = pd.read_json(path_df, orient=orient, compression=compression)
+    got_df = cudf.read_json(path_df, orient=orient, compression=compression)
+    if len(expect_df) == 0:
+        expect_df = expect_df.reset_index(drop=True)
+        expect_df.columns = expect_df.columns.astype("object")
+    if len(got_df) == 0:
+        got_df = got_df.reset_index(drop=True)
+
+    assert_eq(expect_df, got_df, check_categorical=False)
+
+    # Only these orients are allowed for Series, but this isn't enforced by pandas
+    if orient in ("split", "records", "index"):
+        expect_series = pd.read_json(
+            path_series, orient=orient, compression=compression, typ="series"
+        )
+        got_series = cudf.read_json(
+            path_series, orient=orient, compression=compression, typ="series"
+        )
+        if len(expect_series) == 0:
+            expect_series = expect_series.reset_index(drop=True)
+        if 
len(got_series) == 0:
+            got_series = got_series.reset_index(drop=True)
+
+        assert_eq(expect_series, got_series)
+
+
+@pytest.mark.filterwarnings("ignore:Can't infer compression")
+@pytest.mark.filterwarnings("ignore:Using CPU")
+def test_json_writer(tmp_path, pdf, gdf):
+    pdf_df_fname = tmp_path / "pdf_df.json"
+    gdf_df_fname = tmp_path / "gdf_df.json"
+
+    pdf.to_json(pdf_df_fname)
+    gdf.to_json(gdf_df_fname)
+
+    assert os.path.exists(pdf_df_fname)
+    assert os.path.exists(gdf_df_fname)
+
+    expect_df = pd.read_json(pdf_df_fname)
+    got_df = pd.read_json(gdf_df_fname)
+
+    assert_eq(expect_df, got_df)
+
+    for column in pdf.columns:
+        pdf_series_fname = tmp_path / f"{column}_pdf_series.json"
+        gdf_series_fname = tmp_path / f"{column}_gdf_series.json"
+
+        pdf[column].to_json(pdf_series_fname)
+        gdf[column].to_json(gdf_series_fname)
+
+        assert os.path.exists(pdf_series_fname)
+        assert os.path.exists(gdf_series_fname)
+
+        expect_series = pd.read_json(pdf_series_fname, typ="series")
+        got_series = pd.read_json(gdf_series_fname, typ="series")
+
+        assert_eq(expect_series, got_series)
+
+        # Make sure results align for regular strings, not just files
+        pdf_string = pdf[column].to_json()
+        gdf_string = gdf[column].to_json()
+        assert_eq(pdf_string, gdf_string)
+
+
+def test_cudf_json_writer(pdf, lines):
+    # remove datetime columns because pandas doesn't support them here
+    for col_name in pdf.columns:
+        if "datetime" in col_name:
+            pdf.drop(col_name, axis=1, inplace=True)
+    gdf = cudf.DataFrame.from_pandas(pdf)
+    pdf_string = pdf.to_json(orient="records", lines=lines)
+    gdf_string = gdf.to_json(orient="records", lines=lines, engine="cudf")
+
+    assert_eq(pdf_string, gdf_string)
+
+    gdf_string = gdf.to_json(
+        orient="records", lines=lines, engine="cudf", rows_per_chunk=8
+    )
+
+    assert_eq(pdf_string, gdf_string)
+
+
+def test_cudf_json_writer_read(gdf_writer_types):
+    dtypes = {
+        col_name: col_name[len("col_") :]
+        for col_name in gdf_writer_types.columns
+    }
+    gdf_string = gdf_writer_types.to_json(
+        orient="records", lines=True, engine="cudf"
+    )
+    gdf2 = cudf.read_json(
+        StringIO(gdf_string),
+        lines=True,
+        engine="cudf",
+        dtype=dict(dtypes),
+    )
+    pdf2 = pd.read_json(StringIO(gdf_string), lines=True, dtype=dict(dtypes))
+
+    # Bug in pandas https://github.com/pandas-dev/pandas/issues/28558
+    if pdf2.empty:
+        pdf2.reset_index(drop=True, inplace=True)
+
+    # pandas moved to a consistent datetime parsing format:
+    # https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#datetimes-are-now-parsed-with-a-consistent-format
+    for unit in ["s", "ms"]:
+        if f"col_datetime64[{unit}]" in pdf2.columns:
+            pdf2[f"col_datetime64[{unit}]"] = (
+                pd.to_datetime(pdf2[f"col_datetime64[{unit}]"], format="mixed")
+                .dt.tz_localize(None)
+                .astype(f"datetime64[{unit}]")
+            )
+    assert_eq(pdf2, gdf2)
+
+
+@pytest.mark.parametrize(
+    "jsonl_string, expected",
+    [
+        # fixed width
+        ("""{"a":10, "b":1.1}\n {"a":20, "b":2.1}\n""", None),
+        # simple list
+        ("""{"a":[1, 2, 3], "b":1.1}\n {"a":[]}\n""", None),
+        # simple struct
+        ("""{"a":{"c": 123 }, "b":1.1}\n {"a": {"c": 456}}\n""", None),
+        # list of lists
+        ("""{"a":[[], [1, 2], [3, 4]], "b":1.1}\n""", None),
+        ("""{"a":[null, [1, 2], [null, 4]], "b":1.1}\n""", None),
+        # list of structs
+        # error ("""{"a":[null, {}], "b":1.1}\n""", None),
+        (
+            """{"a":[null, {"L": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""",
+            None,
+        ),
+        (
+            """{"a":[{"L": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""",
+            None,
+        ),
+        # struct of lists
+        (
+            """{"a":{"L": [1, 2, 3]}, "b":1.1}\n {"a": {"L": [4, 
5, 6]}}\n""", + None, + ), + ("""{"a":{"L": [1, 2, null]}, "b":1.1}\n {"a": {"L": []}}\n""", None), + # struct of structs + ( + """{"a":{"L": {"M": 123}}, "b":1.1} + {"a": {"L": {"M": 456}}}\n""", + None, + ), + ( + """{"a":{"L": {"M": null}}, "b":1.1}\n {"a": {"L": {}}}\n""", + """{"a":{"L": {}}, "b":1.1}\n {"a": {"L": {}}}\n""", + ), + # list of structs of lists + ("""{"a":[{"L": [1, 2, 3]}, {"L": [4, 5, 6]}], "b":1.1}\n""", None), + ("""{"a":[{"L": [1, 2, null]}, {"L": []}], "b":1.1}\n""", None), + # struct of lists of structs + ("""{"a":{"L": [{"M": 123}, {"M": 456}]}, "b":1.1}\n""", None), + ( + """{"a":{"L": [{"M": null}, {}]}, "b":1.1}\n""", + """{"a":{"L": [{}, {}]}, "b":1.1}\n""", + ), + # empty structs + ("""{"A": null}\n {"A": {}}\n {}""", """{}\n{"A":{}}\n{}\n"""), + ( + """{"A": {"B": null}}\n {"A": {"B": {}}}\n {"A": {}}""", + """{"A":{}}\n{"A":{"B":{}}}\n{"A":{}}\n""", + ), + ], +) +def test_cudf_json_roundtrip(jsonl_string, expected): + gdf = cudf.read_json( + StringIO(jsonl_string), + lines=True, + engine="cudf", + # dtype=dict(dtypes), + ) + expected = jsonl_string if expected is None else expected + gdf_string = gdf.to_json( + orient="records", lines=True, engine="cudf", include_nulls=False + ) + assert_eq(gdf_string, expected.replace(" ", "")) + + +@pytest.mark.parametrize("sink", ["string", "file"]) +def test_cudf_json_writer_sinks(sink, tmp_path): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + target = None + if sink == "string": + target = StringIO() + elif sink == "file": + target = tmp_path / "test_df.json" + df.to_json(target, engine="cudf") + if sink == "string": + assert ( + target.getvalue() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' + ) + elif sink == "file": + assert os.path.exists(target) + with open(target, "r") as f: + assert f.read() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' + + +@pytest.fixture( + params=["string", "filepath", "pathobj", "bytes_io", "string_io", "url"] +) +def json_input(request, tmp_path): + input_type = request.param + buffer = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" + json_dir = tmp_path / "json" + json_dir.mkdir() + fname = json_dir / "test_df.json" + if not os.path.isfile(fname): + with open(str(fname), "w") as fp: + fp.write(buffer) + + if input_type == "string": + return buffer + if input_type == "filepath": + return str(fname) + if input_type == "pathobj": + return Path(fname) + if input_type == "bytes_io": + return BytesIO(buffer.encode()) + if input_type == "string_io": + return StringIO(buffer) + if input_type == "url": + return Path(fname).as_uri() + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_json_lines_basic(json_input, engine): + can_warn = isinstance(json_input, str) and not json_input.endswith(".json") + with expect_warning_if(can_warn): + cu_df = cudf.read_json(json_input, engine=engine, lines=True) + # io types must seek to the beginning before you can read again + if hasattr(json_input, "seek"): + json_input.seek(0) + with expect_warning_if(can_warn): + pd_df = pd.read_json(json_input, lines=True) + + assert all(cu_df.dtypes == ["int64", "int64", "int64"]) + for cu_col, pd_col in zip(cu_df.columns, pd_df.columns, strict=True): + assert str(cu_col) == str(pd_col) + np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) + + +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_nonexistent_json_correct_error(engine): + json_input = 
"doesnotexist.json" + with pytest.raises(FileNotFoundError): + cudf.read_json(json_input, engine=engine) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_json_lines_multiple(tmp_path, json_input, engine): + if engine == "pandas": + pytest.skip("pandas engine does not support multiple files") + tmp_file1 = tmp_path / "MultiInputs1.json" + tmp_file2 = tmp_path / "MultiInputs2.json" + + with expect_warning_if( + isinstance(json_input, str) and not json_input.endswith(".json") + ): + pdf = pd.read_json(json_input, lines=True) + pdf.to_json(tmp_file1, compression="infer", lines=True, orient="records") + pdf.to_json(tmp_file2, compression="infer", lines=True, orient="records") + + cu_df = cudf.read_json([tmp_file1, tmp_file2], engine=engine, lines=True) + pd_df = pd.concat([pdf, pdf]) + + assert all(cu_df.dtypes == ["int64", "int64", "int64"]) + for cu_col, pd_col in zip(cu_df.columns, pd_df.columns, strict=True): + assert str(cu_col) == str(pd_col) + np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +def test_json_read_directory(tmp_path, json_input, engine): + if engine == "pandas": + pytest.skip("pandas engine does not support directories") + with expect_warning_if( + isinstance(json_input, str) and not json_input.endswith(".json") + ): + pdf = pd.read_json(json_input, lines=True) + json_dir = tmp_path / "jsons" + json_dir.mkdir() + pdf.to_json( + json_dir / "MultiInputs1.json", + compression="infer", + lines=True, + orient="records", + ) + pdf.to_json( + json_dir / "MultiInputs2.json", + compression="infer", + lines=True, + orient="records", + ) + pdf.to_json( + json_dir / "MultiInputs3.json", + compression="infer", + lines=True, + orient="records", + ) + + cu_df = cudf.read_json(json_dir, engine=engine, lines=True) + pd_df = pd.concat([pdf, pdf, pdf]) + + assert all(cu_df.dtypes == ["int64", "int64", "int64"]) + for cu_col, pd_col in zip(cu_df.columns, pd_df.columns, strict=True): + assert str(cu_col) == str(pd_col) + np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) + + +def test_json_lines_byte_range(json_input): + # include the first row and half of the second row + # should parse the first two rows + will_warn = isinstance(json_input, str) and not json_input.endswith( + ".json" + ) + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(0, 15) + ) + assert df.shape == (2, 3) + + # include half of the second row and half of the third row + # should parse only the third row + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(15, 10) + ) + assert df.shape == (1, 3) + + # include half of the second row and entire third row + # should parse only the third row + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(15, 0) + ) + assert df.shape == (1, 3) + + # include half of the second row till past the end of the file + # should parse only the third row + with expect_warning_if(will_warn): + df = cudf.read_json( + copy.deepcopy(json_input), lines=True, byte_range=(10, 50) + ) + assert df.shape == (1, 3) + + +def test_json_lines_dtypes(json_input): + with expect_warning_if( + 
isinstance(json_input, str) and not json_input.endswith(".json") + ): + df = cudf.read_json( + json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"} + ) + assert all(df.dtypes == ["float64", "int64", "int16"]) + + +@pytest.mark.parametrize( + "ext, out_comp, in_comp", + [ + (".geez", "gzip", "gzip"), + (".beez", "bz2", "bz2"), + (".gz", "gzip", "infer"), + (".bz2", "bz2", "infer"), + (".data", None, "infer"), + (".txt", None, None), + ("", None, None), + ], +) +def test_json_lines_compression(tmp_path, ext, out_comp, in_comp): + fname = tmp_path / f"tmp_json_compression.{ext}" + + nrows = 20 + pd_df = pd.DataFrame( + { + "col1": np.arange(nrows, dtype=np.int32), + "col2": np.arange(1, 1 + nrows, dtype=np.int32), + } + ) + pd_df.to_json(fname, compression=out_comp, lines=True, orient="records") + + cu_df = cudf.read_json( + str(fname), + compression=in_comp, + lines=True, + dtype={"col1": "int32", "col2": "int32"}, + ) + assert_eq(pd_df, cu_df) + + +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_json_engine_selection(): + json = "[1, 2, 3]" + + # should use the cudf engine + df = cudf.read_json(StringIO(json), lines=True) + # column names are strings when parsing with cudf + for col_name in df.columns: + assert isinstance(col_name, str) + + # should use the pandas engine + df = cudf.read_json(StringIO(json), lines=False, engine="pandas") + # column names are ints when parsing with pandas + for col_name in df.columns: + assert isinstance(col_name, int) + + # should use the pandas engine + df = cudf.read_json(StringIO(json), lines=True, engine="pandas") + # column names are ints when parsing with pandas + for col_name in df.columns: + assert isinstance(col_name, int) + + +def test_json_bool_values(): + buffer = "[true,1]\n[false,false]\n[true,true]" + cu_df = cudf.read_json(StringIO(buffer), lines=True) + pd_df = pd.read_json(StringIO(buffer), lines=True) + + # types should be ['bool', 'int64'] + np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) + np.testing.assert_array_equal(pd_df[0], cu_df["0"].to_numpy()) + # boolean values should be converted to 0/1 + np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy()) + + cu_df = cudf.read_json( + StringIO(buffer), lines=True, dtype={"0": "bool", "1": "long"} + ) + np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) + + +def test_json_bad_protocol_string(): + test_string = StringIO('{"field": "s3://path"}') + + expect = pd.DataFrame([{"field": "s3://path"}]) + got = cudf.read_json(test_string, lines=True) + + assert_eq(expect, got) + + +def test_json_corner_case_with_escape_and_double_quote_char_with_pandas( + tmp_path, +): + fname = tmp_path / "tmp_json_escape_double_quote" + + pdf = pd.DataFrame( + { + "a": ['ab"cd', "\\\b", "\r\\", "'"], + "b": ["a\tb\t", "\\", '\\"', "\t"], + "c": ["aeiou", "try", "json", "cudf"], + } + ) + pdf.to_json(fname, compression="infer", lines=True, orient="records") + + df = cudf.read_json( + fname, compression="infer", lines=True, orient="records" + ) + pdf = pd.read_json( + fname, compression="infer", lines=True, orient="records" + ) + + assert_eq(cudf.DataFrame(pdf), df) + + +def test_json_corner_case_with_escape_and_double_quote_char_with_strings(): + str_buffer = StringIO( + """{"a":"ab\\"cd","b":"a\\tb\\t","c":"aeiou"} + {"a":"\\\\\\b","b":"\\\\","c":"try"} + {"a":"\\r\\\\","b":"\\\\\\"","c":"json"} + {"a":"\'","b":"\\t","c":"cudf"}""" + ) + + df = cudf.read_json( + str_buffer, compression="infer", lines=True, orient="records" + ) + + expected = cudf.DataFrame( + { + 
"a": ['ab"cd', "\\\b", "\r\\", "'"], + "b": ["a\tb\t", "\\", '\\"', "\t"], + "c": ["aeiou", "try", "json", "cudf"], + } + ) + assert_eq(df, expected) + + +def test_json_to_json_special_characters(): + df = cudf.DataFrame( + { + "'a'": ['ab"cd', "\\\b", "\r\\", "'"], + "b": ["a\tb\t", "\\", '\\"', "\t"], + "c": ["aeiou", "try", "json", "cudf"], + } + ) + + actual = StringIO() + df.to_json(actual, engine="cudf", lines=True, orient="records") + expected = StringIO() + df.to_pandas().to_json(expected, lines=True, orient="records") + assert expected.getvalue() == actual.getvalue() + + +@pytest.mark.parametrize( + "gdf,pdf", + [ + ( + lambda: cudf.DataFrame( + { + "int col": cudf.Series( + [1, 2, None, 2, 2323, 234, None], dtype="int64" + ) + } + ), + pd.DataFrame( + { + "int col": pd.Series( + [1, 2, None, 2, 2323, 234, None], dtype=pd.Int64Dtype() + ) + } + ), + ), + ( + lambda: cudf.DataFrame( + { + "int64 col": cudf.Series( + [1, 2, None, 2323, None], dtype="int64" + ), + "string col": cudf.Series( + ["abc", "a", None, "", None], dtype="str" + ), + "float col": cudf.Series( + [0.234, None, 234234.2343, None, 0.0], dtype="float64" + ), + "bool col": cudf.Series( + [None, True, False, None, True], dtype="bool" + ), + "categorical col": cudf.Series( + [1, 2, 1, None, 2], dtype="category" + ), + "datetime col": cudf.Series( + [1231233, None, 2323234, None, 1], + dtype="datetime64[ns]", + ), + "timedelta col": cudf.Series( + [None, 34687236, 2323234, 1, None], + dtype="timedelta64[ns]", + ), + } + ), + pd.DataFrame( + { + "int64 col": pd.Series( + [1, 2, None, 2323, None], dtype=pd.Int64Dtype() + ), + "string col": pd.Series( + ["abc", "a", None, "", None], dtype=pd.StringDtype() + ), + "float col": pd.Series( + [0.234, None, 234234.2343, None, 0.0], dtype="float64" + ), + "bool col": pd.Series( + [None, True, False, None, True], + dtype=pd.BooleanDtype(), + ), + "categorical col": pd.Series( + [1, 2, 1, None, 2], dtype="category" + ), + "datetime col": pd.Series( + [1231233, None, 2323234, None, 1], + dtype="datetime64[ns]", + ), + "timedelta col": pd.Series( + [None, 34687236, 2323234, 1, None], + dtype="timedelta64[ns]", + ), + } + ), + ), + ], +) +def test_json_to_json_compare_contents(gdf, pdf): + expected_json = pdf.to_json(lines=True, orient="records") + with pytest.warns(UserWarning): + actual_json = gdf().to_json(lines=True, orient="records") + + assert expected_json == actual_json + + +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_default_integer_bitwidth(default_integer_bitwidth, engine): + buf = BytesIO() + pd.DataFrame({"a": range(10)}).to_json(buf, lines=True, orient="records") + buf.seek(0) + df = cudf.read_json(buf, engine=engine, lines=True, orient="records") + + assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") + + +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_default_integer_bitwidth_partial(default_integer_bitwidth, engine): + buf = BytesIO() + pd.DataFrame({"a": range(10), "b": range(10, 20)}).to_json( + buf, lines=True, orient="records" + ) + buf.seek(0) + df = cudf.read_json( + buf, engine=engine, lines=True, orient="records", dtype={"b": "i8"} + ) + + assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") + assert df["b"].dtype == np.dtype("i8") + + +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_default_integer_bitwidth_extremes(default_integer_bitwidth, engine): + # Test that integer columns in json are _inferred_ as 32 bit columns. 
+    buf = StringIO(
+        '{"u8":18446744073709551615, "i8":9223372036854775807}\n'
+        '{"u8": 0, "i8": -9223372036854775808}'
+    )
+    df = cudf.read_json(buf, engine=engine, lines=True, orient="records")
+
+    assert df["u8"].dtype == np.dtype(f"u{default_integer_bitwidth // 8}")
+    assert df["i8"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}")
+
+
+def test_default_float_bitwidth(default_float_bitwidth):
+    # Test that float columns in json are _inferred_ with the user-specified
+    # default bitwidth.
+    df = cudf.read_json(
+        StringIO('{"a": 1.0, "b": 2.5}\n{"a": 3.5, "b": 4.0}'),
+        engine="cudf",
+        lines=True,
+        orient="records",
+    )
+    assert df["a"].dtype == np.dtype(f"f{default_float_bitwidth // 8}")
+    assert df["b"].dtype == np.dtype(f"f{default_float_bitwidth // 8}")
+
+
+def test_json_nested_basic():
+    bytes_obj = BytesIO()
+    data = {
+        "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}],
+        "c2": [["l11", "l21"], ["l12", "l22"]],
+    }
+    pdf = pd.DataFrame(data)
+    pdf.to_json(bytes_obj, orient="records")
+
+    df = cudf.read_json(bytes_obj, engine="cudf", orient="records")
+    bytes_obj.seek(0)
+    pdf = pd.read_json(bytes_obj, orient="records")
+
+    assert_eq(pdf, df)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {
+            "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}],
+            "c2": [["l11", "l21"], ["l12", "l22"]],
+        },
+        # Essential test case to handle omissions
+        {
+            "c1": [{"f2": "sf21"}, {"f1": "sf12"}],
+            "c2": [["l11", "l21"], []],
+        },
+        # empty input
+        {},
+    ],
+)
+def test_json_nested_lines(data, lines):
+    bio = BytesIO()
+    pdf = pd.DataFrame(data)
+    pdf.to_json(bio, orient="records", lines=lines)
+    bio.seek(0)
+    df = cudf.read_json(bio, engine="cudf", orient="records", lines=lines)
+    bio.seek(0)
+    pdf = pd.read_json(bio, orient="records", lines=lines)
+    # In the second test case we need to take a detour via pyarrow:
+    # pandas omits "f1" in the first row, so we have to enforce a common
+    # schema such that pandas would have the "f1" member with null.
+    # Also, pyarrow may choose a different ordering of a nested column's
+    # children, even though the key-value pairs are correct.
+ pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) + + +def test_json_nested_data(): + json_str = ( + '[{"0":{},"2":{}},{"1":[[""],[]],"2":{"2":""}},' + '{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' + ) + df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") + pdf = pd.read_json(StringIO(json_str), orient="records") + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) + + +def test_json_empty_types(): + json_str = """ {} + {"a": [], "b": {}} + {"a": []} + {"b": {}} + {"c": {"d": []}} + {"e": [{}]} + """ + df = cudf.read_json(StringIO(json_str), orient="records", lines=True) + pdf = pd.read_json(StringIO(json_str), orient="records", lines=True) + assert_eq(df, pdf) + + +def test_json_types_data(): + # 0:<0:string,1:float> + # 1:list + # 2:<0:bool> + json_str = ( + '[{"0":null,"2":{}},' + '{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},' + '{"0":{},"1":[],"2":{"0":null}}]' + ) + df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") + pdf = pd.read_json(StringIO(json_str), orient="records") + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) + + +@pytest.mark.parametrize( + "col_type,json_str,expected_data", + [ + # without quotes + ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), + # with quotes + ("int", '[{"k": "1"}, {"k": "2"}]', [1, 2]), + # with quotes, mixed + ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), + # with quotes, null, mixed + ( + "int", + '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', + [1, 2, None, 4], + ), + # without quotes, null + ( + "int", + '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]', + [1, 2, None, 4], + ), + # without quotes + ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), + # with quotes + ("float", '[{"k": "1"}, {"k": "2"}]', [1, 2]), + # with quotes, mixed + ( + "float", + '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', + [1, 2, 3, 4], + ), + # with quotes, null, mixed + ( + "float", + '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', + [1, 2, None, 4], + ), + # with quotes, NAN + ( + "float", + '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]', + [1, 2, np.nan, 4], + ), + # without quotes + ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), + # with quotes + ("str", '[{"k": "1"}, {"k": "2"}]', [1, 2]), + # with quotes, mixed + ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), + # with quotes, null, mixed + ( + "str", + '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', + [1, 2, None, 4], + ), + # without quotes, null + ( + "str", + '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]', + [1, 2, None, 4], + ), + ], +) +def test_json_quoted_values_with_schema(col_type, json_str, expected_data): + actual = cudf.read_json( + StringIO(json_str), + engine="cudf", + orient="records", + dtype={"k": col_type}, + ) + expected = cudf.DataFrame({"k": expected_data}, dtype=col_type) + + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "col_type,json_str,expected_data", + [ + # with quotes, mixed + ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), + # with quotes, null, mixed + ( + "int", + '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', + [1, 2, None, 4], + ), + # with 
quotes, mixed + ( + "str", + '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', + ["1", "2", "3", "4"], + ), + # with quotes, null, mixed + ( + "str", + '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', + ["1", "2", None, "4"], + ), + ], +) +def test_json_quoted_values(col_type, json_str, expected_data): + actual = cudf.read_json( + StringIO(json_str), + engine="cudf", + orient="records", + dtype={"k": col_type}, + ) + expected = cudf.DataFrame({"k": expected_data}, dtype=col_type) + + assert_eq(expected, actual) + assert_eq(expected_data, actual.k.to_arrow().to_pylist()) + + +@pytest.mark.parametrize( + "keep_quotes,result", + [ + ( + True, + { + "c1": [ + {"f1": '"sf11"', "f2": '"sf21"'}, + {"f1": '"sf12"', "f2": '"sf22"'}, + ], + "c2": [['"l11"', '"l21"'], ['"l12"', '"l22"']], + }, + ), + ( + False, + { + "c1": [ + {"f1": "sf11", "f2": "sf21"}, + {"f1": "sf12", "f2": "sf22"}, + ], + "c2": [["l11", "l21"], ["l12", "l22"]], + }, + ), + ], +) +def test_json_keep_quotes(keep_quotes, result): + bytes_file = BytesIO() + data = { + "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], + "c2": [["l11", "l21"], ["l12", "l22"]], + } + pdf = pd.DataFrame(data) + pdf.to_json(bytes_file, orient="records", lines=True) + + actual = cudf.read_json( + bytes_file, + orient="records", + lines=True, + keep_quotes=keep_quotes, + ) + expected = pd.DataFrame(result) + + assert_eq(actual, expected) + + +def test_json_dtypes_nested_data(): + # a: StructDtype({'a': StructDtype({'b': dtype('float64')}), + # 'b': dtype('int64')}) + # b: ListDtype(ListDtype(float64)) + actual_json_str = ( + '{"a":{"a":{"b":10.0},"b":11},"b":[[10.0,1.1],[12.0,23.0]]}\n' + '{"a":{"a":{"b":107.0},"b":5},"b":[[10.0,11.2],[12.0,0.23]]}\n' + '{"a":{"a":{"b":50.7},"b":2},"b":[[10.0,11.3],[12.0,2.3]]}\n' + '{"a":{"a":{"b":1.2},"b":67},"b":[[6.0,7.0]]}\n' + '{"a":{"a":{"b":40.1},"b":1090},"b":null}\n' + ) + + """ + In [3]: df + Out[3]: + a b + 0 {'a': {'b': 10.0}, 'b': 11} [[10.0, 1.1], [12.0, 23.0]] + 1 {'a': {'b': 107.0}, 'b': 5} [[10.0, 11.2], [12.0, 0.23]] + 2 {'a': {'b': 50.7}, 'b': 2} [[10.0, 11.3], [12.0, 2.3]] + 3 {'a': {'b': 1.2}, 'b': 67} [[6.0, 7.0]] + 4 {'a': {'b': 40.1}, 'b': 1090} None + """ + + # a: StructDtype({'a': StructDtype({'b': dtype('int64')}), + # 'b': dtype('float64')}) + # b: ListDtype(ListDtype(int64)) + expected_json_str = ( + '{"a":{"a":{"b":10},"b":11.0},"b":[[10,1],[12,23]]}\n' + '{"a":{"a":{"b":107},"b":5.0},"b":[[10,11],[12,0]]}\n' + '{"a":{"a":{"b":50},"b":2.0},"b":[[10,11],[12,2]]}\n' + '{"a":{"a":{"b":1},"b":67.0},"b":[[6,7]]}\n' + '{"a":{"a":{"b":40},"b":1090.0},"b":null}\n' + ) + + """ + In [7]: df + Out[7]: + a b + 0 {'a': {'b': 10}, 'b': 11.0} [[10, 1], [12, 23]] + 1 {'a': {'b': 107}, 'b': 5.0} [[10, 11], [12, 0]] + 2 {'a': {'b': 50}, 'b': 2.0} [[10, 11], [12, 2]] + 3 {'a': {'b': 1}, 'b': 67.0} [[6, 7]] + 4 {'a': {'b': 40}, 'b': 1090.0} None + """ + + df = cudf.read_json( + StringIO(actual_json_str), + engine="cudf", + orient="records", + lines=True, + dtype={ + "a": cudf.StructDtype( + { + "a": cudf.StructDtype({"b": cudf.dtype("int64")}), + "b": cudf.dtype("float64"), + } + ), + "b": cudf.ListDtype(cudf.ListDtype("int64")), + }, + ) + + pdf = pd.read_json( + StringIO(expected_json_str), + orient="records", + lines=True, + ) + + assert_eq(df, pdf) + + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) + + +@pytest.mark.parametrize( + "tag, data", + [ + ( + "normal", + 
"""\ +{"a": 1, "b": 2} +{"a": 3, "b": 4}""", + ), + ( + "multiple", + """\ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 }""", + ), + ( + "reordered", + """\ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "c": 12 , "b" : [4, 5 ]} + { "b" : [6 ], "a": { "y" : 6}, "c": 13} + { "c" : 14, "a": { "y" : 6}, "b" : [7 ]} +""", + ), + ( + "missing", + """ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ] } + { "a": { "y" : 6}, "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 } +""", + ), + ( + "dtype_mismatch", + """\ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14.0 }""", + ), + ], +) +class TestNestedJsonReaderCommon: + @pytest.mark.parametrize("chunk_size", [10, 100, 1024, 1024 * 1024]) + def test_chunked_nested_json_reader(self, tag, data, chunk_size): + expected = cudf.read_json(StringIO(data), lines=True) + + source_size = len(data) + chunks = [] + for chunk_start in range(0, source_size, chunk_size): + chunks.append( + cudf.read_json( + StringIO(data), + byte_range=[chunk_start, chunk_size], + lines=True, + ) + ) + df = cudf.concat(chunks, ignore_index=True) + assert expected.to_arrow().equals(df.to_arrow()) + + @pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/pull/57439", + ) + def test_order_nested_json_reader(self, tag, data): + expected = pd.read_json(StringIO(data), lines=True) + target = cudf.read_json(StringIO(data), lines=True) + # Using pyarrow instead of assert_eq because pandas + # doesn't handle nested values comparisons correctly + if tag == "dtype_mismatch": + with pytest.raises(AssertionError): + # pandas parses integer values in float representation + # as integer + assert pa.Table.from_pandas(expected).equals(target.to_arrow()) + elif tag == "missing": + with pytest.raises(AssertionError): + # pandas inferences integer with nulls as float64 + assert pa.Table.from_pandas(expected).equals(target.to_arrow()) + else: + assert pa.Table.from_pandas(expected).equals(target.to_arrow()) + + +def test_json_round_trip_gzip(): + df = cudf.DataFrame({"a": [1, 2, 3], "b": ["abc", "def", "ghi"]}) + bio = BytesIO() + with gzip.open(bio, mode="wb") as fo: + with pytest.warns(UserWarning): + df.to_json(fo, orient="records", lines=True) + bio.seek(0) + with gzip.open(bio, mode="rb") as fo: + written_df = cudf.read_json(fo, orient="records", lines=True) + assert_eq(written_df, df) + + # Testing writing from middle of the file. 
+ loc = bio.tell() + + with gzip.open(bio, mode="wb") as fo: + fo.seek(loc) + with pytest.warns(UserWarning): + df.to_json(fo, orient="records", lines=True) + bio.seek(loc) + with gzip.open(bio, mode="rb") as fo: + fo.seek(loc) + written_df = cudf.read_json(fo, orient="records", lines=True) + assert_eq(written_df, df) + + +@pytest.mark.parametrize( + "data", + [ + # # empty input + # assert failing due to missing index size information + "", + "[]", + "[]\n[]\n[]", + # simple values + """[1]\n[2]\n[3]""", + """[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]""", + # nulls + """[1, 2, 3]\n[4, 5, null]\n[7, 8, 9]""", + """[1, 2, 3]\n[4, 5, null]\n[7, 8, 9]\n[null, null, null]""", + """[1, 2, 3]\n[4, 5, null]\n[]""", + # missing + """[1, 2, 3]\n[4, 5 ]\n[7, 8, 9]""", + """[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9, 10]""", + """[1, 2, 3]\n[4, 5, 6, {}]\n[7, 8, 9]""", + """[1, 2, 3]\n[4, 5, 6, []]\n[7, 8, 9]""", + """[1, 2, 3]\n[4, 5, 6, {"a": 10}]\n[7, 8, 9]""", + """[1, 2, 3]\n[4, 5, 6, [10]]\n[7, 8, 9]""", + # mixed + """[1, 2, 3]\n[4, 5, {}]\n[7, 8, 9]""", + """[1, 2, {}]\n[4, 5, 6]\n[7, 8, 9]""", + """[1, 2, 3]\n[4, 5, [6]]\n[7, 8, 9]""", + """[1, 2, [3]]\n[4, 5, 6]\n[7, 8, 9]""", + # nested + """[1, 2, [3]]\n[4, 5, [6]]\n[7, 8, [9]]""", + """[1, 2, {"a": 3}]\n[4, 5, {"b": 6}]\n[7, 8, {"c": 9}]""", + """[1, 2, [{"a": 3}, {"a": 3}]] + [4, 5, [{"b": 6}, {"b": 6}, {}, {"b": 6}]] + [7, 8, [{}]]""", + """[1, 2, {"a": [3, 3, 3]}] + [4, 5, {"b": [6, 6]}] + [7, 8, {"c": 9}]""", + """[1, 2, [{"a": 3}, {"a": null}]] + [4, 5, [{"b": [6.0, 6, 06]}, {"b": [6]}, {}, {"b": null}]] + [7, 8, [{}]]""", + ], +) +def test_json_array_of_arrays(data, lines): + data = data if lines else "[" + data.replace("\n", ",") + "]" + pdf = pd.read_json(StringIO(data), orient="values", lines=lines) + df = cudf.read_json( + StringIO(data), + engine="cudf", + orient="values", + lines=lines, + ) + # if mixed with dict/list type, replace other types with None. 
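+    # (cudf reads such mixed columns as the nested type, so scalar entries
+    # become nulls; mirror that in the pandas frame before comparing)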
+    if 2 in pdf.columns and any(
+        pdf[2].apply(lambda x: isinstance(x, dict) or isinstance(x, list))
+    ):
+        pdf[2] = pdf[2].apply(
+            lambda x: x if isinstance(x, dict) or isinstance(x, list) else None
+        )
+    # TODO: Replace string column names with integer column names
+    # for values orient in cudf json reader
+    pdf.rename(columns={name: str(name) for name in pdf.columns}, inplace=True)
+    # assert_eq(pdf, df)
+    pa_table_pdf = pa.Table.from_pandas(
+        pdf, schema=df.to_arrow().schema, safe=False
+    )
+    assert df.to_arrow().equals(pa_table_pdf)
+
+
+@pytest.mark.parametrize(
+    "jsonl_string",
+    [
+        # simple list with mixed types
+        """{"a":[123, {}], "b":1.1}""",
+        """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""",
+        """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""",
+        """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""",
+        """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""",
+        """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""",
+        """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""",
+        """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""",
+        """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""",
+        """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""",
+        """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""",
+        """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""",
+        """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""",
+        """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""",
+        # nested list with mixed types
+        """{"a":[123, [{"0": 123}, {}]], "b":1.0}
+        {"b":1.1}
+        {"a":[]}
+        {"a":[123]}
+        {"a":[[123], []]}""",
+        """{"a":[], "b":1.0}
+        {"a":[[[456]]]}
+        {"a":[[123]]}
+        {"a":[123]}""",
+        """{"a":[123], "b":1.0}
+        {"b":1.1}
+        {"b":2.1}
+        {"a":[[[[[[]]]]]]}""",
+        """{"a":[123], "b":1.0}
+        {"a":[[[[[[]]]]]]}
+        {"a":[[[[[[]]]]], [[[[[]]]]]]}
+        {"a":[[[[[[]]]], [[[[]]]]]]}
+        {"a":[[[[[[]]], [[[]]]]]]}
+        {"a":[[[[[[]], [[]]]]]]}
+        {"a":[[[[[[], 123, []]]]]]}""",
+        # mixed elements in multiple columns
+        """{"a":[123, {"0": 123}], "b":1.0}
+        {"c": ["abc"], "b":1.1}
+        {"c": ["abc", []] }""",
+    ],
+)
+def test_json_nested_mixed_types_in_list(jsonl_string):
+    # utility function for this test:
+    # replace list elements with None if the list mixes dict and non-dict
+    # items (None entries are ignored)
+    def _replace_in_list(list_to_replace, replace_items):
+        return [
+            _replace_in_list(x, replace_items)
+            if isinstance(x, list)
+            else None
+            if x in replace_items
+            else x
+            for x in list_to_replace
+        ]
+
+    def _replace_with_nulls(df, replace_items):
+        for col in df.columns:
+            if df[col].dtype == "object":
+                df[col] = df[col].apply(
+                    lambda x: _replace_in_list(x, replace_items)
+                    if isinstance(x, list)
+                    else x
+                )
+        return df
+
+    # both json lines and json string are tested
+    json_string = "[" + jsonl_string.replace("\n", ",") + "]"
+    pdf = pd.read_json(StringIO(jsonl_string), orient="records", lines=True)
+    pdf2 = pd.read_json(StringIO(json_string), orient="records", lines=False)
+    assert_eq(pdf, pdf2)
+    # Replace list elements with None where a list mixes dict and non-dict
+    # items; in the test cases above these scalars appear alongside
+    # dict/list items, so replace them with None.
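+    # (123, "123", 12.3 and "abc" are the scalar values that appear
+    # alongside dicts/lists in the inputs above)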
+ pdf = _replace_with_nulls(pdf, [123, "123", 12.3, "abc"]) + gdf = cudf.read_json( + StringIO(jsonl_string), + orient="records", + lines=True, + ) + gdf2 = cudf.read_json( + StringIO(json_string), + engine="cudf", + orient="records", + lines=False, + ) + if """[{"0": 123}, {}]""" not in jsonl_string: + # {} in pandas is represented as {"0": None} in cudf + assert_eq(gdf, pdf) + assert_eq(gdf2, pdf) + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=gdf.to_arrow().schema, safe=False + ) + assert gdf.to_arrow().equals(pa_table_pdf) + assert gdf2.to_arrow().equals(pa_table_pdf) + + +@pytest.mark.parametrize( + "jsonl_string", + [ + # mixed type in list (in different order) + """{"a":[[{"0": 123}, {}], {"1": 321}], "b":1.0}""", + """{"a":[{"1": 321}, [{"0": 123}, {}], ], "b":1.0}""", + """{"a":[123, [{"0": 123}, {}], {"1": 321}], "b":1.0}""", + """{"a":[null, [{"0": 123}, {}], {"1": 321}], "b":1.0}""", + # mixed type in struct (in different order) + """{"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} + {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", + """{"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0} + {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0}""", + """{"a": {"b": {"0": 123}, "c": null}, "d":1.0} + {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} + {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", + """{"a": {"b": {"0": 123}, "c": 123}, "d":1.0} + {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} + {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", + ], +) +def test_json_nested_mixed_types_error(jsonl_string): + # mixing list and struct should raise an exception + with pytest.raises(RuntimeError): + cudf.read_json( + StringIO(jsonl_string), + orient="records", + lines=True, + ) + + +@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"]) +def test_json_reader_on_bad_lines(on_bad_lines): + json_input = StringIO( + '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' + ) + if on_bad_lines == "error": + with pytest.raises(RuntimeError): + cudf.read_json( + json_input, + lines=True, + orient="records", + on_bad_lines=on_bad_lines, + ) + elif on_bad_lines == "recover": + actual = cudf.read_json( + json_input, lines=True, orient="records", on_bad_lines=on_bad_lines + ) + expected = cudf.DataFrame( + {"a": [1, 2, None, 3], "b": [10, 11, None, 12]} + ) + assert_eq(actual, expected) + else: + with pytest.raises(TypeError): + cudf.read_json( + json_input, + lines=True, + orient="records", + on_bad_lines=on_bad_lines, + ) + + +def test_chunked_json_reader(): + df = cudf.DataFrame( + { + "a": ["aaaa"] * 1_000_000, + "b": range(1_000_000), + } + ) + buf = BytesIO() + df.to_json(buf, lines=True, orient="records", engine="cudf") + buf.seek(0) + df = df.to_pandas() + with cudf.option_context("io.json.low_memory", True): + gdf = cudf.read_json(buf, lines=True) + assert_eq(df, gdf) + + +# compression formats limited to those supported by both reader and writer +@pytest.mark.parametrize("compression", ["gzip", "snappy", "zstd"]) +def test_roundtrip_compression(compression, tmp_path): + expected = cudf.DataFrame({"a": [1], "b": ["2"]}) + fle = BytesIO() + expected.to_json(fle, engine="cudf", compression=compression) + result = cudf.read_json(fle, engine="cudf", compression=compression) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py deleted file mode 100644 index 4f9ca1b4261..00000000000 --- a/python/cudf/cudf/tests/test_csv.py +++ /dev/null @@ -1,2249 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA 
CORPORATION. - -import codecs -import gzip -import os -import re -import shutil -from io import BytesIO, StringIO - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import read_csv -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -@pytest.fixture -def numeric_dataframe(): - return pd.DataFrame( - {"col1": [1, 2, 3], "col2": [4, 5, 6]}, - ) - - -@pytest.fixture -def datetime_dataframe(): - return pd.DataFrame( - { - "col1": [ - "31/10/2010", - "05/03/2001", - "20/10/1994", - "18/10/1990", - "1/1/1970", - "2016-04-30T01:02:03.000", - "2038-01-19 03:14:07", - ], - "col2": [ - "18/04/1995", - "14 / 07 / 1994", - "07/06/2006", - "16/09/2005", - "2/2/1970", - "2007-4-30 1:6:40.000PM", - "2038-01-19 03:14:08", - ], - "col3": [ - "1 Jan", - "2 January 1994", - "Feb 2002", - "31-01-2000", - "1-1-1996", - "15-May-2009", - "21-Dec-3262", - ], - } - ) - - -@pytest.fixture -def pd_mixed_dataframe(): - return pd.DataFrame( - { - "Integer": [2345, 11987, 9027, 9027], - "Date": ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"], - "Float": [9.001, 8.343, 6, 2.781], - "Integer2": [2345, 106, 2088, 789277], - "Category": ["M", "F", "F", "F"], - "String": ["Alpha", "Beta", "Gamma", "Delta"], - "Boolean": [True, False, True, False], - } - ) - - -@pytest.fixture -def cudf_mixed_dataframe(pd_mixed_dataframe): - return cudf.from_pandas(pd_mixed_dataframe) - - -@pytest.fixture -def gdf_np_dtypes(): - gdf_dtypes = [ - "float", - "float32", - "double", - "float64", - "int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - ] - - np_dtypes = [ - np.float32, - np.float32, - np.float64, - np.float64, - np.int8, - np.int16, - np.int16, - np.int32, - np.int32, - np.int64, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - ] - return dict(zip(gdf_dtypes, np_dtypes, strict=True)) - - -@pytest.fixture -def numeric_extremes_dataframe(gdf_np_dtypes): - data = {} - for typ, np_type in gdf_np_dtypes.items(): - if np.dtype(np_type).kind in "iu": - itype = np.iinfo(np_type) - extremes = [0, +1, -1, itype.min, itype.max] - data[typ] = np.array(extremes * 4).astype(np_type)[:20] - else: - ftype = np.finfo(np_type) - extremes = [ - 0.0, - -0.0, - +1, - -1, - np.nan, - -np.nan, - # ftype.min, # TODO enable after fixing truncation issue #6235 - # ftype.max, # TODO enable after fixing truncation issue #6235 - np_type(np.inf), - -np_type(np.inf), - ftype.eps, - ftype.epsneg, - ftype.tiny, - -ftype.eps, - -ftype.epsneg, - -ftype.tiny, - ] - data[typ] = np.array(extremes * 4, dtype=np_type)[:20] - return pd.DataFrame(data) - - -@pytest.fixture( - params=[np.float64, np.float32, np.int64, np.int32, np.uint64, np.uint32] -) -def dtype(request): - return request.param - - -def test_csv_reader_numeric_data(dtype, numeric_dataframe, tmp_path): - fname = tmp_path / "tmp_csvreader_file1.csv" - - df = numeric_dataframe.astype(dtype) - df.to_csv(fname, index=False, header=False) - - dtypes = [df[k].dtype for k in df.columns] - out = read_csv(str(fname), names=list(df.columns.values), dtype=dtypes) - - assert len(out.columns) == len(df.columns) - assert_eq(df, out) - - -@pytest.mark.parametrize("parse_dates", [["date2"], [0], ["date1", 1, "bad"]]) -def test_csv_reader_datetime(datetime_dataframe, parse_dates): - df = datetime_dataframe - buffer 
= df.to_csv(index=False, header=False) - - gdf = read_csv( - StringIO(buffer), - names=["date1", "date2", "bad"], - parse_dates=parse_dates, - dayfirst=True, - ) - # Need to used `date_format='mixed'`, - # https://github.com/pandas-dev/pandas/issues/53355 - pdf = pd.read_csv( - StringIO(buffer), - names=["date1", "date2", "bad"], - parse_dates=parse_dates, - dayfirst=True, - date_format="mixed", - ) - - assert_eq(gdf, pdf) - - -@pytest.mark.parametrize("p_arg", ["delimiter", "sep"]) -@pytest.mark.parametrize("c_arg", ["sep", "delimiter"]) -def test_csv_reader_mixed_data_delimiter_sep( - tmp_path, p_arg, c_arg, pd_mixed_dataframe -): - pandas_arg = {p_arg: "|"} - cudf_arg = {c_arg: "|"} - fname = tmp_path / "tmp_csvreader_file3.csv" - - pd_mixed_dataframe.to_csv(fname, sep="|", index=False, header=False) - - gdf1 = read_csv( - str(fname), - names=["1", "2", "3", "4", "5", "6", "7"], - dtype=[ - "int64", - "datetime64[ns]", - "float64", - "int64", - "category", - "str", - "bool", - ], - dayfirst=True, - **cudf_arg, - ) - gdf2 = read_csv( - str(fname), - names=["1", "2", "3", "4", "5", "6", "7"], - dtype=[ - "int64", - "datetime64[ns]", - "float64", - "int64", - "category", - "str", - "bool", - ], - dayfirst=True, - **pandas_arg, - ) - - pdf = pd.read_csv( - fname, - names=["1", "2", "3", "4", "5", "6", "7"], - parse_dates=[1], - dayfirst=True, - **pandas_arg, - ) - - assert len(gdf1.columns) == len(pdf.columns) - assert len(gdf2.columns) == len(pdf.columns) - assert_eq(gdf1, gdf2) - - -@pytest.mark.parametrize("use_list", [False, True]) -def test_csv_reader_dtype_list(numeric_dataframe, use_list): - df = numeric_dataframe.astype(np.float32) - buffer = df.to_csv(index=False, header=False) - - # PANDAS doesn't list but cudf does (treated as implied ordered dict) - # Select first column's dtype if non-list; expect the same dtype for all - if use_list: - dtypes = [df[k].dtype for k in df.columns] - else: - dtypes = df[df.columns[0]].dtype - - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=df.columns) - - assert_eq(gdf, df) - - -@pytest.mark.parametrize("use_names", [False, True]) -def test_csv_reader_dtype_dict(use_names, gdf_np_dtypes): - # Save with the column header if not explicitly specifying a list of names - df = pd.DataFrame( - { - typ: np.zeros(3, dtype=np_type) - for typ, np_type in gdf_np_dtypes.items() - } - ) - buffer = df.to_csv(index=False, header=not use_names) - dtypes = df.dtypes.to_dict() - names = list(gdf_np_dtypes.keys()) if use_names else None - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=names) - pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=names) - - assert_eq(gdf, pdf) - - -@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") -@pytest.mark.parametrize("use_names", [True, False]) -def test_csv_reader_dtype_extremes(use_names, numeric_extremes_dataframe): - # Save with the column header if not explicitly specifying a list of names - df = numeric_extremes_dataframe - buffer = df.to_csv(index=False, header=not use_names) - dtypes = df.dtypes.to_dict() - names = df.columns.to_list() if use_names else None - - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=names) - pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=names) - - assert_eq(gdf, pdf) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/52449", -) -def test_csv_reader_skiprows_skipfooter(tmp_path, pd_mixed_dataframe): - fname = tmp_path / "tmp_csvreader_file5.csv" - - 
pd_mixed_dataframe.to_csv( - fname, columns=["Integer", "Date", "Float"], index=False, header=False - ) - - # Using engine='python' to eliminate pandas warning of using python engine. - df_out = pd.read_csv( - fname, - names=["1", "2", "3"], - parse_dates=[1], - dayfirst=True, - skiprows=1, - skipfooter=1, - engine="python", - ) - out = read_csv( - str(fname), - names=["1", "2", "3"], - dtype=["int64", "datetime64[ns]", "float64"], - skiprows=1, - skipfooter=1, - dayfirst=True, - ) - - assert len(out.columns) == len(df_out.columns) - assert len(out) == len(df_out) - - assert_eq(df_out, out, check_dtype=False) - - -def test_csv_reader_negative_vals(tmp_path): - fname = tmp_path / "tmp_csvreader_file6.csv" - - names = ["0", "1", "2"] - dtypes = ["float32", "float32", "float32"] - lines = [ - ",".join(names), - "-181.5060,-185.37000,-3", - "-127.6300,-230.54600,-9", - ] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - zero = [-181.5060, -127.6300] - one = [-185.370, -230.54600] - two = [-3, -9] - - df = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) - - np.testing.assert_allclose(zero, df["0"].to_numpy()) - np.testing.assert_allclose(one, df["1"].to_numpy()) - np.testing.assert_allclose(two, df["2"].to_numpy()) - - -def test_csv_reader_strings(tmp_path): - fname = tmp_path / "tmp_csvreader_file7.csv" - - names = ["text", "int"] - dtypes = ["str", "int"] - lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - df = read_csv( - str(fname), - names=names, - dtype=dtypes, - skiprows=1, - decimal=".", - thousands="'", - ) - - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - assert df["text"][0] == "a" - assert df["text"][1] == "b" - assert df["text"][2] == "c" - assert df["text"][3] == "d" - - -def test_csv_reader_strings_quotechars(tmp_path): - fname = tmp_path / "tmp_csvreader_file8.csv" - - names = ["text", "int"] - dtypes = ["str", "int"] - lines = [",".join(names), '"a,\n",0', '"b ""c"" d",0', "e,0", '"f,,!.,",0'] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - df = read_csv( - str(fname), - names=names, - dtype=dtypes, - skiprows=1, - quotechar='"', - quoting=1, - ) - - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - assert df["text"][0] == "a,\n" - assert df["text"][1] == 'b "c" d' - assert df["text"][2] == "e" - assert df["text"][3] == "f,,!.," - - -def test_csv_reader_usecols_int_char(tmp_path, pd_mixed_dataframe): - fname = tmp_path / "tmp_csvreader_file10.csv" - pd_mixed_dataframe.to_csv( - fname, - columns=["Integer", "Date", "Float", "Integer2"], - index=False, - header=False, - ) - - df_out = pd.read_csv(fname, usecols=[0, 1, 3]) - out = read_csv(fname, usecols=[0, 1, 3]) - - assert len(out.columns) == len(df_out.columns) - assert len(out) == len(df_out) - assert_eq(df_out, out, check_names=False) - - -@pytest.mark.parametrize( - "buffer", - [ - "abc,ABC,abc,abcd,abc\n1,2,3,4,5\n", - "A,A,A.1,A,A.2,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a", - "A,A,A.1,,Unnamed: 4,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a", - ], -) -@pytest.mark.parametrize("mangle_dupe_cols", [True, False]) -def test_csv_reader_mangle_dupe_cols(tmpdir, buffer, mangle_dupe_cols): - # Default: mangle_dupe_cols=True - cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=mangle_dupe_cols) - if mangle_dupe_cols: - pd_df = pd.read_csv(StringIO(buffer)) - else: - 
# Pandas does not support mangle_dupe_cols=False - head = buffer.split("\n")[0].split(",") - first_cols = np.unique(head, return_index=True)[1] - pd_df = pd.read_csv(StringIO(buffer), usecols=first_cols) - assert_eq(cu_df, pd_df) - - -def test_csv_reader_float_decimal(tmp_path): - fname = tmp_path / "tmp_csvreader_file12.csv" - - names = ["basic_32", "basic_64", "round", "decimal_only", "precision"] - dtypes = ["float32", "float64", "float64", "float32", "float64"] - lines = [ - ";".join(names), - "1,2;1234,5678;12345;0,123;-73,98007199999998", - "3,4;3456,7890;67890;,456;1,7976931348623157e+307", - "5,6e0;0,5679e2;1,2e10;0,07e-001;0,0", - ] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - basic_32_ref = [1.2, 3.4, 5.6] - basic_64_ref = [1234.5678, 3456.7890, 56.79] - round_ref = [12345, 67890, 12000000000] - decimal_only_ref = [0.123, 0.456, 0.007] - precision_ref = [-73.98007199999998, 1.7976931348623157e307, 0.0] - - df = read_csv( - str(fname), - names=names, - dtype=dtypes, - skiprows=1, - delimiter=";", - decimal=",", - ) - - np.testing.assert_allclose(basic_32_ref, df["basic_32"].to_numpy()) - np.testing.assert_allclose(basic_64_ref, df["basic_64"].to_numpy()) - np.testing.assert_allclose(round_ref, df["round"].to_numpy()) - np.testing.assert_allclose(decimal_only_ref, df["decimal_only"].to_numpy()) - np.testing.assert_allclose(precision_ref, df["precision"].to_numpy()) - - -def test_csv_reader_NaN_values(): - names = dtypes = ["float32"] - empty_cells = '\n""\n' - default_na_cells = ( - "#N/A\n#N/A N/A\n#NA\n-1.#IND\n" - "-1.#QNAN\n-NaN\n-nan\n1.#IND\n" - "1.#QNAN\nN/A\n\nNA\nNULL\n" - "NaN\nn/a\nnan\nnull\n" - ) - custom_na_cells = "NV_NAN\nNotANumber\n" - all_cells = empty_cells + default_na_cells + custom_na_cells - custom_na_values = ["NV_NAN", "NotANumber"] - - # test default NA values. 
empty cells should also yield NaNs - gdf = read_csv( - StringIO(default_na_cells + empty_cells), names=names, dtype=dtypes - ) - pdf = pd.read_csv( - StringIO(default_na_cells + empty_cells), names=names, dtype=np.float32 - ) - assert_eq(pdf, gdf) - - # custom NA values - gdf = read_csv( - StringIO(all_cells), - names=names, - dtype=dtypes, - na_values=custom_na_values, - ) - pdf = pd.read_csv( - StringIO(all_cells), - names=names, - dtype=np.float32, - na_values=custom_na_values, - ) - assert_eq(pdf, gdf) - - # custom NA values - gdf = read_csv( - StringIO(empty_cells + default_na_cells + "_NAA_\n"), - names=names, - dtype=dtypes, - na_values="_NAA_", - ) - pdf = pd.read_csv( - StringIO(empty_cells + default_na_cells + "_NAA_\n"), - names=names, - dtype=np.float32, - na_values="_NAA_", - ) - assert_eq(pdf, gdf) - - # data type detection should evaluate the column to int8 (all nulls) - gdf = read_csv( - StringIO(all_cells), - header=None, - na_values=custom_na_values, - ) - assert gdf.dtypes.iloc[0] == "int8" - assert all(gdf["0"][idx] is cudf.NA for idx in range(len(gdf["0"]))) - - # data type detection should evaluate the column to object if some nulls - gdf = read_csv(StringIO(all_cells), header=None) - assert gdf.dtypes.iloc[0] == np.dtype("object") - - -def test_csv_reader_thousands(tmp_path): - fname = tmp_path / "tmp_csvreader_file13.csv" - - names = dtypes = [ - "float32", - "float64", - "int32", - "int64", - "uint32", - "uint64", - ] - lines = [ - ",".join(names), - "1'234.5, 1'234.567, 1'234'567, 1'234'567'890,\ - 1'234'567, 1'234'567'890", - "12'345.6, 123'456.7, 12'345, 123'456'789, 12'345, 123'456'789", - ] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - f32_ref = [1234.5, 12345.6] - f64_ref = [1234.567, 123456.7] - int32_ref = [1234567, 12345] - int64_ref = [1234567890, 123456789] - uint32_ref = [1234567, 12345] - uint64_ref = [1234567890, 123456789] - - df = read_csv( - str(fname), names=names, dtype=dtypes, skiprows=1, thousands="'" - ) - - np.testing.assert_allclose(f32_ref, df["float32"].to_numpy()) - np.testing.assert_allclose(f64_ref, df["float64"].to_numpy()) - np.testing.assert_allclose(int32_ref, df["int32"].to_numpy()) - np.testing.assert_allclose(int64_ref, df["int64"].to_numpy()) - np.testing.assert_allclose(uint32_ref, df["uint32"].to_numpy()) - np.testing.assert_allclose(uint64_ref, df["uint64"].to_numpy()) - - -def test_csv_reader_buffer_strings(): - names = ["text", "int"] - dtypes = ["str", "int"] - lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"] - - buffer = "\n".join(lines) - - df = read_csv(StringIO(buffer), names=names, dtype=dtypes, skiprows=1) - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - assert df["text"][0] == "a" - assert df["text"][1] == "b" - assert df["text"][2] == "c" - assert df["text"][3] == "d" - - df2 = read_csv( - BytesIO(str.encode(buffer)), names=names, dtype=dtypes, skiprows=1 - ) - assert len(df2.columns) == 2 - assert df2["text"].dtype == np.dtype("object") - assert df2["int"].dtype == np.dtype("int64") - assert df2["text"][0] == "a" - assert df2["text"][1] == "b" - assert df2["text"][2] == "c" - assert df2["text"][3] == "d" - - -@pytest.mark.parametrize( - "ext, out_comp, in_comp", - [ - (".geez", "gzip", "gzip"), - (".beez", "bz2", "bz2"), - (".gz", "gzip", "infer"), - (".bz2", "bz2", "infer"), - (".beez", "bz2", np.str_("bz2")), - (".data", None, "infer"), - (".txt", None, None), - ("", None, None), - ], -) -def 
test_csv_reader_compression( - tmpdir, ext, out_comp, in_comp, pd_mixed_dataframe -): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_compression" + ext) - - df = pd_mixed_dataframe - df.to_csv(fname, index=False, header=False, compression=out_comp) - - gdf = read_csv(fname, names=list(df.columns.values), compression=in_comp) - pdf = pd.read_csv( - fname, names=list(df.columns.values), compression=in_comp - ) - - assert_eq(gdf, pdf) - - -@pytest.mark.parametrize( - "names, dtypes, data, trues, falses", - [ - ( - ["A", "B"], - ["bool", "bool"], - "True,True\nFalse,False\nTrue,False", - None, - None, - ), - ( - ["A", "B"], - ["int32", "int32"], - "True,1\nFalse,2\nTrue,3", - None, - None, - ), - ( - ["A", "B"], - ["int32", "int32"], - "YES,1\nno,2\nyes,3\nNo,4\nYes,5", - ["yes", "Yes", "YES"], - ["no", "NO", "No"], - ), - (["A", "B"], ["int32", "int32"], "foo,bar\nbar,foo", ["foo"], ["bar"]), - (["x", "y"], None, "True,1\nFalse,0", None, None), - ], -) -def test_csv_reader_bools(tmp_path, names, dtypes, data, trues, falses): - fname = tmp_path / "tmp_csvreader_file11.csv" - - lines = [",".join(names), data] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - # Usage of true_values and false_values makes that column into bool type - df_out = pd.read_csv( - fname, - names=names, - skiprows=1, - dtype=(dtypes[0] if dtypes else None), - true_values=trues, - false_values=falses, - ) - - out = read_csv( - fname, - names=names, - dtype=dtypes, - skiprows=1, - true_values=trues, - false_values=falses, - ) - - assert_eq(df_out, out) - - -def test_csv_reader_bools_custom(): - names = ["text", "bool"] - dtypes = {"text": "str", "bool": "bool"} - trues = ["foo", "1"] - falses = ["bar", "0"] - lines = [ - ",".join(names), - "true,true", - "false,false", - "foo,foo", - "bar,bar", - "0,0", - "1,1", - ] - buffer = "\n".join(lines) - - df = read_csv( - StringIO(buffer), - names=names, - dtype=dtypes, - skiprows=1, - true_values=trues, - false_values=falses, - ) - - # Note: bool literals give parsing errors as int - # "0" and "1" give parsing errors as bool in pandas - expected = pd.read_csv( - StringIO(buffer), - names=names, - dtype=dtypes, - skiprows=1, - true_values=trues, - false_values=falses, - ) - assert_eq(df, expected, check_dtype=True) - - -def test_csv_reader_bools_NA(): - names = ["text", "int"] - dtypes = ["str", "int"] - trues = ["foo"] - falses = ["bar"] - lines = [ - ",".join(names), - "true,true", - "false,false", - "foo,foo", - "bar,bar", - "qux,qux", - ] - - buffer = "\n".join(lines) - - df = read_csv( - StringIO(buffer), - names=names, - dtype=dtypes, - skiprows=1, - true_values=trues, - false_values=falses, - ) - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - expected = pd.DataFrame( - { - "text": ["true", "false", "foo", "bar", "qux"], - "int": [1.0, 0.0, 1.0, 0.0, np.nan], - } - ) - assert_eq(df, expected) - - -def test_csv_quotednumbers(tmp_path): - fname = tmp_path / "tmp_csvreader_file12.csv" - - names = ["integer", "decimal"] - dtypes = ["int32", "float32"] - lines = [ - ",".join(names), - '1,"3.14"', - '"2","300"', - '"3",10101.0101', - '4,"6.28318"', - ] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - integer_ref = [1, 2, 3, 4] - decimal_ref = [3.14, 300, 10101.0101, 6.28318] - - df1 = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) - df2 = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) - - assert len(df2.columns) == 2 - 
np.testing.assert_allclose(integer_ref, df1["integer"].to_numpy()) - np.testing.assert_allclose(decimal_ref, df1["decimal"].to_numpy()) - np.testing.assert_allclose(integer_ref, df2["integer"].to_numpy()) - np.testing.assert_allclose(decimal_ref, df2["decimal"].to_numpy()) - - -def test_csv_reader_nrows(tmp_path): - fname = tmp_path / "tmp_csvreader_file14.csv" - - names = ["int1", "int2"] - dtypes = ["int32", "int32"] - - rows = 4000 - read_rows = (rows * 3) // 4 - skip_rows = (rows - read_rows) // 2 - sample_skip = 100 - - with open(str(fname), "w") as fp: - fp.write(",".join(names) + "\n") - for i in range(rows): - fp.write(str(i) + ", " + str(2 * i) + " \n") - - # with specified names - df = read_csv( - str(fname), - names=names, - dtype=dtypes, - skiprows=skip_rows + 1, - nrows=read_rows, - ) - assert df.shape == (read_rows, 2) - for row in range(0, read_rows // sample_skip, sample_skip): - assert df["int1"][row] == row + skip_rows - assert df["int2"][row] == 2 * (row + skip_rows) - assert df["int2"][read_rows - 1] == 2 * (read_rows - 1 + skip_rows) - - # with column name inference - df = read_csv( - str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows - ) - assert df.shape == (read_rows, 2) - assert str(skip_rows) in next(iter(df)) - assert str(2 * skip_rows) in list(df)[1] - for row in range(0, read_rows // sample_skip, sample_skip): - assert df[next(iter(df))][row] == row + skip_rows + 1 - assert df[list(df)[1]][row] == 2 * (row + skip_rows + 1) - assert df[list(df)[1]][read_rows - 1] == 2 * (read_rows + skip_rows) - - # nrows larger than the file - df = read_csv(str(fname), dtype=dtypes, nrows=rows * 2) - assert df.shape == (rows, 2) - for row in range(0, rows // sample_skip, sample_skip): - assert df["int1"][row] == row - assert df["int2"][row] == 2 * row - assert df["int2"][rows - 1] == 2 * (rows - 1) - - # nrows + skiprows larger than the file - df = read_csv( - str(fname), dtype=dtypes, nrows=read_rows, skiprows=read_rows - ) - assert df.shape == (rows - read_rows, 2) - - # nrows equal to zero - df = read_csv(str(fname), dtype=dtypes, nrows=0) - assert df.shape == (0, 2) - - # with both skipfooter and nrows - should throw - with pytest.raises(ValueError): - read_csv(str(fname), nrows=read_rows, skipfooter=1) - - -def test_csv_reader_gzip_compression_strings(tmp_path): - fname = tmp_path / "tmp_csvreader_file15.csv" - fnamez = tmp_path / "tmp_csvreader_file15.csv.gz" - - names = ["text", "int"] - dtypes = ["str", "int"] - lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - with open(str(fname), "rb") as f_in, gzip.open(str(fnamez), "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - - df = read_csv( - str(fnamez), - names=names, - dtype=dtypes, - skiprows=1, - decimal=".", - thousands="'", - compression="gzip", - ) - - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - assert df["text"][0] == "a" - assert df["text"][1] == "b" - assert df["text"][2] == "c" - assert df["text"][3] == "d" - - -@pytest.mark.parametrize("skip_rows", [0, 4]) -@pytest.mark.parametrize("header_row", [0, 2]) -def test_csv_reader_skiprows_header(skip_rows, header_row): - names = ["float_point", "integer"] - dtypes = ["float64", "int64"] - lines = [ - ",".join(names), - "1.2, 1", - "2.3, 2", - "3.4, 3", - "4.5, 4", - "5.6, 5", - "6.7, 6", - ] - buffer = "\n".join(lines) - - cu_df = read_csv( - StringIO(buffer), dtype=dtypes, skiprows=skip_rows, 
header=header_row - ) - pd_df = pd.read_csv( - StringIO(buffer), skiprows=skip_rows, header=header_row - ) - - assert cu_df.shape == pd_df.shape - assert list(cu_df.columns.values) == list(pd_df.columns.values) - - -def test_csv_reader_dtype_inference(): - names = ["float_point", "integer"] - lines = [ - ",".join(names), - "1.2,1", - "2.3,2", - "3.4,3", - "4.5,4", - "5.6,5", - "6.7,6", - ] - buffer = "\n".join(lines) - cu_df = read_csv(StringIO(buffer)) - pd_df = pd.read_csv(StringIO(buffer)) - - assert cu_df.shape == pd_df.shape - assert list(cu_df.columns.values) == list(pd_df.columns.values) - - -def test_csv_reader_dtype_inference_whitespace(): - names = ["float_point", "integer"] - lines = [ - ",".join(names), - " 1.2, 1", - "2.3,2 ", - " 3.4, 3", - " 4.5,4", - "5.6, 5", - " 6.7,6 ", - ] - buffer = "\n".join(lines) - cu_df = read_csv(StringIO(buffer)) - pd_df = pd.read_csv(StringIO(buffer)) - - assert cu_df.shape == pd_df.shape - assert list(cu_df.columns.values) == list(pd_df.columns.values) - - -def test_csv_reader_empty_dataframe(): - dtypes = ["float64", "int64"] - buffer = "float_point, integer" - - # should work fine with dtypes - df = read_csv(StringIO(buffer), dtype=dtypes) - assert df.shape == (0, 2) - assert all(df.dtypes == ["float64", "int64"]) - - # should default to string columns without dtypes - df = read_csv(StringIO(buffer)) - assert df.shape == (0, 2) - assert all(df.dtypes == ["object", "object"]) - - -def test_csv_reader_filenotfound(tmpdir): - fname = "non-existing-filename.csv" - - # should raise an error - with pytest.raises(FileNotFoundError): - read_csv(str(fname)) - - # should raise an error - dname = tmpdir.mkdir("gdf_csv") - with pytest.raises(FileNotFoundError): - read_csv(str(dname)) - - -@pytest.mark.parametrize( - "src", - [ - lambda path: str(path), - lambda path: path, - lambda path: BytesIO(path.read_bytes()), - lambda path: StringIO(path.read_text()), - lambda path: path.as_uri(), - ], - ids=["filepath", "pathlib.Path", "ByteIO", "StringIO", "url"], -) -def test_csv_reader_filepath_or_buffer(tmp_path, src): - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=np.int32) - csv_path = tmp_path / "tmp.csv" - df.to_csv(csv_path, index=False, header=False) - expect = pd.read_csv(csv_path) - got = cudf.read_csv(src(csv_path)) - - assert_eq(expect, got) - - -def test_small_zip(tmp_path): - df = pd.DataFrame( - { - "a": [1997] * 2, - "b": ["Ford"] * 2, - "c": ["Super, luxurious truck"] * 2, - } - ) - - fname = tmp_path / "small_zip_file.zip" - df.to_csv(fname, index=False) - - got = cudf.read_csv(fname) - assert_eq(df, got) - - -def test_csv_reader_carriage_return(): - rows = 100 - names = ["int_row", "int_double_row"] - buffer = ",".join(names) + "\r\n" - for row in range(rows): - buffer += str(row) + ", " + str(2 * row) + "\r\n" - - df = read_csv(StringIO(buffer)) - expect = cudf.DataFrame( - {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2} - ) - - assert len(df) == rows - assert_eq(expect, df) - - -def test_csv_reader_tabs(): - names = ["float_point", "integer", "date"] - lines = [ - ",".join(names), - "1.2,\t12, \t11/22/1995", - "3.4\t,\t34\t,\t 01/01/2001", - "\t 5.6,56 \t, 12/12/1970", - "\t7.8 , 78\t,06/15/2018 \t", - ] - buffer = "\n".join(lines) - - df = read_csv(StringIO(buffer), parse_dates=["date"]) - - assert df.shape == (4, 3) - - floats = [1.2, 3.4, 5.6, 7.8] - ints = [12, 34, 56, 78] - dates = [ - "1995-11-22T00:00:00.000000000", - "2001-01-01T00:00:00.000000000", - "1970-12-12T00:00:00.000000000", - 
"2018-06-15T00:00:00.000000000", - ] - np.testing.assert_allclose(floats, df["float_point"].to_numpy()) - np.testing.assert_allclose(ints, df["integer"].to_numpy()) - for row in range(4): - assert str(df["date"][row]) == dates[row] - - -@pytest.mark.parametrize("segment_bytes", [10000, 19999, 30001, 36000]) -def test_csv_reader_byte_range(tmp_path, segment_bytes): - fname = tmp_path / "tmp_csvreader_file16.csv" - - names = ["int1", "int2"] - - rows = 10000 - with open(str(fname), "w") as fp: - for i in range(rows): - fp.write(str(i) + ", " + str(2 * i) + " \n") - file_size = os.stat(str(fname)).st_size - - ref_df = read_csv(str(fname), names=names).to_pandas() - - dfs = [] - for segment in range((file_size + segment_bytes - 1) // segment_bytes): - dfs.append( - read_csv( - str(fname), - names=names, - byte_range=(segment * segment_bytes, segment_bytes), - ) - ) - df = cudf.concat(dfs).to_pandas() - - assert list(df["int1"]) == list(ref_df["int1"]) - assert list(df["int2"]) == list(ref_df["int2"]) - - -def test_csv_reader_byte_range_type_corner_case(tmp_path): - fname = tmp_path / "tmp_csvreader_file17.csv" - - cudf.datasets.timeseries( - start="2000-01-01", - end="2000-01-02", - dtypes={"name": str, "id": int, "x": float, "y": float}, - ).to_csv(fname, chunksize=100000) - - byte_range = (2_147_483_648, 0) - with pytest.raises(ValueError, match="Invalid byte range offset"): - cudf.read_csv(fname, byte_range=byte_range, header=None) - - -@pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) -def test_csv_reader_byte_range_strings(segment_bytes): - names = ["strings"] - buffer = "\n".join('"' + str(x) + '"' for x in range(1, 100)) - file_size = len(buffer) - - ref_df = read_csv(StringIO(buffer), names=names).to_pandas() - - dfs = [] - for segment in range((file_size + segment_bytes - 1) // segment_bytes): - dfs.append( - read_csv( - StringIO(buffer), - names=names, - byte_range=(segment * segment_bytes, segment_bytes), - ) - ) - df = cudf.concat(dfs).to_pandas() - - assert list(df["strings"]) == list(ref_df["strings"]) - - -@pytest.mark.parametrize( - "header_row, skip_rows, skip_blanks", - [ - (1, 0, True), - ("infer", 2, True), - (1, 4, True), - (3, 0, False), - ("infer", 5, False), - ], -) -@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) -def test_csv_reader_blanks_and_comments( - skip_rows, header_row, skip_blanks, lineterminator -): - lines = [ - "# first comment line", - lineterminator, - "# third comment line", - "1,2,3", - "4,5,6", - "7,8,9", - lineterminator, - "# last comment line", - lineterminator, - "1,1,1", - ] - buffer = lineterminator.join(lines) - - cu_df = read_csv( - StringIO(buffer), - comment="#", - header=header_row, - skiprows=skip_rows, - skip_blank_lines=skip_blanks, - ) - pd_df = pd.read_csv( - StringIO(buffer), - comment="#", - header=header_row, - skiprows=skip_rows, - skip_blank_lines=skip_blanks, - ) - - assert cu_df.shape == pd_df.shape - assert list(cu_df.columns.values) == list(pd_df.columns.values) - - -def test_csv_reader_prefix(): - lines = ["1, 1, 1, 1"] - buffer = "\n".join(lines) - - prefix_str = "a_prefix" - df = read_csv(StringIO(buffer), header=None, prefix=prefix_str) - - column_names = list(df.columns.values) - for col in range(len(column_names)): - assert column_names[col] == prefix_str + str(col) - - -def test_csv_reader_delim_whitespace(): - buffer = "1 2 3\n4 5 6" - - # with header row - with pytest.warns(FutureWarning): - cu_df = read_csv(StringIO(buffer), delim_whitespace=True) - with expect_warning_if(PANDAS_GE_220): - pd_df 
= pd.read_csv(StringIO(buffer), delim_whitespace=True) - assert_eq(pd_df, cu_df) - - # without header row - with pytest.warns(FutureWarning): - cu_df = read_csv(StringIO(buffer), delim_whitespace=True, header=None) - with expect_warning_if(PANDAS_GE_220): - pd_df = pd.read_csv( - StringIO(buffer), delim_whitespace=True, header=None - ) - assert pd_df.shape == cu_df.shape - - # should raise an error if used with delimiter or sep - with pytest.raises(ValueError): - with pytest.warns(FutureWarning): - read_csv(StringIO(buffer), delim_whitespace=True, delimiter=" ") - with pytest.raises(ValueError): - with pytest.warns(FutureWarning): - read_csv(StringIO(buffer), delim_whitespace=True, sep=" ") - - -def test_csv_reader_unnamed_cols(): - # first and last columns are unnamed - buffer = ",1,2,3,\n4,5,6,7,8" - - cu_df = read_csv(StringIO(buffer)) - pd_df = pd.read_csv(StringIO(buffer)) - - assert all(pd_df.columns == cu_df.columns) - assert pd_df.shape == cu_df.shape - - -def test_csv_reader_header_quotation(): - buffer = '"1,,1","2,\n,2",3\n+4,+5,+6' - - cu_df = read_csv(StringIO(buffer)) - pd_df = pd.read_csv(StringIO(buffer)) - assert cu_df.shape == (1, 3) - assert_eq(pd_df, cu_df) - - # test cases that fail with pandas - buffer_pd_fail = '"1,one," , ",2,two" ,3\n4,5,6' - cu_df = read_csv(StringIO(buffer_pd_fail)) - assert cu_df.shape == (1, 3) - - -def test_csv_reader_oversized_byte_range(): - buffer = "a,b,c,d,e\n4,5,6,7,8" - - cu_df = read_csv(StringIO(buffer), byte_range=(0, 1024)) - pd_df = pd.read_csv(StringIO(buffer)) - - assert all(pd_df.columns == cu_df.columns) - assert pd_df.shape == cu_df.shape - - -def test_csv_reader_index_col(): - buffer = "0,1,2\n3,4,5\n6,7,8" - names = ["int1", "int2", "int3"] - - # using a column name - cu_df = read_csv(StringIO(buffer), names=names, index_col="int1") - pd_df = pd.read_csv(StringIO(buffer), names=names, index_col="int1") - assert_eq(pd_df, cu_df) - - # using a column index - cu_df = read_csv(StringIO(buffer), header=None, index_col=0) - pd_df = pd.read_csv(StringIO(buffer), header=None, index_col=0) - assert_eq(cu_df.index, pd_df.index) - - # using a column index with names - cu_df = read_csv(StringIO(buffer), header=None, index_col=0, names=names) - pd_df = pd.read_csv( - StringIO(buffer), header=None, index_col=0, names=names - ) - assert_eq(cu_df.index, pd_df.index) - - # passing False to avoid using a column as index (no-op in cuDF) - cu_df = read_csv(StringIO(buffer), header=None, index_col=False) - pd_df = pd.read_csv(StringIO(buffer), header=None, index_col=False) - assert_eq(cu_df.index, pd_df.index) - - -@pytest.mark.parametrize("index_name", [None, "custom name", 124]) -@pytest.mark.parametrize("index_col", [None, 0, "a"]) -def test_csv_reader_index_names(index_name, index_col): - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": [10, 11, 12]}, index=["AB", "CD", "EF"] - ) - pdf.index.name = index_name - - buffer = pdf.to_csv() - actual = cudf.read_csv(StringIO(buffer), index_col=index_col) - expected = pd.read_csv(StringIO(buffer), index_col=index_col) - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "names", [["a", "b", "c"], [416, 905, 647], range(3), None] -) -def test_csv_reader_column_names(names): - buffer = "0,1,2\n3,4,5\n6,7,8" - - df = read_csv(StringIO(buffer), names=names) - if names is None: - assert list(df) == ["0", "1", "2"] - else: - assert list(df) == list(names) - - -def test_csv_reader_repeated_column_name(): - buffer = """A,A,A.1,A,A.2,A,A.4,A,A - 1,2,3.1,4,a.2,a,a.4,a,a - 2,4,6.1,8,b.2,b,b.4,b,b""" - - # 
pandas and cudf to have same repeated column names - pdf = pd.read_csv(StringIO(buffer)) - gdf = cudf.read_csv(StringIO(buffer)) - assert_eq(pdf.columns, gdf.columns) - - -def test_csv_reader_bools_false_positives(): - # values that are equal to ["True", "TRUE", "False", "FALSE"] - # when using ints to detect bool values - items = [3977, 4329, 24015, 27567] - - buffer = "\n".join(str(i) for i in items) - - df = read_csv(StringIO(buffer), header=None, dtype=["int32"]) - - np.testing.assert_array_equal(items, df["0"].to_numpy()) - - -def test_csv_reader_aligned_byte_range(tmp_path): - fname = tmp_path / "tmp_csvreader_file19.csv" - nelem = 1000 - - input_df = pd.DataFrame( - {"key": np.arange(0, nelem), "zeros": np.zeros(nelem)} - ) - input_df.to_csv(fname) - - df = cudf.read_csv(str(fname), byte_range=(0, 4096)) - # read_csv call above used to crash; the assert below is not crucial - assert np.count_nonzero(df["zeros"].to_pandas().values) == 0 - - -@pytest.mark.parametrize( - "pdf_dtype, gdf_dtype", - [(None, None), ("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], -) -def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): - lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF"] - values = [int(hex_int, 16) for hex_int in lines] - - buffer = "\n".join(lines) - - if gdf_dtype is not None: - # require explicit `hex` dtype to parse hexadecimals - pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"]) - gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) - np.testing.assert_array_equal( - pdf["hex_int"], gdf["hex_int"].to_numpy() - ) - else: - # otherwise, dtype inference returns as object (string) - pdf = pd.read_csv(StringIO(buffer), names=["hex_int"]) - gdf = read_csv(StringIO(buffer), names=["hex_int"]) - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "np_dtype, gdf_dtype", - [("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], -) -def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype): - # This tests values which cause an overflow warning that will become an - # error in pandas. NumPy wraps the overflow silently up to the bounds of a - # signed int64. 
- lines = [ - "0x0", - "-0x1000", - "0xfedcba", - "0xABCDEF", - "0xaBcDeF", - "0x9512c20b", - "0x7fffffff", - "0x7fffffffffffffff", - "-0x8000000000000000", - ] - values = [int(hex_int, 16) for hex_int in lines] - buffer = "\n".join(lines) - - gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) - - expected = np.array(values).astype(np_dtype) - actual = gdf["hex_int"].to_numpy() - np.testing.assert_array_equal(expected, actual) - - -@pytest.mark.parametrize("quoting", [0, 1, 2, 3]) -def test_csv_reader_pd_consistent_quotes(quoting): - names = ["text"] - dtypes = ["str"] - lines = ['"a"', '"b ""c"" d"', '"f!\n."'] - - buffer = "\n".join(lines) - - gd_df = read_csv( - StringIO(buffer), names=names, dtype=dtypes, quoting=quoting - ) - pd_df = pd.read_csv(StringIO(buffer), names=names, quoting=quoting) - - assert_eq(pd_df, gd_df) - - -def test_read_csv_names_header_combination(): - pdf = pd.DataFrame( - { - "firstname": ["Emma", "Ava", "Sophia"], - "lastname": ["Olivia", "Isabella", "Charlotte"], - "gender": ["F", "F", "F"], - } - ) - buffer = pdf.to_csv(header=True, index=False) - names = pdf.columns - - gdf = read_csv(StringIO(buffer), names=names, header=0) - assert_eq(pdf, gdf) - - gdf = read_csv(StringIO(buffer), header=0) - assert_eq(pdf, gdf) - - gdf = read_csv(StringIO(buffer)) - assert_eq(pdf, gdf) - - -def test_csv_reader_scientific_type_detection(): - buffer = """1.,1.1,-1.1,1E1,1e1,-1e1,-1e-1,1e-1,1.1e1,1.1e-1,-1.1e-1,-1.1e1 - +1.1,1E+1,1e+1,+1e1,+1e-1,1e-1,+1.1e1,1.1e+1,+1.1e+1,+1.1e1""" - expected = [ - 1.0, - 1.1, - -1.1, - 10.0, - 10.0, - -10, - -0.1, - 0.1, - 11, - 0.11, - -0.11, - -11, - 1.1, - 10.0, - 10.0, - 10, - 0.1, - 0.1, - 11, - 11, - 11, - 11, - ] - - df = read_csv(StringIO(buffer), header=None) - - for dt in df.dtypes: - assert dt == "float64" - for col in df: - assert np.isclose(df[col][0], expected[int(col)]) - - -@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) -def test_csv_blank_first_row(lineterminator): - lines = ["colA,colB", "", "1, 1.1", "2, 2.2"] - buffer = lineterminator.join(lines) - - cu_df = read_csv(StringIO(buffer)) - - assert cu_df.shape == (2, 2) - assert all(cu_df.columns == ["colA", "colB"]) - - -@pytest.mark.parametrize("contents", ["", "\n"]) -def test_csv_empty_file(tmp_path, contents): - fname = tmp_path / "test_csv_empty_file.csv" - with open(fname, "w") as f: - f.write(contents) - - col_names = ["col1", "col2", "col3", "col4"] - in_dtypes = ["int", "str", "float", "short"] - out_dtypes = ["int64", "object", "float64", "int16"] - - # Empty dataframe if no columns names specified or inferred - df = read_csv(str(fname)) - assert len(df.columns) == 0 - - # No row dataframe if columns names are specified or inferred - df = read_csv(str(fname), dtype=in_dtypes, names=col_names) - assert all(df.columns == col_names) - assert list(df.dtypes) == out_dtypes - - -@pytest.mark.parametrize("contents", ["", "\n"]) -def test_csv_empty_buffer(contents): - col_names = ["col1", "col2", "col3", "col4"] - in_dtypes = ["int", "str", "float", "short"] - out_dtypes = ["int64", "object", "float64", "int16"] - - # Empty dataframe if no columns names specified or inferred - df = read_csv(StringIO(contents)) - assert len(df.columns) == 0 - - # No row dataframe if columns names are specified or inferred - df = read_csv(StringIO(contents), dtype=in_dtypes, names=col_names) - assert all(df.columns == col_names) - assert list(df.dtypes) == out_dtypes - - -@pytest.mark.parametrize( - "dtype", [["short", "float", "int"], {"A": "short", "C": 
"int"}] -) -def test_csv_reader_partial_dtype(dtype): - names_df = read_csv( - StringIO("0,1,2"), - names=["A", "B", "C"], - dtype=dtype, - usecols=["A", "C"], - ) - header_df = read_csv( - StringIO('"A","B","C"\n0,1,2'), dtype=dtype, usecols=["A", "C"] - ) - - assert_eq(names_df, header_df) - assert all(names_df.dtypes == ["int16", "int64"]) - - -def test_csv_writer_file_handle(tmp_path): - df = pd.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) - gdf = cudf.from_pandas(df) - - gdf_df_fname = tmp_path / "gdf_df_1.csv" - with open(gdf_df_fname, "w") as f: - gdf.to_csv(path_or_buf=f, index=False) - assert os.path.exists(gdf_df_fname) - - gdf2 = pd.read_csv(gdf_df_fname) - assert_eq(gdf, gdf2) - - -def test_csv_writer_file_append(tmp_path): - gdf1 = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) - gdf2 = cudf.DataFrame({"a": [4, 5, 6], "b": ["foo", "bar", "baz"]}) - - gdf_df_fname = tmp_path / "gdf_df_append.csv" - with open(gdf_df_fname, "w") as f: - gdf1.to_csv(f, index=False) - with open(gdf_df_fname, "a") as f: - gdf2.to_csv(f, header=False, index=False) - - result = cudf.read_csv(gdf_df_fname) - expected = cudf.concat([gdf1, gdf2], ignore_index=True) - assert_eq(result, expected, check_index_type=True) - - -def test_csv_writer_buffer(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) - - buffer = BytesIO() - gdf.to_csv(buffer, index=False) - - result = cudf.read_csv(buffer) - assert_eq(result, gdf) - - -def test_csv_writer_numeric_data(dtype, numeric_dataframe, tmp_path): - pdf_df_fname = tmp_path / "pdf_df_1.csv" - gdf_df_fname = tmp_path / "gdf_df_1.csv" - - df = numeric_dataframe.astype(dtype) - gdf = cudf.from_pandas(df) - df.to_csv(path_or_buf=pdf_df_fname, index=False, lineterminator="\n") - gdf.to_csv(path_or_buf=gdf_df_fname, index=False) - - assert os.path.exists(pdf_df_fname) - assert os.path.exists(gdf_df_fname) - - expect = pd.read_csv(pdf_df_fname) - got = pd.read_csv(gdf_df_fname) - assert_eq(expect, got) - - -def test_csv_writer_datetime_data(datetime_dataframe, tmp_path): - pdf_df_fname = tmp_path / "pdf_df_2.csv" - gdf_df_fname = tmp_path / "gdf_df_2.csv" - - df = datetime_dataframe - gdf = cudf.from_pandas(df) - df.to_csv(path_or_buf=pdf_df_fname, index=False, lineterminator="\n") - gdf.to_csv(path_or_buf=gdf_df_fname, index=False) - - assert os.path.exists(pdf_df_fname) - assert os.path.exists(gdf_df_fname) - - expect = pd.read_csv(pdf_df_fname) - got = pd.read_csv(gdf_df_fname) - assert_eq(expect, got) - - -@pytest.mark.parametrize("lineterminator", ["\r", "\n", "\t", np.str_("\n")]) -@pytest.mark.parametrize("sep", [",", "/", np.str_(",")]) -def test_csv_writer_terminator_sep(lineterminator, sep, cudf_mixed_dataframe): - df = cudf_mixed_dataframe - - buffer = BytesIO() - df.to_csv(buffer, lineterminator=lineterminator, sep=sep, index=False) - - got = read_csv(buffer, lineterminator=lineterminator, sep=sep) - assert_eq(df, got) - - -@pytest.mark.parametrize( - "lineterminator", ["\r\n", "ABC", "\t\t", np.str_("\r\n")] -) -def test_csv_writer_multichar_terminator(lineterminator, cudf_mixed_dataframe): - df = cudf_mixed_dataframe - - default_terminator_csv = StringIO() - df.to_csv(default_terminator_csv) - - # Need to check manually since readers don't support - # multicharacter line terminators - expected = default_terminator_csv.getvalue().replace("\n", lineterminator) - - buffer = StringIO() - df.to_csv(buffer, lineterminator=lineterminator) - got = buffer.getvalue() - - assert_eq(expected, got) - - 
-@pytest.mark.parametrize( - "columns", - [ - ["Date", "Float"], - ["Integer2", "Float", "Date", "Integer", "String", "Boolean"], - None, - ], -) -@pytest.mark.parametrize("header", [True, False]) -@pytest.mark.parametrize("index", [True, False]) -@pytest.mark.parametrize("bool_box", [bool, np.bool_]) -def test_csv_writer_column_and_header_options( - columns, header, index, bool_box, pd_mixed_dataframe -): - header = bool_box(header) - index = bool_box(index) - pdf = pd_mixed_dataframe - df = cudf.from_pandas(pdf) - - cudf_buffer = BytesIO() - df.to_csv(cudf_buffer, columns=columns, header=header, index=index) - pd_buffer = BytesIO() - pdf.to_csv(pd_buffer, columns=columns, header=header, index=index) - - expected = cudf.read_csv(pd_buffer, header=0 if header else None) - got = cudf.read_csv(cudf_buffer, header=0 if header else None) - - expected_column_cnt = (1 if index else 0) + ( - len(columns) if columns else pdf.shape[1] - ) - assert_eq(expected_column_cnt, got.shape[1]) - assert_eq(expected, got) - - -def test_csv_writer_empty_columns_parameter(cudf_mixed_dataframe): - write_str = cudf_mixed_dataframe.to_csv(columns=[], index=False) - assert_eq(write_str, "\n") - - -def test_csv_writer_multiindex(tmp_path): - pdf_df_fname = tmp_path / "pdf_df_3.csv" - gdf_df_fname = tmp_path / "gdf_df_3.csv" - - rng = np.random.default_rng(seed=0) - gdf = cudf.DataFrame( - { - "a": rng.integers(0, 5, 20), - "b": rng.integers(0, 5, 20), - "c": range(20), - "d": rng.random(20), - } - ) - gdg = gdf.groupby(["a", "b"]).mean() - pdg = gdg.to_pandas() - pdg.to_csv(pdf_df_fname) - gdg.to_csv(gdf_df_fname) - - assert os.path.exists(pdf_df_fname) - assert os.path.exists(gdf_df_fname) - - expect = pd.read_csv(pdf_df_fname) - got = pd.read_csv(gdf_df_fname) - assert_eq(expect, got) - - -@pytest.mark.parametrize("chunksize", [None, 2, 1000]) -def test_csv_writer_chunksize(chunksize, numeric_dataframe, dtype): - cu_df = cudf.from_pandas(numeric_dataframe.astype(dtype)) - - buffer = BytesIO() - cu_df.to_csv(buffer, chunksize=chunksize, index=False) - - got = cudf.read_csv(buffer, dtype=[dtype]) - assert_eq(cu_df, got) - - -@pytest.mark.parametrize( - "data", - [ - {"vals": [1, 2, 3]}, - {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]}, - {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}, - ], -) -def test_to_csv_empty_filename(data): - df = cudf.DataFrame(data) - pdf = df.to_pandas() - - actual = df.to_csv() - expected = pdf.to_csv() - - assert actual == expected - - -@pytest.mark.parametrize( - "data", - [ - {"vals": [1, 2, 3]}, - {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]}, - {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]}, - ], -) -def test_to_csv_StringIO(data): - df = cudf.DataFrame(data) - cudf_io = StringIO() - pandas_io = StringIO() - - pdf = df.to_pandas() - - df.to_csv(cudf_io) - pdf.to_csv(pandas_io) - - cudf_io.seek(0) - pandas_io.seek(0) - - assert cudf_io.read() == pandas_io.read() - - -def test_csv_writer_empty_dataframe(tmp_path): - df_fname = tmp_path / "gdf_df_5.csv" - gdf = cudf.DataFrame({"float_point": [], "integer": []}) - gdf["float_point"] = gdf["float_point"].astype("float") - gdf["integer"] = gdf["integer"].astype("int") - - gdf.to_csv(df_fname, index=False) - - df = cudf.read_csv(df_fname) - - assert df.shape == (0, 2) - assert all(df.dtypes == ["object", "object"]) - - -def test_csv_write_chunksize_corner_case(tmp_path): - # With this num of rows and chunksize - # libcudf splits table such a way that it - # will end up creating an 
empty table slice
-    # which caused issue 5588.
-    df_fname = tmp_path / "gdf_df_17.csv"
-    df = cudf.DataFrame({"a": np.arange(10_000)})
-    df.to_csv(df_fname, chunksize=1000, index=False)
-    got = cudf.read_csv(df_fname)
-
-    assert_eq(df, got)
-
-
-def test_csv_write_no_caller_manipulation():
-    df = cudf.DataFrame({"a": [1, 2, 3]})
-    df_copy = df.copy(deep=True)
-    _ = df.to_csv(index=True)
-    assert_eq(df, df_copy)
-
-
-@pytest.mark.parametrize(
-    "pdf",
-    [
-        pd.DataFrame({"a": [1, 2, 3], "": [10, 20, 40]}),
-        pd.DataFrame({"": [10, 20, 40], "a": [1, 2, 3]}),
-        pd.DataFrame(
-            {"a": [1, 2, 3], "": [10, 20, 40]},
-            index=pd.Index(["a", "z", "v"], name="custom name"),
-        ),
-    ],
-)
-@pytest.mark.parametrize("index", [True, False])
-@pytest.mark.parametrize("columns", [["a"], [""], None])
-def test_csv_write_empty_column_name(pdf, index, columns):
-    df = cudf.DataFrame.from_pandas(pdf)
-    expected = pdf.to_csv(index=index, columns=columns)
-    actual = df.to_csv(index=index, columns=columns)
-
-    assert expected == actual
-
-
-@pytest.mark.parametrize("idx", [None, pd.Index([], name="index name")])
-@pytest.mark.parametrize("index", [True, False])
-def test_csv_write_empty_dataframe(idx, index):
-    df = cudf.DataFrame(index=idx)
-    pdf = df.to_pandas()
-
-    expected = pdf.to_csv(index=index)
-    actual = df.to_csv(index=index)
-
-    assert expected == actual
-
-
-@pytest.mark.parametrize(
-    "df",
-    [
-        pd.DataFrame(
-            {
-                "a": [1, 2, 3, None],
-                "": ["a", "v", None, None],
-                None: [12, 12, 32, 44],
-            }
-        ),
-        pd.DataFrame(
-            {
-                np.nan: [1, 2, 3, None],
-                "": ["a", "v", None, None],
-                None: [12, 12, 32, 44],
-            }
-        ),
-        pd.DataFrame({"": [1, None, 3, 4]}),
-        pd.DataFrame({None: [1, None, 3, 4]}),
-        pd.DataFrame(columns=[None, "", "a", "b"]),
-        pd.DataFrame(columns=[None]),
-        pd.DataFrame(columns=[""]),
-    ],
-)
-@pytest.mark.parametrize(
-    "na_rep", ["", "_NA_", "---", "_____CUSTOM_NA_REP______"]
-)
-def test_csv_write_dataframe_na_rep(df, na_rep):
-    gdf = cudf.from_pandas(df)
-
-    expected = df.to_csv(na_rep=na_rep)
-    actual = gdf.to_csv(na_rep=na_rep)
-
-    assert expected == actual
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "int",
-        "str",
-        "float",
-        np.int32,
-        np.dtype("float32"),
-        {"a": "int32", "b": "float64", "c": "uint8"},
-        int,
-        str,
-        object,
-    ],
-)
-def test_csv_reader_dtypes(dtype):
-    buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n4,13,114\n"
-
-    expected = pd.read_csv(StringIO(buf), dtype=dtype)
-    actual = cudf.read_csv(StringIO(buf), dtype=dtype)
-
-    assert_eq(expected, actual)
-
-
-@pytest.mark.parametrize(
-    "dtype", ["Int64", "UInt32", {"a": "UInt64", "b": "Float64", "c": "Int32"}]
-)
-def test_csv_reader_nullable_dtypes(dtype):
-    buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n4,13,114\n"
-
-    expected = pd.read_csv(StringIO(buf), dtype=dtype)
-    actual = cudf.read_csv(StringIO(buf), dtype=dtype)
-
-    assert_eq(expected, actual.to_pandas(nullable=True))
-
-
-@pytest.mark.parametrize(
-    "dtype", sorted(list(cudf.utils.dtypes.TIMEDELTA_TYPES))
-)
-def test_csv_reader_timedelta_dtypes(dtype):
-    buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n43432423,13342,13243214\n"
-
-    expected = pd.read_csv(StringIO(buf)).astype(dtype)
-    actual = cudf.read_csv(StringIO(buf), dtype=dtype)
-
-    assert_eq(expected, actual)
-
-
-@pytest.mark.parametrize(
-    "dtype", sorted(list(cudf.utils.dtypes.DATETIME_TYPES))
-)
-def test_csv_reader_datetime_dtypes(dtype):
-    buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n43432423,13342,13243214\n"
-
-    expected = pd.read_csv(StringIO(buf)).astype(dtype)
-    actual = 
cudf.read_csv(StringIO(buf), dtype=dtype) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "df", - [ - lambda: cudf.DataFrame( - { - "a": cudf.Series([1, 2, 3, 1, 2], dtype="category"), - "b": cudf.Series(["a", "c", "a", "b", "a"], dtype="category"), - } - ), - lambda: cudf.DataFrame( - { - "a": cudf.Series([1.1, 2, 3, 1.1, 2], dtype="category"), - "b": cudf.Series( - [None, "c", None, "b", "a"], dtype="category" - ), - } - ), - lambda: cudf.DataFrame( - { - "b": cudf.Series( - [1.1, 2, 3, 1.1, 2], - dtype="category", - index=cudf.CategoricalIndex( - ["abc", "def", "ghi", "jkl", "xyz"] - ), - ) - } - ), - ], -) -def test_csv_writer_category(df): - df = df() - pdf = df.to_pandas() - - expected = pdf.to_csv() - actual = df.to_csv() - - assert expected == actual - - -@pytest.mark.parametrize( - "dtype", - [ - "category", - {"a": "category", "b": "str"}, - {"b": "category"}, - {"a": "category"}, - {"a": pd.CategoricalDtype([1, 2])}, - {"b": pd.CategoricalDtype([1, 2, 3])}, - {"b": pd.CategoricalDtype(["b", "a"]), "a": "str"}, - pd.CategoricalDtype(["a", "b"]), - ], -) -def test_csv_reader_category(dtype): - df = cudf.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", None, "c"]}) - csv_buf = df.to_csv() - - actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype) - expected = pd.read_csv(StringIO(csv_buf), dtype=dtype) - - assert_eq(expected, actual, check_dtype=True) - - -def test_csv_writer_datetime_sep(): - df = cudf.DataFrame( - {"a": cudf.Series([22343, 2323423, 234324234], dtype="datetime64[ns]")} - ) - df["a"] = df["a"].astype("datetime64[s]") - expected = df.to_pandas().to_csv(date_format="%Y-%m-%dT%H:%M:%SZ", sep="-") - actual = df.to_csv(sep="-") - assert expected == actual - - -def test_na_filter_empty_fields(): - test_na = "TEST_NAN" - df = pd.DataFrame({"col0": ["valid", None, "also_valid", "", test_na]}) - buffer = df.to_csv(index=False) - - pdf = pd.read_csv(StringIO(buffer), na_filter=False) - gdf = cudf.read_csv(StringIO(buffer), na_filter=False) - assert_eq(pdf, gdf) - - pdf = pd.read_csv(StringIO(buffer), keep_default_na=False) - gdf = cudf.read_csv(StringIO(buffer), keep_default_na=False) - assert_eq(pdf, gdf) - - pdf = pd.read_csv( - StringIO(buffer), keep_default_na=False, na_values=test_na - ) - gdf = cudf.read_csv( - StringIO(buffer), keep_default_na=False, na_values=test_na - ) - assert_eq(pdf, gdf) - - -def test_csv_sep_error(): - pdf = pd.DataFrame({"a": [1, 2, 3]}) - gdf = cudf.DataFrame({"a": [1, 2, 3]}) - assert_exceptions_equal( - lfunc=pdf.to_csv, - rfunc=gdf.to_csv, - lfunc_args_and_kwargs=([], {"sep": "abc"}), - rfunc_args_and_kwargs=([], {"sep": "abc"}), - ) - - assert_exceptions_equal( - lfunc=pdf.to_csv, - rfunc=gdf.to_csv, - lfunc_args_and_kwargs=([], {"sep": 1}), - rfunc_args_and_kwargs=([], {"sep": 1}), - ) - - -def test_to_csv_encoding_error(): - # TODO: Remove this test once following - # issue is fixed: https://github.com/rapidsai/cudf/issues/2957 - df = cudf.DataFrame({"a": ["你好", "test"]}) - encoding = "utf-8-sig" - error_message = ( - f"Encoding {encoding} is not supported. " - + "Currently, only utf-8 encoding is supported." 
- ) - with pytest.raises(NotImplementedError, match=re.escape(error_message)): - df.to_csv("test.csv", encoding=encoding) - - -def test_to_csv_compression_error(): - df = cudf.DataFrame({"a": ["test"]}) - compression = "snappy" - error_message = "Writing compressed csv is not currently supported in cudf" - with pytest.raises(NotImplementedError, match=re.escape(error_message)): - df.to_csv("test.csv", compression=compression) - - -def test_empty_df_no_index(): - actual = cudf.DataFrame({}) - buffer = BytesIO() - actual.to_csv(buffer, index=False) - - result = cudf.read_csv(buffer) - - assert_eq(actual, result) - - -def test_default_integer_bitwidth( - cudf_mixed_dataframe, default_integer_bitwidth -): - # Test that integer columns in csv are _inferred_ as user specified - # bitwidth - buf = BytesIO() - cudf_mixed_dataframe.to_csv(buf) - buf.seek(0) - read = cudf.read_csv(buf) - assert read["Integer"].dtype == np.dtype( - f"i{default_integer_bitwidth // 8}" - ) - assert read["Integer2"].dtype == np.dtype( - f"i{default_integer_bitwidth // 8}" - ) - - -def test_default_integer_bitwidth_partial( - cudf_mixed_dataframe, default_integer_bitwidth -): - # Test that integer columns in csv are _inferred_ as user specified - # bitwidth - buf = BytesIO() - cudf_mixed_dataframe.to_csv(buf) - buf.seek(0) - read = cudf.read_csv(buf, dtype={"Integer": "int64"}) - assert read["Integer"].dtype == np.dtype("i8") - assert read["Integer2"].dtype == np.dtype( - f"i{default_integer_bitwidth // 8}" - ) - - -@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") -def test_default_integer_bitwidth_extremes( - numeric_extremes_dataframe, default_integer_bitwidth -): - # Test that integer columns in csv are _inferred_ as user specified - # bitwidth - buf = BytesIO() - cudf.DataFrame.from_pandas(numeric_extremes_dataframe).to_csv(buf) - buf.seek(0) - read = cudf.read_csv(buf) - - assert read["int64"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") - assert read["long"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}") - assert read["uint64"].dtype == np.dtype( - f"u{default_integer_bitwidth // 8}" - ) - - -def test_default_float_bitwidth(cudf_mixed_dataframe, default_float_bitwidth): - # Test that float columns in csv are _inferred_ as user specified - # bitwidth - buf = BytesIO() - cudf_mixed_dataframe.to_csv(buf) - buf.seek(0) - read = cudf.read_csv(buf) - assert read["Float"].dtype == np.dtype(f"f{default_float_bitwidth // 8}") - - -def test_default_float_bitwidth_partial(default_float_bitwidth): - # Test that float columns in csv are _inferred_ as user specified - # bitwidth - read = cudf.read_csv( - StringIO("float1,float2\n1.0,2.0\n3.0,4.0"), - dtype={"float2": "float64"}, - ) - assert read["float1"].dtype == np.dtype(f"f{default_float_bitwidth // 8}") - assert read["float2"].dtype == np.dtype("f8") - - -@pytest.mark.parametrize( - "usecols,names", - [ - # selection using indices; only names of selected columns are specified - ([1, 2], ["b", "c"]), - # selection using indices; names of all columns are specified - ([1, 2], ["a", "b", "c"]), - # selection using indices; duplicates - ([2, 2], ["a", "b", "c"]), - # selection using indices; out of order - ([2, 1], ["a", "b", "c"]), - # selection using names - (["b"], ["a", "b", "c"]), - # selection using names; multiple columns - (["b", "c"], ["a", "b", "c"]), - # selection using names; duplicates - (["c", "c"], ["a", "b", "c"]), - # selection using names; out of order - (["c", "b"], ["a", "b", "c"]), - ], -) -def 
test_column_selection_plus_column_names(usecols, names): - lines = [ - "num,datetime,text", - "123,2018-11-13T12:00:00,abc", - "456,2018-11-14T12:35:01,def", - "789,2018-11-15T18:02:59,ghi", - ] - - buffer = "\n".join(lines) + "\n" - - assert_eq( - pd.read_csv(StringIO(buffer), usecols=usecols, names=names), - cudf.read_csv(StringIO(buffer), usecols=usecols, names=names), - ) - - -def test_read_compressed_BOM(tmp_path): - buffer = 'int, string\n1, "a"\n2, "b"\n3, "c"\n' - - fname = tmp_path / "tmp_csvreader_file20.gz" - with gzip.open(fname, "wt", encoding="utf-8") as f: - f.write(codecs.BOM_UTF8.decode("utf-8")) - f.write(buffer) - - assert_eq(pd.read_csv(fname), cudf.read_csv(fname)) - - -def test_read_header_none_pandas_compat_column_type(): - data = "1\n2\n" - with cudf.option_context("mode.pandas_compatible", True): - result = cudf.read_csv(StringIO(data), header=None).columns - expected = pd.read_csv(StringIO(data), header=None).columns - pd.testing.assert_index_equal(result, expected, exact=True) - - -@pytest.mark.parametrize("buffer", ["1", '"one"']) -def test_read_single_unterminated_row(buffer): - gdf = cudf.read_csv(StringIO(buffer), header=None) - assert_eq(gdf.shape, (1, 1)) - - -@pytest.mark.parametrize("buffer", ["\n", "\r\n"]) -def test_read_empty_only_row(buffer): - gdf = cudf.read_csv(StringIO(buffer), header=None) - assert_eq(gdf.shape, (0, 0)) - - -def test_read_empty_only_row_custom_terminator(): - gdf = cudf.read_csv(StringIO("*"), header=None, lineterminator="*") - assert_eq(gdf.shape, (0, 0)) - - -def test_empty_file_pandas_compat_raises(tmp_path): - empty_file = tmp_path / "empty.csv" - empty_file.touch() - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(pd.errors.EmptyDataError): - cudf.read_csv(StringIO()) - with pytest.raises(pd.errors.EmptyDataError): - cudf.read_csv(empty_file) - with pytest.raises(pd.errors.EmptyDataError): - cudf.read_csv(str(empty_file)) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py deleted file mode 100644 index 04c4d55afe6..00000000000 --- a/python/cudf/cudf/tests/test_feather.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
-
-import os
-from string import ascii_letters
-
-import numpy as np
-import pandas as pd
-import pyarrow as pa
-import pytest
-
-import cudf
-from cudf.testing import assert_eq
-from cudf.testing._utils import NUMERIC_TYPES
-
-
-@pytest.fixture(params=[0, 10])
-def pdf(request):
-    rng = np.random.default_rng(seed=0)
-    types = [*NUMERIC_TYPES, "bool"]
-    nrows = request.param
-
-    # Create a pandas dataframe with random data of mixed types
-    test_pdf = pd.DataFrame(
-        {
-            f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ)
-            for typ in types
-        }
-    )
-    # Delete the name of the column index, and rename the row index
-    test_pdf.columns.name = None
-    test_pdf.index.name = "index"
-
-    # Create non-numeric categorical data, which may otherwise get typecast
-    data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)]
-    test_pdf["col_category"] = pd.Series(data, dtype="category")
-
-    # Feather can't handle indexes properly
-    test_pdf = test_pdf.reset_index(drop=True)
-    test_pdf.index.name = None
-
-    return test_pdf
-
-
-@pytest.mark.filterwarnings("ignore:Using CPU")
-@pytest.mark.filterwarnings("ignore:Strings are not yet supported")
-@pytest.mark.parametrize(
-    "columns",
-    [["col_int8"], ["col_category"], ["col_int32", "col_float32"], None],
-)
-def test_feather_reader(pdf, columns, tmp_path):
-    feather_file = tmp_path / "test.feather"
-    pdf.to_feather(feather_file)
-    expect = pa.feather.read_table(feather_file, columns=columns).to_pandas()
-    got = (
-        cudf.read_feather(feather_file, columns=columns)
-        .to_arrow(preserve_index=False)
-        .to_pandas()
-    )
-
-    assert_eq(expect, got, check_categorical=False)
-
-
-@pytest.mark.filterwarnings("ignore:Using CPU")
-def test_feather_writer(tmp_path, pdf):
-    gdf = cudf.DataFrame.from_pandas(pdf)
-    pdf_fname = tmp_path / "pdf.feather"
-    gdf_fname = tmp_path / "gdf.feather"
-
-    pdf.to_feather(pdf_fname)
-    gdf.to_feather(gdf_fname)
-
-    assert os.path.exists(pdf_fname)
-    assert os.path.exists(gdf_fname)
-
-    expect = pa.feather.read_table(pdf_fname)
-    got = pa.feather.read_table(gdf_fname)
-
-    assert pa.Table.equals(expect, got)
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
deleted file mode 100644
index adcd3fff21d..00000000000
--- a/python/cudf/cudf/tests/test_json.py
+++ /dev/null
@@ -1,1467 +0,0 @@
-# Copyright (c) 2018-2025, NVIDIA CORPORATION.
-
-import copy
-import gzip
-import itertools
-import os
-from io import BytesIO, StringIO
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-import pyarrow as pa
-import pytest
-
-import cudf
-from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing import assert_eq
-from cudf.testing._utils import (
-    DATETIME_TYPES,
-    NUMERIC_TYPES,
-    TIMEDELTA_TYPES,
-    expect_warning_if,
-)
-
-
-def make_numeric_dataframe(nrows, dtype):
-    df = pd.DataFrame()
-    df["col1"] = np.arange(nrows, dtype=dtype)
-    df["col2"] = np.arange(1, 1 + nrows, dtype=dtype)
-    return df
-
-
-@pytest.fixture(params=[0, 1, 10, 100])
-def pdf(request):
-    rng = np.random.default_rng(seed=0)
-    types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"]
-    nrows = request.param
-
-    # Create a pandas dataframe with random data of mixed types
-    test_pdf = pd.DataFrame(
-        {
-            f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ)
-            for typ in types
-        }
-    )
-    # Delete the name of the column index, and rename the row index
-    test_pdf.columns.name = None
-    test_pdf.index.name = "test_index"
-
-    return test_pdf
-
-
-@pytest.fixture
-def gdf(pdf):
-    return cudf.DataFrame.from_pandas(pdf)
-
-
-@pytest.fixture(params=[0, 1, 10, 100])
-def gdf_writer_types(request):
-    # datetime64[us] and datetime64[ns] are unsupported due to a bug in the parser
-    types = [
-        *NUMERIC_TYPES,
-        "datetime64[s]",
-        "datetime64[ms]",
-        *TIMEDELTA_TYPES,
-        "bool",
-        "str",
-    ]
-    typer = {"col_" + val: val for val in types}
-    ncols = len(types)
-    nrows = request.param
-
-    # Create a cudf dataframe with consecutive values of mixed types
-    test_pdf = cudf.DataFrame(
-        [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
-        columns=pd.Index([f"col_{typ}" for typ in types]),
-    )
-
-    # Cast all the column dtypes to objects, rename them, and then cast to
-    # appropriate types
-    test_pdf = test_pdf.astype(typer)
-
-    return test_pdf
-
-
-index_params = [True, False]
-# tests limited to compression formats supported by pandas and cudf: bz2, gzip, zip, zstd
-compression_params = ["bz2", "gzip", "zip", "zstd", None]
-orient_params = ["columns", "records", "table", "split"]
-params = itertools.product(index_params, compression_params, orient_params)
-
-
-@pytest.fixture(params=params)
-def json_files(request, tmp_path_factory, pdf):
-    index, compression, orient = request.param
-    if index is False and orient not in ("split", "table"):
-        pytest.skip(
-            "'index=False' is only valid when 'orient' is 'split' or 'table'"
-        )
-    if index is False and orient == "table":
-        pytest.skip("'index=False' isn't valid when 'orient' is 'table'")
-    if index is True and orient not in ("split", "table", "index", "columns"):
-        pytest.skip(
-            "'index=True' is only valid when 'orient' is "
-            "'split', 'table', 'index', or 'columns'"
-        )
-    fname_df = tmp_path_factory.mktemp("json") / "test_df.json"
-    fname_series = tmp_path_factory.mktemp("json") / "test_series.json"
-    pdf.to_json(fname_df, index=index, compression=compression, orient=orient)
-    pdf["col_int32"].to_json(
-        fname_series, index=index, compression=compression, orient=orient
-    )
-    return (fname_df, fname_series, orient, compression)
-
-
-@pytest.mark.filterwarnings("ignore:Strings are not yet supported")
-@pytest.mark.filterwarnings("ignore:Using CPU")
-def test_json_reader(json_files):
-    path_df, path_series, orient, compression = json_files
-    expect_df = pd.read_json(path_df, orient=orient, compression=compression)
-    got_df = cudf.read_json(path_df, orient=orient, compression=compression)
-    if len(expect_df) == 0:
-        expect_df = 
expect_df.reset_index(drop=True)
-        expect_df.columns = expect_df.columns.astype("object")
-    if len(got_df) == 0:
-        got_df = got_df.reset_index(drop=True)
-
-    assert_eq(expect_df, got_df, check_categorical=False)
-
-    # Only these orients are allowed for Series, but this isn't enforced by pandas
-    if orient in ("split", "records", "index"):
-        expect_series = pd.read_json(
-            path_series, orient=orient, compression=compression, typ="series"
-        )
-        got_series = cudf.read_json(
-            path_series, orient=orient, compression=compression, typ="series"
-        )
-        if len(expect_series) == 0:
-            expect_series = expect_series.reset_index(drop=True)
-        if len(got_series) == 0:
-            got_series = got_series.reset_index(drop=True)
-
-        assert_eq(expect_series, got_series)
-
-
-@pytest.mark.filterwarnings("ignore:Can't infer compression")
-@pytest.mark.filterwarnings("ignore:Using CPU")
-def test_json_writer(tmpdir, pdf, gdf):
-    pdf_df_fname = tmpdir.join("pdf_df.json")
-    gdf_df_fname = tmpdir.join("gdf_df.json")
-
-    pdf.to_json(pdf_df_fname)
-    gdf.to_json(gdf_df_fname)
-
-    assert os.path.exists(pdf_df_fname)
-    assert os.path.exists(gdf_df_fname)
-
-    expect_df = pd.read_json(pdf_df_fname)
-    got_df = pd.read_json(gdf_df_fname)
-
-    assert_eq(expect_df, got_df)
-
-    for column in pdf.columns:
-        pdf_series_fname = tmpdir.join(column + "_" + "pdf_series.json")
-        gdf_series_fname = tmpdir.join(column + "_" + "gdf_series.json")
-
-        pdf[column].to_json(pdf_series_fname)
-        gdf[column].to_json(gdf_series_fname)
-
-        assert os.path.exists(pdf_series_fname)
-        assert os.path.exists(gdf_series_fname)
-
-        expect_series = pd.read_json(pdf_series_fname, typ="series")
-        got_series = pd.read_json(gdf_series_fname, typ="series")
-
-        assert_eq(expect_series, got_series)
-
-        # Make sure results align for regular strings, not just files
-        pdf_string = pdf[column].to_json()
-        gdf_string = gdf[column].to_json()
-        assert_eq(pdf_string, gdf_string)
-
-
-@pytest.mark.parametrize(
-    "lines", [True, False], ids=["lines=True", "lines=False"]
-)
-def test_cudf_json_writer(pdf, lines):
-    # Remove datetime columns because pandas doesn't support them here
-    for col_name in pdf.columns:
-        if "datetime" in col_name:
-            pdf.drop(col_name, axis=1, inplace=True)
-    gdf = cudf.DataFrame.from_pandas(pdf)
-    pdf_string = pdf.to_json(orient="records", lines=lines)
-    gdf_string = gdf.to_json(orient="records", lines=lines, engine="cudf")
-
-    assert_eq(pdf_string, gdf_string)
-
-    gdf_string = gdf.to_json(
-        orient="records", lines=lines, engine="cudf", rows_per_chunk=8
-    )
-
-    assert_eq(pdf_string, gdf_string)
-
-
-def test_cudf_json_writer_read(gdf_writer_types):
-    dtypes = {
-        col_name: col_name[len("col_") :]
-        for col_name in gdf_writer_types.columns
-    }
-    gdf_string = gdf_writer_types.to_json(
-        orient="records", lines=True, engine="cudf"
-    )
-    gdf2 = cudf.read_json(
-        StringIO(gdf_string),
-        lines=True,
-        engine="cudf",
-        dtype=dict(dtypes),
-    )
-    pdf2 = pd.read_json(StringIO(gdf_string), lines=True, dtype=dict(dtypes))
-
-    # Bug in pandas https://github.com/pandas-dev/pandas/issues/28558
-    if pdf2.empty:
-        pdf2.reset_index(drop=True, inplace=True)
-
-    # Pandas moved to a consistent datetime parsing format:
-    # https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#datetimes-are-now-parsed-with-a-consistent-format
-    for unit in ["s", "ms"]:
-        if f"col_datetime64[{unit}]" in pdf2.columns:
-            pdf2[f"col_datetime64[{unit}]"] = (
-                pd.to_datetime(pdf2[f"col_datetime64[{unit}]"], format="mixed")
-                .dt.tz_localize(None)
-                .astype(f"datetime64[{unit}]")
-            )
-    assert_eq(pdf2, gdf2)
-
-
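-# [Editor's note: illustrative sketch, not part of the original test file.]
-# rows_per_chunk, exercised in test_cudf_json_writer above, only controls how
-# the cudf engine batches rows internally; the serialized output is expected
-# to be byte-identical regardless of the chunk size, e.g.:
-#
-#     gdf = cudf.DataFrame({"a": range(20)})
-#     s1 = gdf.to_json(orient="records", lines=True, engine="cudf")
-#     s2 = gdf.to_json(
-#         orient="records", lines=True, engine="cudf", rows_per_chunk=8
-#     )
-#     assert s1 == s2
-
-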
-@pytest.mark.parametrize( - "jsonl_string, expected", - [ - # fixed width - ("""{"a":10, "b":1.1}\n {"a":20, "b":2.1}\n""", None), - # simple list - ("""{"a":[1, 2, 3], "b":1.1}\n {"a":[]}\n""", None), - # simple struct - ("""{"a":{"c": 123 }, "b":1.1}\n {"a": {"c": 456}}\n""", None), - # list of lists - ("""{"a":[[], [1, 2], [3, 4]], "b":1.1}\n""", None), - ("""{"a":[null, [1, 2], [null, 4]], "b":1.1}\n""", None), - # list of structs - # error ("""{"a":[null, {}], "b":1.1}\n""", None), - ( - """{"a":[null, {"L": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""", - None, - ), - ( - """{"a":[{"L": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""", - None, - ), - # struct of lists - ( - """{"a":{"L": [1, 2, 3]}, "b":1.1}\n {"a": {"L": [4, 5, 6]}}\n""", - None, - ), - ("""{"a":{"L": [1, 2, null]}, "b":1.1}\n {"a": {"L": []}}\n""", None), - # struct of structs - ( - """{"a":{"L": {"M": 123}}, "b":1.1} - {"a": {"L": {"M": 456}}}\n""", - None, - ), - ( - """{"a":{"L": {"M": null}}, "b":1.1}\n {"a": {"L": {}}}\n""", - """{"a":{"L": {}}, "b":1.1}\n {"a": {"L": {}}}\n""", - ), - # list of structs of lists - ("""{"a":[{"L": [1, 2, 3]}, {"L": [4, 5, 6]}], "b":1.1}\n""", None), - ("""{"a":[{"L": [1, 2, null]}, {"L": []}], "b":1.1}\n""", None), - # struct of lists of structs - ("""{"a":{"L": [{"M": 123}, {"M": 456}]}, "b":1.1}\n""", None), - ( - """{"a":{"L": [{"M": null}, {}]}, "b":1.1}\n""", - """{"a":{"L": [{}, {}]}, "b":1.1}\n""", - ), - # empty structs - ("""{"A": null}\n {"A": {}}\n {}""", """{}\n{"A":{}}\n{}\n"""), - ( - """{"A": {"B": null}}\n {"A": {"B": {}}}\n {"A": {}}""", - """{"A":{}}\n{"A":{"B":{}}}\n{"A":{}}\n""", - ), - ], -) -def test_cudf_json_roundtrip(jsonl_string, expected): - gdf = cudf.read_json( - StringIO(jsonl_string), - lines=True, - engine="cudf", - # dtype=dict(dtypes), - ) - expected = jsonl_string if expected is None else expected - gdf_string = gdf.to_json( - orient="records", lines=True, engine="cudf", include_nulls=False - ) - assert_eq(gdf_string, expected.replace(" ", "")) - - -@pytest.mark.parametrize("sink", ["string", "file"]) -def test_cudf_json_writer_sinks(sink, tmp_path_factory): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - target = None - if sink == "string": - target = StringIO() - elif sink == "file": - target = tmp_path_factory.mktemp("json") / "test_df.json" - df.to_json(target, engine="cudf") - if sink == "string": - assert ( - target.getvalue() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' - ) - elif sink == "file": - assert os.path.exists(target) - with open(target, "r") as f: - assert f.read() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' - - -@pytest.fixture( - params=["string", "filepath", "pathobj", "bytes_io", "string_io", "url"] -) -def json_input(request, tmp_path_factory): - input_type = request.param - buffer = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" - fname = tmp_path_factory.mktemp("json") / "test_df.json" - if not os.path.isfile(fname): - with open(str(fname), "w") as fp: - fp.write(buffer) - - if input_type == "string": - return buffer - if input_type == "filepath": - return str(fname) - if input_type == "pathobj": - return Path(fname) - if input_type == "bytes_io": - return BytesIO(buffer.encode()) - if input_type == "string_io": - return StringIO(buffer) - if input_type == "url": - return Path(fname).as_uri() - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.filterwarnings("ignore:Using CPU") 
-@pytest.mark.parametrize("engine", ["auto", "cudf", "pandas"]) -def test_json_lines_basic(json_input, engine): - can_warn = isinstance(json_input, str) and not json_input.endswith(".json") - with expect_warning_if(can_warn): - cu_df = cudf.read_json(json_input, engine=engine, lines=True) - # io types must seek to the beginning before you can read again - if hasattr(json_input, "seek"): - json_input.seek(0) - with expect_warning_if(can_warn): - pd_df = pd.read_json(json_input, lines=True) - - assert all(cu_df.dtypes == ["int64", "int64", "int64"]) - for cu_col, pd_col in zip(cu_df.columns, pd_df.columns, strict=True): - assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["auto", "cudf", "pandas"]) -def test_nonexistent_json_correct_error(engine): - json_input = "doesnotexist.json" - with pytest.raises(FileNotFoundError): - cudf.read_json(json_input, engine=engine) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["auto", "cudf"]) -def test_json_lines_multiple(tmpdir, json_input, engine): - tmp_file1 = tmpdir.join("MultiInputs1.json") - tmp_file2 = tmpdir.join("MultiInputs2.json") - - with expect_warning_if( - isinstance(json_input, str) and not json_input.endswith(".json") - ): - pdf = pd.read_json(json_input, lines=True) - pdf.to_json(tmp_file1, compression="infer", lines=True, orient="records") - pdf.to_json(tmp_file2, compression="infer", lines=True, orient="records") - - cu_df = cudf.read_json([tmp_file1, tmp_file2], engine=engine, lines=True) - pd_df = pd.concat([pdf, pdf]) - - assert all(cu_df.dtypes == ["int64", "int64", "int64"]) - for cu_col, pd_col in zip(cu_df.columns, pd_df.columns, strict=True): - assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("engine", ["auto", "cudf"]) -def test_json_read_directory(tmpdir, json_input, engine): - with expect_warning_if( - isinstance(json_input, str) and not json_input.endswith(".json") - ): - pdf = pd.read_json(json_input, lines=True) - pdf.to_json( - tmpdir.join("MultiInputs1.json"), - compression="infer", - lines=True, - orient="records", - ) - pdf.to_json( - tmpdir.join("MultiInputs2.json"), - compression="infer", - lines=True, - orient="records", - ) - pdf.to_json( - tmpdir.join("MultiInputs3.json"), - compression="infer", - lines=True, - orient="records", - ) - - cu_df = cudf.read_json(tmpdir, engine=engine, lines=True) - pd_df = pd.concat([pdf, pdf, pdf]) - - assert all(cu_df.dtypes == ["int64", "int64", "int64"]) - for cu_col, pd_col in zip(cu_df.columns, pd_df.columns, strict=True): - assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) - - -def test_json_lines_byte_range(json_input): - # include the first row and half of the second row - # should parse the first two rows - will_warn = isinstance(json_input, str) and not json_input.endswith( - ".json" - ) - with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(0, 15) - ) - assert df.shape == (2, 3) - - # include half of the second row and 
half of the third row - # should parse only the third row - with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 10) - ) - assert df.shape == (1, 3) - - # include half of the second row and entire third row - # should parse only the third row - with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 0) - ) - assert df.shape == (1, 3) - - # include half of the second row till past the end of the file - # should parse only the third row - with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(10, 50) - ) - assert df.shape == (1, 3) - - -def test_json_lines_dtypes(json_input): - with expect_warning_if( - isinstance(json_input, str) and not json_input.endswith(".json") - ): - df = cudf.read_json( - json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"} - ) - assert all(df.dtypes == ["float64", "int64", "int16"]) - - -@pytest.mark.parametrize( - "ext, out_comp, in_comp", - [ - (".geez", "gzip", "gzip"), - (".beez", "bz2", "bz2"), - (".gz", "gzip", "infer"), - (".bz2", "bz2", "infer"), - (".data", None, "infer"), - (".txt", None, None), - ("", None, None), - ], -) -def test_json_lines_compression(tmpdir, ext, out_comp, in_comp): - fname = tmpdir.mkdir("gdf_json").join("tmp_json_compression" + ext) - - nrows = 20 - pd_df = make_numeric_dataframe(nrows, np.int32) - pd_df.to_json(fname, compression=out_comp, lines=True, orient="records") - - cu_df = cudf.read_json( - str(fname), - compression=in_comp, - lines=True, - dtype={"col1": "int32", "col2": "int32"}, - ) - assert_eq(pd_df, cu_df) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_json_engine_selection(): - json = "[1, 2, 3]" - - # should use the cudf engine - df = cudf.read_json(StringIO(json), lines=True) - # column names are strings when parsing with cudf - for col_name in df.columns: - assert isinstance(col_name, str) - - # should use the pandas engine - df = cudf.read_json(StringIO(json), lines=False, engine="pandas") - # column names are ints when parsing with pandas - for col_name in df.columns: - assert isinstance(col_name, int) - - # should use the pandas engine - df = cudf.read_json(StringIO(json), lines=True, engine="pandas") - # column names are ints when parsing with pandas - for col_name in df.columns: - assert isinstance(col_name, int) - - -def test_json_bool_values(): - buffer = "[true,1]\n[false,false]\n[true,true]" - cu_df = cudf.read_json(StringIO(buffer), lines=True) - pd_df = pd.read_json(StringIO(buffer), lines=True) - - # types should be ['bool', 'int64'] - np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) - np.testing.assert_array_equal(pd_df[0], cu_df["0"].to_numpy()) - # boolean values should be converted to 0/1 - np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy()) - - cu_df = cudf.read_json( - StringIO(buffer), lines=True, dtype={"0": "bool", "1": "long"} - ) - np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) - - -def test_json_bad_protocol_string(): - test_string = StringIO('{"field": "s3://path"}') - - expect = pd.DataFrame([{"field": "s3://path"}]) - got = cudf.read_json(test_string, lines=True) - - assert_eq(expect, got) - - -def test_json_corner_case_with_escape_and_double_quote_char_with_pandas( - tmpdir, -): - fname = tmpdir.mkdir("gdf_json").join("tmp_json_escape_double_quote") - - pdf = pd.DataFrame( - { - "a": ['ab"cd', "\\\b", "\r\\", "'"], - "b": ["a\tb\t", "\\", '\\"', "\t"], - 
"c": ["aeiou", "try", "json", "cudf"], - } - ) - pdf.to_json(fname, compression="infer", lines=True, orient="records") - - df = cudf.read_json( - fname, compression="infer", lines=True, orient="records" - ) - pdf = pd.read_json( - fname, compression="infer", lines=True, orient="records" - ) - - assert_eq(cudf.DataFrame(pdf), df) - - -def test_json_corner_case_with_escape_and_double_quote_char_with_strings(): - str_buffer = StringIO( - """{"a":"ab\\"cd","b":"a\\tb\\t","c":"aeiou"} - {"a":"\\\\\\b","b":"\\\\","c":"try"} - {"a":"\\r\\\\","b":"\\\\\\"","c":"json"} - {"a":"\'","b":"\\t","c":"cudf"}""" - ) - - df = cudf.read_json( - str_buffer, compression="infer", lines=True, orient="records" - ) - - expected = { - "a": ['ab"cd', "\\\b", "\r\\", "'"], - "b": ["a\tb\t", "\\", '\\"', "\t"], - "c": ["aeiou", "try", "json", "cudf"], - } - - num_rows = df.shape[0] - for col_name in df._data: - for i in range(num_rows): - assert expected[col_name][i] == df[col_name][i] - - -def test_json_to_json_special_characters(): - df = cudf.DataFrame( - { - "'a'": ['ab"cd', "\\\b", "\r\\", "'"], - "b": ["a\tb\t", "\\", '\\"', "\t"], - "c": ["aeiou", "try", "json", "cudf"], - } - ) - - actual = StringIO() - df.to_json(actual, engine="cudf", lines=True, orient="records") - expected = StringIO() - df.to_pandas().to_json(expected, lines=True, orient="records") - assert expected.getvalue() == actual.getvalue() - - -@pytest.mark.parametrize( - "gdf,pdf", - [ - ( - cudf.DataFrame( - { - "int col": cudf.Series( - [1, 2, None, 2, 2323, 234, None], dtype="int64" - ) - } - ), - pd.DataFrame( - { - "int col": pd.Series( - [1, 2, None, 2, 2323, 234, None], dtype=pd.Int64Dtype() - ) - } - ), - ), - ( - cudf.DataFrame( - { - "int64 col": cudf.Series( - [1, 2, None, 2323, None], dtype="int64" - ), - "string col": cudf.Series( - ["abc", "a", None, "", None], dtype="str" - ), - "float col": cudf.Series( - [0.234, None, 234234.2343, None, 0.0], dtype="float64" - ), - "bool col": cudf.Series( - [None, True, False, None, True], dtype="bool" - ), - "categorical col": cudf.Series( - [1, 2, 1, None, 2], dtype="category" - ), - "datetime col": cudf.Series( - [1231233, None, 2323234, None, 1], - dtype="datetime64[ns]", - ), - "timedelta col": cudf.Series( - [None, 34687236, 2323234, 1, None], - dtype="timedelta64[ns]", - ), - } - ), - pd.DataFrame( - { - "int64 col": pd.Series( - [1, 2, None, 2323, None], dtype=pd.Int64Dtype() - ), - "string col": pd.Series( - ["abc", "a", None, "", None], dtype=pd.StringDtype() - ), - "float col": pd.Series( - [0.234, None, 234234.2343, None, 0.0], dtype="float64" - ), - "bool col": pd.Series( - [None, True, False, None, True], - dtype=pd.BooleanDtype(), - ), - "categorical col": pd.Series( - [1, 2, 1, None, 2], dtype="category" - ), - "datetime col": pd.Series( - [1231233, None, 2323234, None, 1], - dtype="datetime64[ns]", - ), - "timedelta col": pd.Series( - [None, 34687236, 2323234, 1, None], - dtype="timedelta64[ns]", - ), - } - ), - ), - ], -) -def test_json_to_json_compare_contents(gdf, pdf): - expected_json = pdf.to_json(lines=True, orient="records") - with pytest.warns(UserWarning): - actual_json = gdf.to_json(lines=True, orient="records") - - assert expected_json == actual_json - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["cudf", "pandas"]) -def test_default_integer_bitwidth(default_integer_bitwidth, engine): - buf = BytesIO() - pd.DataFrame({"a": range(10)}).to_json(buf, lines=True, orient="records") - buf.seek(0) - df = cudf.read_json(buf, 
engine=engine, lines=True, orient="records")
-
-    assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}")
-
-
-@pytest.mark.filterwarnings("ignore:Using CPU")
-@pytest.mark.parametrize(
-    "engine",
-    [
-        "cudf",
-        "pandas",
-    ],
-)
-def test_default_integer_bitwidth_partial(default_integer_bitwidth, engine):
-    buf = BytesIO()
-    pd.DataFrame({"a": range(10), "b": range(10, 20)}).to_json(
-        buf, lines=True, orient="records"
-    )
-    buf.seek(0)
-    df = cudf.read_json(
-        buf, engine=engine, lines=True, orient="records", dtype={"b": "i8"}
-    )
-
-    assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}")
-    assert df["b"].dtype == np.dtype("i8")
-
-
-@pytest.mark.filterwarnings("ignore:Using CPU")
-@pytest.mark.parametrize("engine", ["cudf", "pandas"])
-def test_default_integer_bitwidth_extremes(default_integer_bitwidth, engine):
-    # Test that integer columns in json are _inferred_ with the
-    # user-specified bitwidth.
-    buf = StringIO(
-        '{"u8":18446744073709551615, "i8":9223372036854775807}\n'
-        '{"u8": 0, "i8": -9223372036854775808}'
-    )
-    df = cudf.read_json(buf, engine=engine, lines=True, orient="records")
-
-    assert df["u8"].dtype == np.dtype(f"u{default_integer_bitwidth // 8}")
-    assert df["i8"].dtype == np.dtype(f"i{default_integer_bitwidth // 8}")
-
-
-def test_default_float_bitwidth(default_float_bitwidth):
-    # Test that float columns in json are _inferred_ with the
-    # user-specified bitwidth.
-    df = cudf.read_json(
-        StringIO('{"a": 1.0, "b": 2.5}\n{"a": 3.5, "b": 4.0}'),
-        engine="cudf",
-        lines=True,
-        orient="records",
-    )
-    assert df["a"].dtype == np.dtype(f"f{default_float_bitwidth // 8}")
-    assert df["b"].dtype == np.dtype(f"f{default_float_bitwidth // 8}")
-
-
-def test_json_nested_basic():
-    bytes_obj = BytesIO()
-    data = {
-        "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}],
-        "c2": [["l11", "l21"], ["l12", "l22"]],
-    }
-    pdf = pd.DataFrame(data)
-    pdf.to_json(bytes_obj, orient="records")
-
-    df = cudf.read_json(bytes_obj, engine="cudf", orient="records")
-    bytes_obj.seek(0)
-    pdf = pd.read_json(bytes_obj, orient="records")
-
-    assert_eq(pdf, df)
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        {
-            "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}],
-            "c2": [["l11", "l21"], ["l12", "l22"]],
-        },
-        # Essential test case to handle omissions
-        {
-            "c1": [{"f2": "sf21"}, {"f1": "sf12"}],
-            "c2": [["l11", "l21"], []],
-        },
-        # empty input
-        {},
-    ],
-)
-@pytest.mark.parametrize("lines", [True, False])
-def test_json_nested_lines(data, lines):
-    bytes = BytesIO()
-    pdf = pd.DataFrame(data)
-    pdf.to_json(bytes, orient="records", lines=lines)
-    bytes.seek(0)
-    df = cudf.read_json(bytes, engine="cudf", orient="records", lines=lines)
-    bytes.seek(0)
-    pdf = pd.read_json(bytes, orient="records", lines=lines)
-    # In the second test case we need to take a detour via pyarrow.
-    # Pandas omits "f1" in the first row, so we have to enforce a common
-    # schema, such that pandas would have the f1 member with null.
-    # Also, pyarrow chooses a different ordering of a nested column's
-    # children, though the key-value pairs are correct.
- pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -def test_json_nested_data(): - json_str = ( - '[{"0":{},"2":{}},{"1":[[""],[]],"2":{"2":""}},' - '{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' - ) - df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") - pdf = pd.read_json(StringIO(json_str), orient="records") - pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -def test_json_empty_types(): - json_str = """ {} - {"a": [], "b": {}} - {"a": []} - {"b": {}} - {"c": {"d": []}} - {"e": [{}]} - """ - df = cudf.read_json(StringIO(json_str), orient="records", lines=True) - pdf = pd.read_json(StringIO(json_str), orient="records", lines=True) - assert_eq(df, pdf) - - -def test_json_types_data(): - # 0:<0:string,1:float> - # 1:list - # 2:<0:bool> - json_str = ( - '[{"0":null,"2":{}},' - '{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},' - '{"0":{},"1":[],"2":{"0":null}}]' - ) - df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") - pdf = pd.read_json(StringIO(json_str), orient="records") - pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -@pytest.mark.parametrize( - "col_type,json_str,expected_data", - [ - # without quotes - ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes - ("int", '[{"k": "1"}, {"k": "2"}]', [1, 2]), - # with quotes, mixed - ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes, null, mixed - ( - "int", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - # without quotes, null - ( - "int", - '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - # without quotes - ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes - ("float", '[{"k": "1"}, {"k": "2"}]', [1, 2]), - # with quotes, mixed - ( - "float", - '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', - [1, 2, 3, 4], - ), - # with quotes, null, mixed - ( - "float", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - # with quotes, NAN - ( - "float", - '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]', - [1, 2, np.nan, 4], - ), - # without quotes - ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes - ("str", '[{"k": "1"}, {"k": "2"}]', [1, 2]), - # with quotes, mixed - ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes, null, mixed - ( - "str", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - # without quotes, null - ( - "str", - '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - ], -) -def test_json_quoted_values_with_schema(col_type, json_str, expected_data): - actual = cudf.read_json( - StringIO(json_str), - engine="cudf", - orient="records", - dtype={"k": col_type}, - ) - expected = cudf.DataFrame({"k": expected_data}, dtype=col_type) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "col_type,json_str,expected_data", - [ - # with quotes, mixed - ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes, null, mixed - ( - "int", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - # with 
quotes, mixed - ( - "str", - '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', - ["1", "2", "3", "4"], - ), - # with quotes, null, mixed - ( - "str", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - ["1", "2", None, "4"], - ), - ], -) -def test_json_quoted_values(col_type, json_str, expected_data): - actual = cudf.read_json( - StringIO(json_str), - engine="cudf", - orient="records", - dtype={"k": col_type}, - ) - expected = cudf.DataFrame({"k": expected_data}, dtype=col_type) - - assert_eq(expected, actual) - assert_eq(expected_data, actual.k.to_arrow().to_pylist()) - - -@pytest.mark.parametrize( - "keep_quotes,result", - [ - ( - True, - { - "c1": [ - {"f1": '"sf11"', "f2": '"sf21"'}, - {"f1": '"sf12"', "f2": '"sf22"'}, - ], - "c2": [['"l11"', '"l21"'], ['"l12"', '"l22"']], - }, - ), - ( - False, - { - "c1": [ - {"f1": "sf11", "f2": "sf21"}, - {"f1": "sf12", "f2": "sf22"}, - ], - "c2": [["l11", "l21"], ["l12", "l22"]], - }, - ), - ], -) -def test_json_keep_quotes(keep_quotes, result): - bytes_file = BytesIO() - data = { - "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], - "c2": [["l11", "l21"], ["l12", "l22"]], - } - pdf = pd.DataFrame(data) - pdf.to_json(bytes_file, orient="records", lines=True) - - actual = cudf.read_json( - bytes_file, - orient="records", - lines=True, - keep_quotes=keep_quotes, - ) - expected = pd.DataFrame(result) - - assert_eq(actual, expected) - - -def test_json_dtypes_nested_data(): - # a: StructDtype({'a': StructDtype({'b': dtype('float64')}), - # 'b': dtype('int64')}) - # b: ListDtype(ListDtype(float64)) - actual_json_str = ( - '{"a":{"a":{"b":10.0},"b":11},"b":[[10.0,1.1],[12.0,23.0]]}\n' - '{"a":{"a":{"b":107.0},"b":5},"b":[[10.0,11.2],[12.0,0.23]]}\n' - '{"a":{"a":{"b":50.7},"b":2},"b":[[10.0,11.3],[12.0,2.3]]}\n' - '{"a":{"a":{"b":1.2},"b":67},"b":[[6.0,7.0]]}\n' - '{"a":{"a":{"b":40.1},"b":1090},"b":null}\n' - ) - - """ - In [3]: df - Out[3]: - a b - 0 {'a': {'b': 10.0}, 'b': 11} [[10.0, 1.1], [12.0, 23.0]] - 1 {'a': {'b': 107.0}, 'b': 5} [[10.0, 11.2], [12.0, 0.23]] - 2 {'a': {'b': 50.7}, 'b': 2} [[10.0, 11.3], [12.0, 2.3]] - 3 {'a': {'b': 1.2}, 'b': 67} [[6.0, 7.0]] - 4 {'a': {'b': 40.1}, 'b': 1090} None - """ - - # a: StructDtype({'a': StructDtype({'b': dtype('int64')}), - # 'b': dtype('float64')}) - # b: ListDtype(ListDtype(int64)) - expected_json_str = ( - '{"a":{"a":{"b":10},"b":11.0},"b":[[10,1],[12,23]]}\n' - '{"a":{"a":{"b":107},"b":5.0},"b":[[10,11],[12,0]]}\n' - '{"a":{"a":{"b":50},"b":2.0},"b":[[10,11],[12,2]]}\n' - '{"a":{"a":{"b":1},"b":67.0},"b":[[6,7]]}\n' - '{"a":{"a":{"b":40},"b":1090.0},"b":null}\n' - ) - - """ - In [7]: df - Out[7]: - a b - 0 {'a': {'b': 10}, 'b': 11.0} [[10, 1], [12, 23]] - 1 {'a': {'b': 107}, 'b': 5.0} [[10, 11], [12, 0]] - 2 {'a': {'b': 50}, 'b': 2.0} [[10, 11], [12, 2]] - 3 {'a': {'b': 1}, 'b': 67.0} [[6, 7]] - 4 {'a': {'b': 40}, 'b': 1090.0} None - """ - - df = cudf.read_json( - StringIO(actual_json_str), - engine="cudf", - orient="records", - lines=True, - dtype={ - "a": cudf.StructDtype( - { - "a": cudf.StructDtype({"b": cudf.dtype("int64")}), - "b": cudf.dtype("float64"), - } - ), - "b": cudf.ListDtype(cudf.ListDtype("int64")), - }, - ) - - pdf = pd.read_json( - StringIO(expected_json_str), - orient="records", - lines=True, - ) - - assert_eq(df, pdf) - - pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -@pytest.mark.parametrize( - "tag, data", - [ - ( - "normal", - 
"""\ -{"a": 1, "b": 2} -{"a": 3, "b": 4}""", - ), - ( - "multiple", - """\ - { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } - { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } - { "a": { "y" : 6}, "b" : [6 ], "c": 13 } - { "a": { "y" : 6}, "b" : [7 ], "c": 14 }""", - ), - ( - "reordered", - """\ - { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } - { "a": { "y" : 6}, "c": 12 , "b" : [4, 5 ]} - { "b" : [6 ], "a": { "y" : 6}, "c": 13} - { "c" : 14, "a": { "y" : 6}, "b" : [7 ]} -""", - ), - ( - "missing", - """ - { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } - { "a": { "y" : 6}, "b" : [4, 5 ] } - { "a": { "y" : 6}, "c": 13 } - { "a": { "y" : 6}, "b" : [7 ], "c": 14 } -""", - ), - pytest.param( - "dtype_mismatch", - """\ - { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } - { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } - { "a": { "y" : 6}, "b" : [6 ], "c": 13 } - { "a": { "y" : 6}, "b" : [7 ], "c": 14.0 }""", - ), - ], -) -class TestNestedJsonReaderCommon: - @pytest.mark.parametrize("chunk_size", [10, 100, 1024, 1024 * 1024]) - def test_chunked_nested_json_reader(self, tag, data, chunk_size): - expected = cudf.read_json(StringIO(data), lines=True) - - source_size = len(data) - chunks = [] - for chunk_start in range(0, source_size, chunk_size): - chunks.append( - cudf.read_json( - StringIO(data), - byte_range=[chunk_start, chunk_size], - lines=True, - ) - ) - df = cudf.concat(chunks, ignore_index=True) - assert expected.to_arrow().equals(df.to_arrow()) - - @pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/pull/57439", - ) - def test_order_nested_json_reader(self, tag, data): - expected = pd.read_json(StringIO(data), lines=True) - target = cudf.read_json(StringIO(data), lines=True) - # Using pyarrow instead of assert_eq because pandas - # doesn't handle nested values comparisons correctly - if tag == "dtype_mismatch": - with pytest.raises(AssertionError): - # pandas parses integer values in float representation - # as integer - assert pa.Table.from_pandas(expected).equals(target.to_arrow()) - elif tag == "missing": - with pytest.raises(AssertionError): - # pandas inferences integer with nulls as float64 - assert pa.Table.from_pandas(expected).equals(target.to_arrow()) - else: - assert pa.Table.from_pandas(expected).equals(target.to_arrow()) - - -def test_json_round_trip_gzip(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": ["abc", "def", "ghi"]}) - bytes = BytesIO() - with gzip.open(bytes, mode="wb") as fo: - with pytest.warns(UserWarning): - df.to_json(fo, orient="records", lines=True) - bytes.seek(0) - with gzip.open(bytes, mode="rb") as fo: - written_df = cudf.read_json(fo, orient="records", lines=True) - assert_eq(written_df, df) - - # Testing writing from middle of the file. 
- loc = bytes.tell() - - with gzip.open(bytes, mode="wb") as fo: - fo.seek(loc) - with pytest.warns(UserWarning): - df.to_json(fo, orient="records", lines=True) - bytes.seek(loc) - with gzip.open(bytes, mode="rb") as fo: - fo.seek(loc) - written_df = cudf.read_json(fo, orient="records", lines=True) - assert_eq(written_df, df) - - -@pytest.mark.parametrize( - "data", - [ - # # empty input - # assert failing due to missing index size information - "", - "[]", - "[]\n[]\n[]", - # simple values - """[1]\n[2]\n[3]""", - """[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]""", - # nulls - """[1, 2, 3]\n[4, 5, null]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, null]\n[7, 8, 9]\n[null, null, null]""", - """[1, 2, 3]\n[4, 5, null]\n[]""", - # missing - """[1, 2, 3]\n[4, 5 ]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9, 10]""", - """[1, 2, 3]\n[4, 5, 6, {}]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, 6, []]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, 6, {"a": 10}]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, 6, [10]]\n[7, 8, 9]""", - # mixed - """[1, 2, 3]\n[4, 5, {}]\n[7, 8, 9]""", - """[1, 2, {}]\n[4, 5, 6]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, [6]]\n[7, 8, 9]""", - """[1, 2, [3]]\n[4, 5, 6]\n[7, 8, 9]""", - # nested - """[1, 2, [3]]\n[4, 5, [6]]\n[7, 8, [9]]""", - """[1, 2, {"a": 3}]\n[4, 5, {"b": 6}]\n[7, 8, {"c": 9}]""", - """[1, 2, [{"a": 3}, {"a": 3}]] - [4, 5, [{"b": 6}, {"b": 6}, {}, {"b": 6}]] - [7, 8, [{}]]""", - """[1, 2, {"a": [3, 3, 3]}] - [4, 5, {"b": [6, 6]}] - [7, 8, {"c": 9}]""", - """[1, 2, [{"a": 3}, {"a": null}]] - [4, 5, [{"b": [6.0, 6, 06]}, {"b": [6]}, {}, {"b": null}]] - [7, 8, [{}]]""", - ], -) -@pytest.mark.parametrize("lines", [True, False]) -def test_json_array_of_arrays(data, lines): - data = data if lines else "[" + data.replace("\n", ",") + "]" - pdf = pd.read_json(StringIO(data), orient="values", lines=lines) - df = cudf.read_json( - StringIO(data), - engine="cudf", - orient="values", - lines=lines, - ) - # if mixed with dict/list type, replace other types with None. 
- if 2 in pdf.columns and any( - pdf[2].apply(lambda x: isinstance(x, dict) or isinstance(x, list)) - ): - pdf[2] = pdf[2].apply( - lambda x: x if isinstance(x, dict) or isinstance(x, list) else None - ) - # TODO: Replace string column names with integer column names - # for values orient in cudf json reader - pdf.rename(columns={name: str(name) for name in pdf.columns}, inplace=True) - # assert_eq(pdf, df) - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -@pytest.mark.parametrize( - "jsonl_string", - [ - # simple list with mixed types - """{"a":[123, {}], "b":1.1}""", - """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", - """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", - """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", - """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", - """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", - # nested list with mixed types - """{"a":[123, [{"0": 123}, {}]], "b":1.0} - {"b":1.1} - {"a":[]} - {"a":[123]} - {"a":[[123], []]}""", - """{"a":[], "b":1.0} - {"a":[[[456]]]} - {"a":[[123]]} - {"a":[123]}""", - """{"a":[123], "b":1.0} - {"b":1.1} - {"b":2.1} - {"a":[[[[[[]]]]]]}""", - """{"a":[123], "b":1.0} - {"a":[[[[[[]]]]]]} - {"a":[[[[[[]]]]], [[[[[]]]]]]} - {"a":[[[[[[]]]], [[[[]]]]]]} - {"a":[[[[[[]]], [[[]]]]]]} - {"a":[[[[[[]], [[]]]]]]} - {"a":[[[[[[], 123, []]]]]]}""", - # mixed elements in multiple columns - """{"a":[123, {"0": 123}], "b":1.0} - {"c": ["abc"], "b":1.1} - {"c": ["abc", []] }""", - ], -) -def test_json_nested_mixed_types_in_list(jsonl_string): - # utility function for this test: - # replace list elements with None if it has dict and non-dict (ignore None) - def _replace_in_list(list_to_replace, replace_items): - return [ - _replace_in_list(x, replace_items) - if isinstance(x, list) - else None - if x in replace_items - else x - for x in list_to_replace - ] - - def _replace_with_nulls(df, replace_items): - for col in df.columns: - if df[col].dtype == "object": - df[col] = df[col].apply( - lambda x: _replace_in_list(x, replace_items) - if isinstance(x, list) - else x - ) - return df - - # both json lines and json string tested. - json_string = "[" + jsonl_string.replace("\n", ",") + "]" - pdf = pd.read_json(StringIO(jsonl_string), orient="records", lines=True) - pdf2 = pd.read_json(StringIO(json_string), orient="records", lines=False) - assert_eq(pdf, pdf2) - # replace list elements with None if it has dict and non-dict - # in above test cases, these items are mixed with dict/list items - # so, replace them with None. 
- pdf = _replace_with_nulls(pdf, [123, "123", 12.3, "abc"]) - gdf = cudf.read_json( - StringIO(jsonl_string), - orient="records", - lines=True, - ) - gdf2 = cudf.read_json( - StringIO(json_string), - engine="cudf", - orient="records", - lines=False, - ) - if """[{"0": 123}, {}]""" not in jsonl_string: - # {} in pandas is represented as {"0": None} in cudf - assert_eq(gdf, pdf) - assert_eq(gdf2, pdf) - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=gdf.to_arrow().schema, safe=False - ) - assert gdf.to_arrow().equals(pa_table_pdf) - assert gdf2.to_arrow().equals(pa_table_pdf) - - -@pytest.mark.parametrize( - "jsonl_string", - [ - # mixed type in list (in different order) - """{"a":[[{"0": 123}, {}], {"1": 321}], "b":1.0}""", - """{"a":[{"1": 321}, [{"0": 123}, {}], ], "b":1.0}""", - """{"a":[123, [{"0": 123}, {}], {"1": 321}], "b":1.0}""", - """{"a":[null, [{"0": 123}, {}], {"1": 321}], "b":1.0}""", - # mixed type in struct (in different order) - """{"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} - {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", - """{"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0} - {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0}""", - """{"a": {"b": {"0": 123}, "c": null}, "d":1.0} - {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} - {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", - """{"a": {"b": {"0": 123}, "c": 123}, "d":1.0} - {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} - {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", - ], -) -def test_json_nested_mixed_types_error(jsonl_string): - # mixing list and struct should raise an exception - with pytest.raises(RuntimeError): - cudf.read_json( - StringIO(jsonl_string), - orient="records", - lines=True, - ) - - -@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"]) -def test_json_reader_on_bad_lines(on_bad_lines): - json_input = StringIO( - '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' - ) - if on_bad_lines == "error": - with pytest.raises(RuntimeError): - cudf.read_json( - json_input, - lines=True, - orient="records", - on_bad_lines=on_bad_lines, - ) - elif on_bad_lines == "recover": - actual = cudf.read_json( - json_input, lines=True, orient="records", on_bad_lines=on_bad_lines - ) - expected = cudf.DataFrame( - {"a": [1, 2, None, 3], "b": [10, 11, None, 12]} - ) - assert_eq(actual, expected) - else: - with pytest.raises(TypeError): - cudf.read_json( - json_input, - lines=True, - orient="records", - on_bad_lines=on_bad_lines, - ) - - -def test_chunked_json_reader(): - df = cudf.DataFrame( - { - "a": ["aaaa"] * 1_000_000, - "b": range(1_000_000), - } - ) - buf = BytesIO() - df.to_json(buf, lines=True, orient="records", engine="cudf") - buf.seek(0) - df = df.to_pandas() - with cudf.option_context("io.json.low_memory", True): - gdf = cudf.read_json(buf, lines=True) - assert_eq(df, gdf) - - -# compression formats limited to those supported by both reader and writer -@pytest.mark.parametrize("compression", ["gzip", "snappy", "zstd"]) -def test_roundtrip_compression(compression, tmp_path): - expected = cudf.DataFrame({"a": [1], "b": ["2"]}) - fle = BytesIO() - expected.to_json(fle, engine="cudf", compression=compression) - result = cudf.read_json(fle, engine="cudf", compression=compression) - assert_eq(result, expected) From 82c2f4e60f2a501ce75931f70a08740462c523ae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:41:12 -0700 Subject: [PATCH 130/366] Move test_interval/test_dtypes/test_rank.py to 
new cudf directory structure (#19668) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19668 --- python/cudf/cudf/tests/conftest.py | 19 +- .../{ => dataframe/methods}/test_rank.py | 66 +--- .../cudf/tests/dataframe/test_constructors.py | 94 +++++ .../tests/dtypes/test_categoricaldtype.py | 36 ++ .../cudf/tests/dtypes/test_decimaldtype.py | 30 ++ python/cudf/cudf/tests/dtypes/test_dtype.py | 74 ++++ .../cudf/tests/dtypes/test_intervaldtype.py | 51 +++ .../cudf/cudf/tests/dtypes/test_listdtype.py | 37 ++ .../cudf/tests/dtypes/test_structdtype.py | 54 +++ .../tests/indexes/index/test_constructor.py | 14 + .../intervalindex/test_constructors.py | 29 ++ .../indexes/intervalindex/test_reductions.py | 16 + .../indexes/intervalindex/test_unique.py | 25 ++ .../cudf/tests/series/methods/test_rank.py | 32 ++ .../cudf/cudf/tests/series/test_attributes.py | 7 + .../cudf/tests/series/test_constructors.py | 118 ++++++ python/cudf/cudf/tests/test_dtypes.py | 352 ------------------ python/cudf/cudf/tests/test_interval.py | 221 ----------- 18 files changed, 653 insertions(+), 622 deletions(-) rename python/cudf/cudf/tests/{ => dataframe/methods}/test_rank.py (61%) create mode 100644 python/cudf/cudf/tests/dataframe/test_constructors.py create mode 100644 python/cudf/cudf/tests/dtypes/test_categoricaldtype.py create mode 100644 python/cudf/cudf/tests/dtypes/test_decimaldtype.py create mode 100644 python/cudf/cudf/tests/dtypes/test_dtype.py create mode 100644 python/cudf/cudf/tests/dtypes/test_intervaldtype.py create mode 100644 python/cudf/cudf/tests/dtypes/test_listdtype.py create mode 100644 python/cudf/cudf/tests/indexes/intervalindex/test_constructors.py create mode 100644 python/cudf/cudf/tests/indexes/intervalindex/test_reductions.py create mode 100644 python/cudf/cudf/tests/indexes/intervalindex/test_unique.py create mode 100644 python/cudf/cudf/tests/series/methods/test_rank.py delete mode 100644 python/cudf/cudf/tests/test_dtypes.py delete mode 100644 python/cudf/cudf/tests/test_interval.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 4718ecdc711..217b9e6524e 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -300,7 +300,6 @@ def reduction_methods(request): def signed_integer_types_as_str(request): """ - "int8", "int16", "int32", "int64" - - "uint8", "uint16", "uint32", "uint64" """ return request.param @@ -503,3 +502,21 @@ def ignore_index(request): def ascending(request): """Param for `ascending` argument""" return request.param + + +@pytest.fixture(params=[True, False]) +def numeric_only(request): + """Param for `numeric_only` argument""" + return request.param + + +@pytest.fixture(params=[True, False, None]) +def categorical_ordered(request): + """Param for `ordered` argument for categorical types""" + return request.param + + +@pytest.fixture(params=["left", "right", "both", "neither"]) +def interval_closed(request): + """Param for `closed` argument for interval types""" + return request.param diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/dataframe/methods/test_rank.py similarity index 61% rename from python/cudf/cudf/tests/test_rank.py rename to python/cudf/cudf/tests/dataframe/methods/test_rank.py index 07a844333cf..2befdd92581 100644 --- 
a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_rank.py @@ -1,17 +1,18 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import numpy as np import pandas as pd import pytest import cudf -from cudf import DataFrame from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal -@pytest.fixture -def pdf(): - return pd.DataFrame( +@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [True, False]) +def test_rank_all_arguments(ascending, method, na_option, pct, numeric_only): + pdf = pd.DataFrame( { "col1": np.array([5, 4, 3, 5, 8, 5, 2, 1, 6, 6]), "col2": np.array( @@ -21,26 +22,11 @@ def pdf(): index=np.array([5, 4, 3, 2, 1, 6, 7, 8, 9, 10]), ) - -@pytest.mark.parametrize("dtype", ["O", "f8", "i4"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) -@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) -@pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_rank_all_arguments( - pdf, dtype, ascending, method, na_option, pct, numeric_only -): - if method == "first" and dtype == "O": - # not supported by pandas - return - if numeric_only: - pdf = pdf.copy(deep=True) # for parallel pytest pdf["str"] = np.array( ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"] ) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) kwargs = { "method": method, @@ -73,8 +59,17 @@ def test_rank_all_arguments( assert_eq(expected, actual) -def test_rank_error_arguments(pdf): - gdf = DataFrame.from_pandas(pdf) +def test_rank_error_arguments(): + pdf = pd.DataFrame( + { + "col1": np.array([5, 4, 3, 5, 8, 5, 2, 1, 6, 6]), + "col2": np.array( + [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] + ), + }, + index=np.array([5, 4, 3, 2, 1, 6, 7, 8, 9, 10]), + ) + gdf = cudf.DataFrame.from_pandas(pdf) assert_exceptions_equal( lfunc=pdf["col1"].rank, @@ -121,28 +116,3 @@ def test_rank_error_arguments(pdf): }, ), ) - - -@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") -@pytest.mark.parametrize("elem1", [np.nan, np.inf, -np.inf, 1.43]) -@pytest.mark.parametrize("elem2", [np.nan, np.inf, -np.inf, 1.43]) -@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) -def test_series_rank_combinations(elem1, elem2, dtype): - aa = np.array([elem1, elem2], dtype=np.float64).astype(dtype) - gdf = DataFrame({"a": aa}) - df = pd.DataFrame({"a": aa}) - ranked_gs = gdf["a"].rank(method="first") - ranked_ps = df["a"].rank(method="first") - # Check - assert_eq(ranked_ps, ranked_gs) - - -@pytest.mark.parametrize("klass", ["Series", "DataFrame"]) -def test_int_nan_pandas_compatible(klass): - data = [3, 6, 1, 1, None, 6] - pd_obj = getattr(pd, klass)(data) - cudf_obj = getattr(cudf, klass)(data) - with cudf.option_context("mode.pandas_compatible", True): - result = cudf_obj.rank() - expected = pd_obj.rank() - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/dataframe/test_constructors.py b/python/cudf/cudf/tests/dataframe/test_constructors.py new file mode 100644 index 00000000000..6d45334717f --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/test_constructors.py @@ -0,0 +1,94 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data1, data2", + [(1, 2), (1.0, 2.0), (3, 4.0)], +) +@pytest.mark.parametrize("data3, data4", [(6, 10), (5.0, 9.0), (2, 6.0)]) +def test_create_interval_df(data1, data2, data3, data4, interval_closed): + # df for both pandas and cudf only works when interval is in a list + expect = pd.DataFrame( + [pd.Interval(data1, data2, interval_closed)], dtype="interval" + ) + got = cudf.DataFrame( + [pd.Interval(data1, data2, interval_closed)], dtype="interval" + ) + assert_eq(expect, got) + + expect_two = pd.DataFrame( + { + "a": [ + pd.Interval(data1, data2, interval_closed), + pd.Interval(data3, data4, interval_closed), + ], + "b": [ + pd.Interval(data3, data4, interval_closed), + pd.Interval(data1, data2, interval_closed), + ], + }, + dtype="interval", + ) + got_two = cudf.DataFrame( + { + "a": [ + pd.Interval(data1, data2, interval_closed), + pd.Interval(data3, data4, interval_closed), + ], + "b": [ + pd.Interval(data3, data4, interval_closed), + pd.Interval(data1, data2, interval_closed), + ], + }, + dtype="interval", + ) + assert_eq(expect_two, got_two) + + expect_three = pd.DataFrame( + { + "a": [ + pd.Interval(data1, data2, interval_closed), + pd.Interval(data3, data4, interval_closed), + pd.Interval(data1, data2, interval_closed), + ], + "b": [ + pd.Interval(data3, data4, interval_closed), + pd.Interval(data1, data2, interval_closed), + pd.Interval(data3, data4, interval_closed), + ], + "c": [ + pd.Interval(data1, data2, interval_closed), + pd.Interval(data1, data2, interval_closed), + pd.Interval(data3, data4, interval_closed), + ], + }, + dtype="interval", + ) + + got_three = cudf.DataFrame( + { + "a": [ + pd.Interval(data1, data2, interval_closed), + pd.Interval(data3, data4, interval_closed), + pd.Interval(data1, data2, interval_closed), + ], + "b": [ + pd.Interval(data3, data4, interval_closed), + pd.Interval(data1, data2, interval_closed), + pd.Interval(data3, data4, interval_closed), + ], + "c": [ + pd.Interval(data1, data2, interval_closed), + pd.Interval(data1, data2, interval_closed), + pd.Interval(data3, data4, interval_closed), + ], + }, + dtype="interval", + ) + assert_eq(expect_three, got_three) diff --git a/python/cudf/cudf/tests/dtypes/test_categoricaldtype.py b/python/cudf/cudf/tests/dtypes/test_categoricaldtype.py new file mode 100644 index 00000000000..93311fd3c02 --- /dev/null +++ b/python/cudf/cudf/tests/dtypes/test_categoricaldtype.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+
+import pandas as pd
+import pytest
+
+import cudf
+
+
+@pytest.mark.parametrize(
+    "data", [None, [], ["a"], [1], [1.0], ["a", "b", "c"]]
+)
+def test_cdt_eq(data, categorical_ordered):
+    dt = cudf.CategoricalDtype(categories=data, ordered=categorical_ordered)
+    assert dt == "category"
+    assert dt == dt
+    assert dt == cudf.CategoricalDtype(
+        categories=None, ordered=categorical_ordered
+    )
+    assert dt == cudf.CategoricalDtype(
+        categories=data, ordered=categorical_ordered
+    )
+    assert dt != cudf.CategoricalDtype(
+        categories=data, ordered=not categorical_ordered
+    )
+
+
+@pytest.mark.parametrize(
+    "data", [None, [], ["a"], [1], [1.0], ["a", "b", "c"]]
+)
+def test_cdf_to_pandas(data, categorical_ordered):
+    assert (
+        pd.CategoricalDtype(data, categorical_ordered)
+        == cudf.CategoricalDtype(
+            categories=data, ordered=categorical_ordered
+        ).to_pandas()
+    )
diff --git a/python/cudf/cudf/tests/dtypes/test_decimaldtype.py b/python/cudf/cudf/tests/dtypes/test_decimaldtype.py
new file mode 100644
index 00000000000..883b55f0f68
--- /dev/null
+++ b/python/cudf/cudf/tests/dtypes/test_decimaldtype.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+
+import cudf
+
+
+@pytest.mark.parametrize(
+    "decimal_type",
+    [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype],
+)
+def test_decimal_dtype_arrow_roundtrip(decimal_type):
+    dt = decimal_type(4, 2)
+    assert dt.to_arrow() == pa.decimal128(4, 2)
+    assert dt == decimal_type.from_arrow(pa.decimal128(4, 2))
+
+
+@pytest.mark.parametrize(
+    "decimal_type,max_precision",
+    [
+        (cudf.Decimal32Dtype, 9),
+        (cudf.Decimal64Dtype, 18),
+        (cudf.Decimal128Dtype, 38),
+    ],
+)
+def test_max_precision(decimal_type, max_precision):
+    decimal_type(scale=0, precision=max_precision)
+    with pytest.raises(ValueError):
+        decimal_type(scale=0, precision=max_precision + 1)
diff --git a/python/cudf/cudf/tests/dtypes/test_dtype.py b/python/cudf/cudf/tests/dtypes/test_dtype.py
new file mode 100644
index 00000000000..66de2386d35
--- /dev/null
+++ b/python/cudf/cudf/tests/dtypes/test_dtype.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "in_dtype,expect",
+    [
+        (np.dtype("int8"), np.dtype("int8")),
+        (np.int8, np.dtype("int8")),
+        (pd.Int8Dtype(), np.dtype("int8")),
+        (pd.StringDtype(), np.dtype("object")),
+        ("int8", np.dtype("int8")),
+        ("boolean", np.dtype("bool")),
+        ("bool_", np.dtype("bool")),
+        (np.bool_, np.dtype("bool")),
+        (int, np.dtype("int64")),
+        (float, np.dtype("float64")),
+        (cudf.ListDtype("int64"), cudf.ListDtype("int64")),
+        (np.dtype("U"), np.dtype("object")),
+        ("timedelta64[ns]", np.dtype("<m8[ns]")),
+        {
+            "name": "var0",
+            "val": [
+                {"name": "var1", "val": None, "type": "optional"}
+            ],
+            "type": "list",
+        },
+        {},
+        {
+            "name": "var2",
+            "val": [
+                {
+                    "name": "var3",
+                    "val": {"field": 42},
+                    "type": "optional",
+                },
+                {
+                    "name": "var4",
+                    "val": {"field": 3.14},
+                    "type": "optional",
+                },
+            ],
+            "type": "list",
+        },
+        None,
+    ],
+    ],
+)
+def test_lists_of_structs_data(data):
+    got = cudf.Series(data)
+    expected = cudf.Series(pa.array(data))
+    assert_eq(got, expected)
+
+
 @pytest.fixture(
     params=[
         [1000000, 200000, 3000000],
diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
deleted file mode 100644
index 103a286b892..00000000000
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ /dev/null
@@ -1,352 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
- -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest -from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - -import cudf -from cudf.core.column import ColumnBase -from cudf.core.dtypes import ( - CategoricalDtype, - Decimal32Dtype, - Decimal64Dtype, - Decimal128Dtype, - IntervalDtype, - ListDtype, - StructDtype, -) -from cudf.testing import assert_eq -from cudf.utils.dtypes import cudf_dtype_to_pa_type - - -def test_cdt_basic(): - psr = pd.Series(["a", "b", "a", "c"], dtype="category") - sr = cudf.Series(["a", "b", "a", "c"], dtype="category") - assert isinstance(sr.dtype, CategoricalDtype) - assert_eq(sr.dtype.categories, psr.dtype.categories) - - -@pytest.mark.parametrize( - "data", [None, [], ["a"], [1], [1.0], ["a", "b", "c"]] -) -@pytest.mark.parametrize("ordered", [None, False, True]) -def test_cdt_eq(data, ordered): - dt = cudf.CategoricalDtype(categories=data, ordered=ordered) - assert dt == "category" - assert dt == dt - assert dt == cudf.CategoricalDtype(categories=None, ordered=ordered) - assert dt == cudf.CategoricalDtype(categories=data, ordered=ordered) - assert not dt == cudf.CategoricalDtype( - categories=data, ordered=not ordered - ) - - -@pytest.mark.parametrize( - "data", [None, [], ["a"], [1], [1.0], ["a", "b", "c"]] -) -@pytest.mark.parametrize("ordered", [None, False, True]) -def test_cdf_to_pandas(data, ordered): - assert ( - pd.CategoricalDtype(data, ordered) - == cudf.CategoricalDtype(categories=data, ordered=ordered).to_pandas() - ) - - -@pytest.mark.parametrize( - "value_type", - [ - int, - "int32", - np.int32, - "datetime64[ms]", - "datetime64[ns]", - "str", - "object", - ], -) -def test_list_dtype_pyarrow_round_trip(value_type): - pa_type = pa.list_(cudf_dtype_to_pa_type(cudf.dtype(value_type))) - expect = pa_type - got = ListDtype.from_arrow(expect).to_arrow() - assert expect.equals(got) - - -def test_list_dtype_eq(): - lhs = ListDtype("int32") - rhs = ListDtype("int32") - assert lhs == rhs - rhs = ListDtype("int64") - assert lhs != rhs - - -def test_list_nested_dtype(): - dt = ListDtype(ListDtype("int32")) - expect = ListDtype("int32") - got = dt.element_type - assert expect == got - - -@pytest.mark.parametrize( - "fields", - [ - {}, - {"a": "int64"}, - {"a": "datetime64[ms]"}, - {"a": "int32", "b": "int64"}, - ], -) -def test_struct_dtype_pyarrow_round_trip(fields): - pa_type = pa.struct( - {k: pa.from_numpy_dtype(np.dtype(v)) for k, v in fields.items()} - ) - expect = pa_type - got = StructDtype.from_arrow(expect).to_arrow() - assert expect.equals(got) - - -def test_struct_dtype_eq(): - lhs = StructDtype( - {"a": "int32", "b": StructDtype({"c": "int64", "ab": "int32"})} - ) - rhs = StructDtype( - {"a": "int32", "b": StructDtype({"c": "int64", "ab": "int32"})} - ) - assert lhs == rhs - rhs = StructDtype({"a": "int32", "b": "int64"}) - assert lhs != rhs - lhs = StructDtype({"b": "int64", "a": "int32"}) - assert lhs != rhs - - -@pytest.mark.parametrize( - "fields", - [ - {}, - {"a": "int32"}, - {"a": "object"}, - {"a": "str"}, - {"a": "datetime64[D]"}, - {"a": "int32", "b": "int64"}, - {"a": "int32", "b": StructDtype({"a": "int32", "b": "int64"})}, - ], -) -def test_struct_dtype_fields(fields): - fields = {"a": "int32", "b": StructDtype({"c": "int64", "d": "int32"})} - dt = StructDtype(fields) - assert_eq(dt.fields, fields) - - -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_decimal_dtype_arrow_roundtrip(decimal_type): - dt = 
decimal_type(4, 2) - assert dt.to_arrow() == pa.decimal128(4, 2) - assert dt == decimal_type.from_arrow(pa.decimal128(4, 2)) - - -@pytest.mark.parametrize( - "decimal_type,max_precision", - [ - (cudf.Decimal32Dtype, 9), - (cudf.Decimal64Dtype, 18), - (cudf.Decimal128Dtype, 38), - ], -) -def test_max_precision(decimal_type, max_precision): - decimal_type(scale=0, precision=max_precision) - with pytest.raises(ValueError): - decimal_type(scale=0, precision=max_precision + 1) - - -@pytest.fixture(params=["int64", "int32"]) -def subtype(request): - return request.param - - -@pytest.fixture(params=["left", "right", "both", "neither"]) -def closed(request): - return request.param - - -def test_interval_dtype_pyarrow_round_trip(subtype, closed): - pa_array = ArrowIntervalType(subtype, closed) - expect = pa_array - got = IntervalDtype.from_arrow(expect).to_arrow() - assert expect.equals(got) - - -def test_interval_dtype_from_pandas(subtype, closed): - expect = cudf.IntervalDtype(subtype, closed=closed) - pd_type = pd.IntervalDtype(subtype, closed=closed) - got = cudf.IntervalDtype.from_pandas(pd_type) - assert expect == got - - -def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array): - """ - In cudf, each column holds its dtype. And since column may have child - columns, child columns also holds their datatype. This method tests - that every level of `column` matches the type of the given `array` - recursively. - """ - - if isinstance(column.dtype, ListDtype): - return array.type.equals( - column.dtype.to_arrow() - ) and assert_column_array_dtype_equal( - column.base_children[1], array.values - ) - elif isinstance(column.dtype, StructDtype): - return array.type.equals(column.dtype.to_arrow()) and all( - assert_column_array_dtype_equal(child, array.field(i)) - for i, child in enumerate(column.base_children) - ) - elif isinstance( - column.dtype, (Decimal128Dtype, Decimal64Dtype, Decimal32Dtype) - ): - return array.type.equals(column.dtype.to_arrow()) - elif isinstance(column.dtype, CategoricalDtype): - raise NotImplementedError() - else: - return array.type.equals(cudf_dtype_to_pa_type(column.dtype)) - - -@pytest.mark.parametrize( - "data", - [ - [[{"name": 123}]], - [ - [ - { - "IsLeapYear": False, - "data": {"Year": 1999, "Month": 7}, - "names": ["Mike", None], - }, - { - "IsLeapYear": True, - "data": {"Year": 2004, "Month": 12}, - "names": None, - }, - { - "IsLeapYear": False, - "data": {"Year": 1996, "Month": 2}, - "names": ["Rose", "Richard"], - }, - ] - ], - [ - [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}], - [ - {"human?": None, "deets": {"weight": 5.3, "age": 25}}, - {"human?": False, "deets": {"weight": 8.0, "age": 31}}, - {"human?": False, "deets": None}, - ], - [], - None, - [{"human?": None, "deets": {"weight": 6.9, "age": None}}], - ], - [ - { - "name": "var0", - "val": [ - {"name": "var1", "val": None, "type": "optional"} - ], - "type": "list", - }, - {}, - { - "name": "var2", - "val": [ - { - "name": "var3", - "val": {"field": 42}, - "type": "optional", - }, - { - "name": "var4", - "val": {"field": 3.14}, - "type": "optional", - }, - ], - "type": "list", - }, - None, - ], - ], -) -def test_lists_of_structs_dtype(data): - got = cudf.Series(data) - expected = pa.array(data) - - assert_column_array_dtype_equal(got._column, expected) - assert expected.equals(got._column.to_arrow()) - - -@pytest.mark.parametrize( - "in_dtype,expect", - [ - (np.dtype("int8"), np.dtype("int8")), - (np.int8, np.dtype("int8")), - (pd.Int8Dtype(), np.dtype("int8")), - 
(pd.StringDtype(), np.dtype("object")),
-        ("int8", np.dtype("int8")),
-        ("boolean", np.dtype("bool")),
-        ("bool_", np.dtype("bool")),
-        (np.bool_, np.dtype("bool")),
-        (int, np.dtype("int64")),
-        (float, np.dtype("float64")),
-        (cudf.ListDtype("int64"), cudf.ListDtype("int64")),
-        (np.dtype("U"), np.dtype("object")),
-        ("timedelta64[ns]", np.dtype("<m8[ns]")),

From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 14 Aug 2025 17:43:04 -0700
Subject: [PATCH 131/366] Move test_cuda_array_interface/cut/dataframe_copy.py
 to new cudf classic test directories (#19599)

Towards https://github.com/rapidsai/cudf/issues/9999
Towards https://github.com/rapidsai/cudf/issues/15723

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19599
---
 python/cudf/cudf/tests/conftest.py            |   6 +
 .../methods/test_copy.py}                     |  80 +++---
 .../tests/{ => general_functions}/test_cut.py |  96 +++----
 .../test_register_accessor.py}                |   0
 .../cudf/cudf/tests/series/test_attributes.py |   8 +
 .../cudf/tests/series/test_constructors.py    | 192 ++++++++++++++
 .../cudf/tests/test_cuda_array_interface.py   | 237 ------------------
 7 files changed, 288 insertions(+), 331 deletions(-)
 rename python/cudf/cudf/tests/{test_dataframe_copy.py => dataframe/methods/test_copy.py} (73%)
 rename python/cudf/cudf/tests/{ => general_functions}/test_cut.py (74%)
 rename python/cudf/cudf/tests/{test_custom_accessor.py => general_functions/test_register_accessor.py} (100%)
 delete mode 100644 python/cudf/cudf/tests/test_cuda_array_interface.py

diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py
index 217b9e6524e..0e193b57cd1 100644
--- a/python/cudf/cudf/tests/conftest.py
+++ b/python/cudf/cudf/tests/conftest.py
@@ -430,6 +430,12 @@ def all_supported_types_as_str(request):
     return request.param
 
 
+@pytest.fixture(params=[list, np.array])
+def one_dimensional_array_types(request):
+    """1D array containers commonly accepted by cuDF and pandas"""
+    return request.param
+
+
 # pandas can raise warnings for some inputs to the following ufuncs:
 numpy_ufuncs = []
 for name in dir(np):
diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/dataframe/methods/test_copy.py
similarity index 73%
rename from python/cudf/cudf/tests/test_dataframe_copy.py
rename to python/cudf/cudf/tests/dataframe/methods/test_copy.py
index 53257dc8f29..7910451db57 100644
--- a/python/cudf/cudf/tests/test_dataframe_copy.py
+++ b/python/cudf/cudf/tests/dataframe/methods/test_copy.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2025, NVIDIA CORPORATION.
+# Copyright (c) 2025, NVIDIA CORPORATION.
 from copy import copy, deepcopy
 
 import cupy as cp
@@ -8,7 +8,6 @@
 
 from cudf.core.dataframe import DataFrame
 from cudf.testing import assert_eq, assert_neq
-from cudf.testing._utils import ALL_TYPES
 
 """
 DataFrame copy expectations
@@ -22,22 +21,31 @@
 """
 
 
-@pytest.mark.parametrize(
-    "fn",
-    [
-        lambda x: x.copy(),
+@pytest.fixture(
+    params=[
+        lambda x: x.copy(deep=False),
         lambda x: x.copy(deep=True),
-        lambda x: copy(x),
-        lambda x: deepcopy(x),
+        copy,
+        deepcopy,
+    ],
+    ids=[
+        "DataFrame.copy(deep=False)",
+        "DataFrame.copy(deep=True)",
+        "copy.copy()",
+        "copy.deepcopy()",
     ],
 )
-def test_dataframe_deep_copy(fn):
+def copy_fn(request):
+    return request.param
+
+
+def test_dataframe_deep_copy(copy_fn):
     pdf = pd.DataFrame(
         [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]
     )
     gdf = DataFrame.from_pandas(pdf)
-    copy_pdf = fn(pdf)
-    copy_gdf = fn(gdf)
+    copy_pdf = copy_fn(pdf)
+    copy_gdf = copy_fn(gdf)
     copy_pdf["b"] = [0, 0, 0]
     copy_gdf["b"] = [0, 0, 0]
     pdf_is_equal = np.array_equal(pdf["b"].values, copy_pdf["b"].values)
@@ -48,30 +56,13 @@
     assert not gdf_is_equal
 
 
-"""
-DataFrame copy bounds checking - sizes 0 through 10 perform as
-expected_equality
-"""
-
-
-@pytest.mark.parametrize(
-    "copy_fn",
-    [
-        lambda x: x.copy(),
-        lambda x: x.copy(deep=True),
-        lambda x: copy(x),
-        lambda x: deepcopy(x),
-        lambda x: x.copy(deep=False),
-    ],
-)
-@pytest.mark.parametrize("ncols", [0, 1, 10])
-@pytest.mark.parametrize("data_type", ALL_TYPES)
-def test_cudf_dataframe_copy(copy_fn, ncols, data_type):
+@pytest.mark.parametrize("ncols", [0, 2])
+def test_cudf_dataframe_copy(copy_fn, ncols, all_supported_types_as_str):
     rng = np.random.default_rng(seed=0)
     pdf = pd.DataFrame(
         {
             chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype(
-                data_type
+                all_supported_types_as_str
             )
             for i in range(ncols)
         }
@@ -81,24 +72,15 @@
     assert_eq(df, copy_df)
 
 
-@pytest.mark.parametrize(
-    "copy_fn",
-    [
-        lambda x: x.copy(),
-        lambda x: x.copy(deep=True),
-        lambda x: copy(x),
-        lambda x: deepcopy(x),
-        lambda x: x.copy(deep=False),
-    ],
-)
-@pytest.mark.parametrize("ncols", [0, 1, 10])
-@pytest.mark.parametrize("data_type", ALL_TYPES)
-def test_cudf_dataframe_copy_then_insert(copy_fn, ncols, data_type):
+@pytest.mark.parametrize("ncols", [0, 2])
+def test_cudf_dataframe_copy_then_insert(
+    copy_fn, ncols, all_supported_types_as_str
+):
     rng = np.random.default_rng(seed=0)
     pdf = pd.DataFrame(
         {
             chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype(
-                data_type
+                all_supported_types_as_str
             )
             for i in range(ncols)
         }
@@ -106,8 +88,12 @@
     df = DataFrame.from_pandas(pdf)
     copy_df = copy_fn(df)
     copy_pdf = copy_fn(pdf)
-    copy_df["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type)
-    copy_pdf["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type)
+    copy_df["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(
+        all_supported_types_as_str
+    )
+    copy_pdf["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(
+        all_supported_types_as_str
+    )
     assert not copy_pdf.to_string().split() == pdf.to_string().split()
     assert not copy_df.to_string().split() == df.to_string().split()
diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/general_functions/test_cut.py
similarity index 74%
rename from python/cudf/cudf/tests/test_cut.py
rename to python/cudf/cudf/tests/general_functions/test_cut.py
index 3fc05599976..35e908ee035 100644
--- 
a/python/cudf/cudf/tests/test_cut.py +++ b/python/cudf/cudf/tests/general_functions/test_cut.py @@ -12,15 +12,29 @@ from cudf.testing import assert_eq -@pytest.mark.parametrize("box", [list, np.array]) +@pytest.fixture(params=[True, False]) +def right(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def include_lowest(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def ordered(request): + return request.param + + @pytest.mark.parametrize("bins", [1, 2, 3]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) @pytest.mark.parametrize("precision", [1, 3]) -def test_cut_basic(box, bins, right, include_lowest, precision): +def test_cut_basic( + one_dimensional_array_types, bins, right, include_lowest, precision +): # will test optional labels, retbins and duplicates separately # they need more specific parameters to work - x = box([1, 7, 5, 4, 6, 3]) + x = one_dimensional_array_types([1, 7, 5, 4, 6, 3]) ordered = True pcat = pd.cut( x=x, @@ -43,9 +57,6 @@ def test_cut_basic(box, bins, right, include_lowest, precision): assert_eq(pindex, gindex) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize("ordered", [True, False]) @pytest.mark.parametrize("precision", [1, 3]) @pytest.mark.parametrize( "labels", [["bad", "medium", "good"], [1, 2, 3], False] @@ -76,8 +87,6 @@ def test_cut_labels(right, include_lowest, ordered, precision, labels): assert_eq(pindex, gindex) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) @pytest.mark.parametrize("precision", [1, 3]) @pytest.mark.parametrize("labels", [["bad", "good", "good"], [1, 2, 2], False]) def test_cut_labels_non_unique(right, include_lowest, precision, labels): @@ -110,28 +119,26 @@ def test_cut_labels_non_unique(right, include_lowest, precision, labels): @pytest.mark.parametrize( "x", [ - [1, 7, 5, 4, 6, 3], - [1, 7], - np.array([1, 7, 5, 4, 6, 3]), - np.array([2, 4, 6, 8, 10]), + [1, 7, 5], + [2, 4, 6, 8, 10], ], ) @pytest.mark.parametrize( "bins", [1, 2, 3, [1, 2, 3], [0, 2, 4, 6, 10]], ) -@pytest.mark.parametrize("right", [True, False]) -def test_cut_right(x, bins, right): +def test_cut_right(x, one_dimensional_array_types, bins, right): + arg = one_dimensional_array_types(x) precision = 3 pcat = pd.cut( - x=x, + x=arg, bins=bins, right=right, precision=precision, ) pindex = pd.CategoricalIndex(pcat) gindex = cut( - x=x, + x=arg, bins=bins, right=right, precision=precision, @@ -143,24 +150,23 @@ def test_cut_right(x, bins, right): @pytest.mark.parametrize( "x", [ - [1, 7, 5, 4, 6, 3], - [1, 7], - np.array([1, 7, 5, 4, 6, 3]), - np.array([2, 4, 6, 8, 10]), + [1, 7, 5], + [2, 4, 6, 8, 10], ], ) @pytest.mark.parametrize( "bins", [[0, 2, 4, 6, 10, 10], [1, 2, 2, 3, 3]], ) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) @pytest.mark.parametrize("precision", [1, 3]) -def test_cut_drop_duplicates(x, bins, right, precision, include_lowest): +def test_cut_drop_duplicates( + x, one_dimensional_array_types, bins, right, precision, include_lowest +): ordered = True duplicates = "drop" + arg = one_dimensional_array_types(x) pcat = pd.cut( - x=x, + x=arg, bins=bins, right=right, precision=precision, @@ -170,7 +176,7 @@ def test_cut_drop_duplicates(x, bins, right, precision, include_lowest): ) pindex = pd.CategoricalIndex(pcat) gindex = cut( - x=x, + 
x=arg, bins=bins, right=right, precision=precision, @@ -185,26 +191,25 @@ def test_cut_drop_duplicates(x, bins, right, precision, include_lowest): @pytest.mark.parametrize( "x", [ - [1, 7, 5, 4, 6, 3], - [1, 7], - np.array([1, 7, 5, 4, 6, 3]), - np.array([2, 4, 6, 8, 10]), + [1, 7, 5], + [2, 4, 6, 8, 10], ], ) @pytest.mark.parametrize( "bins", [[0, 2, 4, 6, 10, 10], [1, 2, 2, 3, 3]], ) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) @pytest.mark.parametrize("precision", [1, 3]) -def test_cut_drop_duplicates_raises(x, bins, right, precision, include_lowest): +def test_cut_drop_duplicates_raises( + x, one_dimensional_array_types, bins, right, precision, include_lowest +): + arg = one_dimensional_array_types(x) ordered = True duplicates = "raise" msg = "Bin edges must be unique" with pytest.raises(ValueError, match=msg): cut( - x=x, + x=arg, bins=bins, right=right, precision=precision, @@ -214,7 +219,7 @@ def test_cut_drop_duplicates_raises(x, bins, right, precision, include_lowest): ) with pytest.raises(ValueError, match=msg): pd.cut( - x=x, + x=arg, bins=bins, right=right, precision=precision, @@ -227,20 +232,19 @@ def test_cut_drop_duplicates_raises(x, bins, right, precision, include_lowest): @pytest.mark.parametrize( "x", [ - [0, 0.5, 1.5, 2.5, 4.5], - [1, 7, 5, 4, 6, 3], - [1, 7], - np.array([1, 7, 5, 4, 6, 3]), - np.array([2, 4, 6, 8, 10]), + [1, 7, 5], + [2, 4, 6, 8, 10], ], ) -@pytest.mark.parametrize("right", [True, False]) @pytest.mark.parametrize("precision", [1, 3]) @pytest.mark.parametrize("duplicates", ["drop", "raise"]) -def test_cut_intervalindex_bin(x, right, precision, duplicates): +def test_cut_intervalindex_bin( + x, one_dimensional_array_types, right, precision, duplicates +): + arg = one_dimensional_array_types(x) bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) pcat = pd.cut( - x=x, + x=arg, bins=bins, right=right, precision=precision, @@ -248,7 +252,7 @@ def test_cut_intervalindex_bin(x, right, precision, duplicates): ) pindex = pd.CategoricalIndex(pcat) gindex = cut( - x=x, + x=arg, bins=bins, right=right, precision=precision, @@ -259,8 +263,6 @@ def test_cut_intervalindex_bin(x, right, precision, duplicates): @pytest.mark.parametrize("bins", [1, 3]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) def test_cut_series(bins, right, include_lowest): x = pd.Series(np.array([2, 4, 6, 8, 10]), index=["a", "b", "c", "d", "e"]) precision = 3 diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/general_functions/test_register_accessor.py similarity index 100% rename from python/cudf/cudf/tests/test_custom_accessor.py rename to python/cudf/cudf/tests/general_functions/test_register_accessor.py diff --git a/python/cudf/cudf/tests/series/test_attributes.py b/python/cudf/cudf/tests/series/test_attributes.py index f105992872d..b79b1010cc3 100644 --- a/python/cudf/cudf/tests/series/test_attributes.py +++ b/python/cudf/cudf/tests/series/test_attributes.py @@ -251,6 +251,14 @@ def test_timedelta_contains(data, timedelta_types_as_str, scalar): assert_eq(expected, actual) +def test_cai_after_indexing(): + df = cudf.DataFrame({"a": [1, 2, 3]}) + cai1 = df["a"].__cuda_array_interface__ + df[["a"]] + cai2 = df["a"].__cuda_array_interface__ + assert cai1 == cai2 + + @pytest.mark.parametrize( "data, expected", [ diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py 
index d4588005999..342f60c1580 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -1,8 +1,10 @@ # Copyright (c) 2023-2025, NVIDIA CORPORATION. import datetime import decimal +import types import cupy as cp +import numba.cuda import numpy as np import pandas as pd import pyarrow as pa @@ -14,6 +16,7 @@ PANDAS_GE_210, PANDAS_VERSION, ) +from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column.column import as_column from cudf.errors import MixedTypeError from cudf.testing import assert_eq @@ -778,6 +781,195 @@ def test_series_arrow_decimal_types_roundtrip(pa_type): assert_eq(pdf, gdf) +@pytest.mark.parametrize("module", ["cupy", "numba"]) +def test_cuda_array_interface_interop_in( + numeric_and_temporal_types_as_str, module +): + if module == "cupy": + module_constructor = cp.array + if numeric_and_temporal_types_as_str.startswith( + "datetime" + ) or numeric_and_temporal_types_as_str.startswith("timedelta"): + pytest.skip( + f"cupy doesn't support {numeric_and_temporal_types_as_str}" + ) + elif module == "numba": + module_constructor = numba.cuda.to_device + + np_data = np.arange(10).astype(numeric_and_temporal_types_as_str) + module_data = module_constructor(np_data) + + pd_data = pd.Series(np_data) + # Test using a specific function for __cuda_array_interface__ here + cudf_data = cudf.Series(module_data) + + assert_eq(pd_data, cudf_data) + + gdf = cudf.DataFrame() + gdf["test"] = module_data + pd_data.name = "test" + assert_eq(pd_data, gdf["test"]) + + +@pytest.mark.parametrize("module", ["cupy", "numba"]) +def test_cuda_array_interface_interop_out( + numeric_and_temporal_types_as_str, module +): + if module == "cupy": + module_constructor = cp.asarray + + def to_host_function(x): + return cp.asnumpy(x) + elif module == "numba": + module_constructor = numba.cuda.as_cuda_array + + def to_host_function(x): + return x.copy_to_host() + + np_data = np.arange(10).astype(numeric_and_temporal_types_as_str) + cudf_data = cudf.Series(np_data) + assert isinstance(cudf_data.__cuda_array_interface__, dict) + + module_data = module_constructor(cudf_data) + got = to_host_function(module_data) + + expect = np_data + + assert_eq(expect, got) + + +def test_cuda_array_interface_interop_out_masked( + numeric_and_temporal_types_as_str, +): + np_data = np.arange(10).astype("float64") + np_data[[0, 2, 4, 6, 8]] = np.nan + + cudf_data = cudf.Series(np_data).astype(numeric_and_temporal_types_as_str) + cai = cudf_data.__cuda_array_interface__ + assert isinstance(cai, dict) + assert "mask" in cai + + +@pytest.mark.parametrize("nulls", ["all", "some", "bools", "none"]) +@pytest.mark.parametrize("mask_type", ["bits", "bools"]) +def test_cuda_array_interface_as_column( + numeric_and_temporal_types_as_str, nulls, mask_type +): + sr = cudf.Series(np.arange(10)) + + if nulls == "some": + mask = [ + True, + False, + True, + False, + False, + True, + True, + False, + True, + True, + ] + sr[sr[~np.asarray(mask)]] = None + elif nulls == "all": + sr[:] = None + + sr = sr.astype(numeric_and_temporal_types_as_str) + + obj = types.SimpleNamespace( + __cuda_array_interface__=sr.__cuda_array_interface__ + ) + + if mask_type == "bools": + if nulls == "some": + obj.__cuda_array_interface__["mask"] = numba.cuda.to_device(mask) + elif nulls == "all": + obj.__cuda_array_interface__["mask"] = numba.cuda.to_device( + [False] * 10 + ) + + expect = sr + got = cudf.Series(obj) + + assert_eq(expect, got) + + +def 
test_series_from_ephemeral_cupy(): + # Test that we keep a reference to the ephemeral + # CuPy array. If we didn't, then `a` would end + # up referring to the same memory as `b` due to + # CuPy's caching allocator + a = cudf.Series(cp.asarray([1, 2, 3])) + b = cudf.Series(cp.asarray([1, 1, 1])) + assert_eq(pd.Series([1, 2, 3]), a) + assert_eq(pd.Series([1, 1, 1]), b) + + +def test_column_from_ephemeral_cupy_try_lose_reference(): + # Try to lose the reference we keep to the ephemeral + # CuPy array + a = cudf.Series(cp.asarray([1, 2, 3]))._column + a = cudf.core.column.as_column(a) + b = cp.asarray([1, 1, 1]) + assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) + + a = cudf.Series(cp.asarray([1, 2, 3]))._column + a.name = "b" + b = cp.asarray([1, 1, 1]) # noqa: F841 + assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) + + +@pytest.mark.xfail( + get_global_manager() is not None, + reason=( + "spilling doesn't support PyTorch, see " + "`cudf.core.buffer.spillable_buffer.DelayedPointerTuple`" + ), +) +def test_cuda_array_interface_pytorch(): + torch = pytest.importorskip("torch", minversion="2.4.0") + if not torch.cuda.is_available(): + pytest.skip("need gpu version of pytorch to be installed") + + series = cudf.Series([1, -1, 10, -56]) + tensor = torch.tensor(series) + got = cudf.Series(tensor) + + assert_eq(got, series) + buffer = cudf.core.buffer.as_buffer(cp.ones(10, dtype=np.bool_)) + tensor = torch.tensor(buffer) + got = cudf.Series(tensor, dtype=np.bool_) + + assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) + + index = cudf.Index([], dtype="float64") + tensor = torch.tensor(index) + got = cudf.Index(tensor) + assert_eq(got, index) + + index = cudf.RangeIndex(start=0, stop=3) + tensor = torch.tensor(index) + got = cudf.Series(tensor) + + assert_eq(got, cudf.Series(index)) + + index = cudf.Index([1, 2, 8, 6]) + tensor = torch.tensor(index) + got = cudf.Index(tensor) + + assert_eq(got, index) + + str_series = cudf.Series(["a", "g"]) + + with pytest.raises(AttributeError): + str_series.__cuda_array_interface__ + + cat_series = str_series.astype("category") + + with pytest.raises(TypeError): + cat_series.__cuda_array_interface__ + + def test_series_arrow_struct_types_roundtrip(): ps = pd.Series( [{"a": 1}, {"b": "abc"}], diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py deleted file mode 100644 index e163f62282b..00000000000 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
- -import types -from contextlib import nullcontext as does_not_raise - -import cupy -import numba.cuda -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core.buffer.spill_manager import get_global_manager -from cudf.testing import assert_eq -from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) -@pytest.mark.parametrize("module", ["cupy", "numba"]) -def test_cuda_array_interface_interop_in(dtype, module): - np_data = np.arange(10).astype(dtype) - - expectation = does_not_raise() - if module == "cupy": - module_constructor = cupy.array - if dtype in DATETIME_TYPES: - expectation = pytest.raises(ValueError) - elif module == "numba": - module_constructor = numba.cuda.to_device - - with expectation: - module_data = module_constructor(np_data) - - pd_data = pd.Series(np_data) - # Test using a specific function for __cuda_array_interface__ here - cudf_data = cudf.Series(module_data) - - assert_eq(pd_data, cudf_data) - - gdf = cudf.DataFrame() - gdf["test"] = module_data - pd_data.name = "test" - assert_eq(pd_data, gdf["test"]) - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["str"] -) -@pytest.mark.parametrize("module", ["cupy", "numba"]) -def test_cuda_array_interface_interop_out(dtype, module): - expectation = does_not_raise() - if dtype == "str": - expectation = pytest.raises(AttributeError) - if module == "cupy": - module_constructor = cupy.asarray - - def to_host_function(x): - return cupy.asnumpy(x) - - elif module == "numba": - module_constructor = numba.cuda.as_cuda_array - - def to_host_function(x): - return x.copy_to_host() - - with expectation: - np_data = np.arange(10).astype(dtype) - cudf_data = cudf.Series(np_data) - assert isinstance(cudf_data.__cuda_array_interface__, dict) - - module_data = module_constructor(cudf_data) - got = to_host_function(module_data) - - expect = np_data - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES -) -@pytest.mark.parametrize("module", ["cupy", "numba"]) -def test_cuda_array_interface_interop_out_masked(dtype, module): - expectation = does_not_raise() - if module == "cupy": - pytest.skip( - "cupy doesn't support version 1 of `__cuda_array_interface__` yet" - ) - module_constructor = cupy.asarray - - def to_host_function(x): - return cupy.asnumpy(x) - - elif module == "numba": - expectation = pytest.raises(NotImplementedError) - module_constructor = numba.cuda.as_cuda_array - - def to_host_function(x): - return x.copy_to_host() - - np_data = np.arange(10).astype("float64") - np_data[[0, 2, 4, 6, 8]] = np.nan - - with expectation: - cudf_data = cudf.Series(np_data).astype(dtype) - assert isinstance(cudf_data.__cuda_array_interface__, dict) - - module_data = module_constructor(cudf_data) # noqa: F841 - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES -) -@pytest.mark.parametrize("nulls", ["all", "some", "bools", "none"]) -@pytest.mark.parametrize("mask_type", ["bits", "bools"]) -def test_cuda_array_interface_as_column(dtype, nulls, mask_type): - sr = cudf.Series(np.arange(10)) - - if nulls == "some": - mask = [ - True, - False, - True, - False, - False, - True, - True, - False, - True, - True, - ] - sr[sr[~np.asarray(mask)]] = None - elif nulls == "all": - sr[:] = None - - sr = sr.astype(dtype) - - obj = types.SimpleNamespace( - 
__cuda_array_interface__=sr.__cuda_array_interface__ - ) - - if mask_type == "bools": - if nulls == "some": - obj.__cuda_array_interface__["mask"] = numba.cuda.to_device(mask) - elif nulls == "all": - obj.__cuda_array_interface__["mask"] = numba.cuda.to_device( - [False] * 10 - ) - - expect = sr - got = cudf.Series(obj) - - assert_eq(expect, got) - - -def test_column_from_ephemeral_cupy(): - # Test that we keep a reference to the ephemeral - # CuPy array. If we didn't, then `a` would end - # up referring to the same memory as `b` due to - # CuPy's caching allocator - a = cudf.Series(cupy.asarray([1, 2, 3])) - b = cudf.Series(cupy.asarray([1, 1, 1])) - assert_eq(pd.Series([1, 2, 3]), a) - assert_eq(pd.Series([1, 1, 1]), b) - - -def test_column_from_ephemeral_cupy_try_lose_reference(): - # Try to lose the reference we keep to the ephemeral - # CuPy array - a = cudf.Series(cupy.asarray([1, 2, 3]))._column - a = cudf.core.column.as_column(a) - b = cupy.asarray([1, 1, 1]) - assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) - - a = cudf.Series(cupy.asarray([1, 2, 3]))._column - a.name = "b" - b = cupy.asarray([1, 1, 1]) # noqa: F841 - assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) - - -@pytest.mark.xfail( - get_global_manager() is not None, - reason=( - "spilling doesn't support PyTorch, see " - "`cudf.core.buffer.spillable_buffer.DelayedPointerTuple`" - ), -) -def test_cuda_array_interface_pytorch(): - torch = pytest.importorskip("torch", minversion="2.4.0") - if not torch.cuda.is_available(): - pytest.skip("need gpu version of pytorch to be installed") - - series = cudf.Series([1, -1, 10, -56]) - tensor = torch.tensor(series) - got = cudf.Series(tensor) - - assert_eq(got, series) - buffer = cudf.core.buffer.as_buffer(cupy.ones(10, dtype=np.bool_)) - tensor = torch.tensor(buffer) - got = cudf.Series(tensor, dtype=np.bool_) - - assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) - - index = cudf.Index([], dtype="float64") - tensor = torch.tensor(index) - got = cudf.Index(tensor) - assert_eq(got, index) - - index = cudf.core.index.RangeIndex(start=0, stop=100) - tensor = torch.tensor(index) - got = cudf.Series(tensor) - - assert_eq(got, cudf.Series(index)) - - index = cudf.Index([1, 2, 8, 6]) - tensor = torch.tensor(index) - got = cudf.Index(tensor) - - assert_eq(got, index) - - str_series = cudf.Series(["a", "g"]) - - with pytest.raises(AttributeError): - str_series.__cuda_array_interface__ - - cat_series = str_series.astype("category") - - with pytest.raises(TypeError): - cat_series.__cuda_array_interface__ - - -def test_cai_after_indexing(): - df = cudf.DataFrame({"a": [1, 2, 3]}) - cai1 = df["a"].__cuda_array_interface__ - df[["a"]] - cai2 = df["a"].__cuda_array_interface__ - assert cai1 == cai2 From 42b1e19db324c4b11ae4628c0798f6bc606fe036 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 14 Aug 2025 22:45:01 -0400 Subject: [PATCH 132/366] Match polars semantics for rolling-sum with all-null windows (non-empty) (#19680) Closes https://github.com/rapidsai/cudf/issues/19679. 
Also unblocks #19242 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19680 --- .../cudf_polars/cudf_polars/dsl/translate.py | 6 +- .../cudf_polars/dsl/utils/aggregations.py | 80 +++++++++++++++---- .../cudf_polars/dsl/utils/groupby.py | 7 +- .../cudf_polars/dsl/utils/rolling.py | 8 +- .../tests/expressions/test_rolling.py | 14 ++++ python/cudf_polars/tests/test_groupby.py | 12 +++ python/cudf_polars/tests/test_rolling.py | 14 ++++ 7 files changed, 124 insertions(+), 17 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 5c5537be43e..aa82883a51d 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -21,6 +21,7 @@ from cudf_polars.containers import DataType from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.expressions.base import ExecutionContext from cudf_polars.dsl.to_ast import insert_colrefs from cudf_polars.dsl.utils.aggregations import decompose_single_agg from cudf_polars.dsl.utils.groupby import rewrite_groupby @@ -691,7 +692,10 @@ def _( agg = translator.translate_expr(n=node.function, schema=schema) name_generator = unique_names(schema) aggs, named_post_agg = decompose_single_agg( - expr.NamedExpr(next(name_generator), agg), name_generator, is_top=True + expr.NamedExpr(next(name_generator), agg), + name_generator, + is_top=True, + context=ExecutionContext.ROLLING, ) named_aggs = [agg for agg, _ in aggs] orderby = node.options.index_column diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py index a8280c4c3bc..e1ed7a0d3a2 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py @@ -15,6 +15,7 @@ from cudf_polars.containers import DataType from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.expressions.base import ExecutionContext if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable, Sequence @@ -53,6 +54,7 @@ def decompose_single_agg( name_generator: Generator[str, None, None], *, is_top: bool, + context: ExecutionContext, ) -> tuple[list[tuple[expr.NamedExpr, bool]], expr.NamedExpr]: """ Decompose a single named aggregation. @@ -65,6 +67,8 @@ def decompose_single_agg( Generator of unique names for temporaries introduced during decomposition. is_top Is this the top of an aggregation expression? + context + ExecutionContext in which the aggregation will run. 
Returns ------- @@ -94,7 +98,10 @@ def decompose_single_agg( # Special case to fill nulls with zeros for empty group length calculations (child,) = agg.children child_agg, post = decompose_single_agg( - expr.NamedExpr(next(name_generator), child), name_generator, is_top=True + expr.NamedExpr(next(name_generator), child), + name_generator, + is_top=True, + context=context, ) return child_agg, named_expr.reconstruct( replace_nulls( @@ -121,7 +128,10 @@ def decompose_single_agg( # pl.col("a").nan_max or nan_min raise NotImplementedError("Nan propagation in groupby for min/max") aggs, _ = decompose_single_agg( - expr.NamedExpr(next(name_generator), child), name_generator, is_top=False + expr.NamedExpr(next(name_generator), child), + name_generator, + is_top=False, + context=context, ) if any(has_agg for _, has_agg in aggs): raise NotImplementedError("Nested aggs in groupby not supported") @@ -143,14 +153,43 @@ def decompose_single_agg( ) else expr.Col(agg.dtype, name) ) - return [(named_expr, True)], expr.NamedExpr( - name, - # In polars sum(empty_group) => 0, but in libcudf - # sum(empty_group) => null So must post-process by - # replacing nulls, but only if we're a "top-level" - # agg. - replace_nulls(col, 0, is_top=is_top), - ) + # Polars semantics for sum differ by context: + # - GROUPBY: sum(all-null group) => 0; sum(empty group) => 0 (fill-null) + # - ROLLING: sum(all-null window) => null; sum(empty window) => 0 (fill only if empty) + # + # Must post-process because libcudf returns null for both empty and all-null windows/groups + if context == ExecutionContext.GROUPBY: + # GROUPBY: always fill top-level nulls with 0 + return [(named_expr, True)], expr.NamedExpr( + name, replace_nulls(col, 0, is_top=is_top) + ) + else: + # ROLLING: + # Add a second rolling agg to compute the window size, then only + # replace nulls with 0 when the window size is 0 (ie. empty window). + win_len_name = next(name_generator) + win_len = expr.NamedExpr( + win_len_name, + expr.Len(DataType(pl.Int32())), + ) + + win_len_col = expr.Col(DataType(pl.Int32()), win_len_name) + win_len_filled = replace_nulls(win_len_col, 0, is_top=True) + + is_empty = expr.BinOp( + DataType(pl.Boolean()), + plc.binaryop.BinaryOperator.EQUAL, + win_len_filled, + expr.Literal(DataType(pl.Int32()), 0), + ) + + # If empty -> fill 0; else keep libcudf's semantics for all-null windows. 
+ filled = replace_nulls(col, 0, is_top=is_top) + post_ternary_expr = expr.Ternary(agg.dtype, is_empty, filled, col) + + return [(named_expr, True), (win_len, True)], expr.NamedExpr( + name, post_ternary_expr + ) elif agg.name == "mean": post_agg_col: expr.Expr = expr.Col( DataType(pl.Float64), name @@ -174,6 +213,7 @@ def decompose_single_agg( (expr.NamedExpr(next(name_generator), child) for child in agg.children), name_generator, is_top=False, + context=context, ) if any(has_agg for _, has_agg in aggs): if not all( @@ -202,16 +242,23 @@ def _decompose_aggs( name_generator: Generator[str, None, None], *, is_top: bool, + context: ExecutionContext, ) -> tuple[list[tuple[expr.NamedExpr, bool]], Sequence[expr.NamedExpr]]: new_aggs, post = zip( - *(decompose_single_agg(agg, name_generator, is_top=is_top) for agg in aggs), + *( + decompose_single_agg(agg, name_generator, is_top=is_top, context=context) + for agg in aggs + ), strict=True, ) return list(itertools.chain.from_iterable(new_aggs)), post def decompose_aggs( - aggs: Iterable[expr.NamedExpr], name_generator: Generator[str, None, None] + aggs: Iterable[expr.NamedExpr], + name_generator: Generator[str, None, None], + *, + context: ExecutionContext, ) -> tuple[list[expr.NamedExpr], Sequence[expr.NamedExpr]]: """ Process arbitrary aggregations into a form we can handle in grouped aggregations. @@ -222,6 +269,8 @@ def decompose_aggs( List of aggregation expressions name_generator Generator of unique names for temporaries introduced during decomposition. + context + ExecutionContext in which the aggregation will run. Returns ------- @@ -241,7 +290,7 @@ def decompose_aggs( NotImplementedError For unsupported aggregation combinations. """ - new_aggs, post = _decompose_aggs(aggs, name_generator, is_top=True) + new_aggs, post = _decompose_aggs(aggs, name_generator, is_top=True, context=context) return [agg for agg, _ in new_aggs], post @@ -250,6 +299,7 @@ def apply_pre_evaluation( keys: Sequence[expr.NamedExpr], original_aggs: Sequence[expr.NamedExpr], name_generator: Generator[str, None, None], + context: ExecutionContext, *extra_columns: expr.NamedExpr, ) -> tuple[Sequence[expr.NamedExpr], Schema, Callable[[ir.IR], ir.IR]]: """ @@ -265,6 +315,8 @@ def apply_pre_evaluation( Aggregation expressions to rewrite. name_generator Generator of unique names for temporaries introduced during decomposition. + context + ExecutionContext in which the aggregation will run. extra_columns Any additional columns to be included in the output (only relevant for rolling aggregations). Columns will appear in the @@ -285,7 +337,7 @@ def apply_pre_evaluation( NotImplementedError If the aggregations are somehow unsupported. 
""" - aggs, post = decompose_aggs(original_aggs, name_generator) + aggs, post = decompose_aggs(original_aggs, name_generator, context=context) assert len(post) == len(original_aggs), ( f"Unexpected number of post-aggs {len(post)=} {len(original_aggs)=}" ) diff --git a/python/cudf_polars/cudf_polars/dsl/utils/groupby.py b/python/cudf_polars/cudf_polars/dsl/utils/groupby.py index 7116bcab022..027af9c08be 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/groupby.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/groupby.py @@ -10,6 +10,7 @@ import pylibcudf as plc from cudf_polars.dsl import ir +from cudf_polars.dsl.expressions.base import ExecutionContext from cudf_polars.dsl.utils.aggregations import apply_pre_evaluation from cudf_polars.dsl.utils.naming import unique_names @@ -78,7 +79,11 @@ def rewrite_groupby( ) aggs, group_schema, apply_post_evaluation = apply_pre_evaluation( - schema, keys, aggs, unique_names(schema.keys()) + schema, + keys, + aggs, + unique_names(schema.keys()), + ExecutionContext.GROUPBY, ) # TODO: use Distinct when the partitioned executor supports it if # the requested aggregations are empty diff --git a/python/cudf_polars/cudf_polars/dsl/utils/rolling.py b/python/cudf_polars/cudf_polars/dsl/utils/rolling.py index d4526537824..526c5823f31 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/rolling.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/rolling.py @@ -10,6 +10,7 @@ import pylibcudf as plc from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.expressions.base import ExecutionContext from cudf_polars.dsl.utils.aggregations import apply_pre_evaluation from cudf_polars.dsl.utils.naming import unique_names from cudf_polars.dsl.utils.windows import offsets_to_windows @@ -81,7 +82,12 @@ def rewrite_rolling( temp_prefix = "_" * max(map(len, schema)) if len(aggs) > 0: aggs, rolling_schema, apply_post_evaluation = apply_pre_evaluation( - schema, keys, aggs, unique_names(temp_prefix), index + schema, + keys, + aggs, + unique_names(temp_prefix), + ExecutionContext.ROLLING, + index, ) else: rolling_schema = schema diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py index 68b1ada463c..0253f14c416 100644 --- a/python/cudf_polars/tests/expressions/test_rolling.py +++ b/python/cudf_polars/tests/expressions/test_rolling.py @@ -142,3 +142,17 @@ def test_rolling_inside_groupby_raises(): q.collect(engine="in-memory") assert_ir_translation_raises(q, NotImplementedError) + + +def test_rolling_sum_all_null_window_returns_null(): + df = pl.LazyFrame( + { + "orderby": [1, 2, 3, 4, 5, 6], + "null_windows": [None, None, 5, None, None, 1], + } + ) + q = df.select( + out=pl.col("null_windows").sum().rolling("orderby", period="2i", closed="both") + ) + # Expected: [null, null, 5, 5, 5, 1] + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index f60a46b52d6..b47cc2962db 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -311,3 +311,15 @@ def test_groupby_mean_type_promotion(df: pl.LazyFrame) -> None: q = df.group_by("key1").agg(pl.col("float").mean()) assert_gpu_result_equal(q, check_row_order=False) + + +def test_groupby_sum_all_null_group_returns_null(): + df = pl.LazyFrame( + { + "key": ["a", "a", "b", "b", "c"], + "null_groups": [None, None, None, 2, None], + } + ) + + q = df.group_by("key").agg(out=pl.col("null_groups").sum()) + assert_gpu_result_equal(q, check_row_order=False) 
diff --git a/python/cudf_polars/tests/test_rolling.py b/python/cudf_polars/tests/test_rolling.py
index 23862cb9adb..dd473078fbd 100644
--- a/python/cudf_polars/tests/test_rolling.py
+++ b/python/cudf_polars/tests/test_rolling.py
@@ -202,3 +202,17 @@ def test_unsupported_agg():
         .agg(pl.col("values").n_unique())
     )
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_rolling_sum_all_null_window_returns_null():
+    df = pl.LazyFrame(
+        {
+            "orderby": [1, 2, 3, 4, 5, 6],
+            "null_windows": [None, None, 5, None, None, 1],
+        }
+    )
+    q = df.rolling("orderby", period="2i", closed="both").agg(
+        out=pl.col("null_windows").sum()
+    )
+    # Expected: [0, 0, 5, 5, 5, 1]
+    assert_gpu_result_equal(q)

From 4a309b436685c605404fd82983673c1e94b7cd8c Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Fri, 15 Aug 2025 11:22:56 +0800
Subject: [PATCH 133/366] Add Java JNI interface to get Gpu UUID (#19646)

Contributes to https://github.com/NVIDIA/spark-rapids/issues/12982

Different GPUs should generate different UUID sequences. I plan to use the GPU card UUID to participate in the calculation of an initial seed, so this PR exposes the GPU UUID. The seed will be calculated on the Java side; the following information will participate in the initial seed:
- GPU card UUID
- Current timestamp
- Java process ID
- Java process start time
- a static incremental sequence ID on the Java side.
......

Maybe more, to make sure no identical UUID sequences are generated.

Authors:
  - Chong Gao (https://github.com/res-life)

Approvers:
  - Liangcai Li (https://github.com/firestarman)

URL: https://github.com/rapidsai/cudf/pull/19646
---
 java/src/main/java/ai/rapids/cudf/Cuda.java    | 18 ++++++++++++++++++
 java/src/main/native/src/CudaJni.cpp           | 14 ++++++++++++++
 .../src/test/java/ai/rapids/cudf/CudaTest.java | 10 ++++++++++
 3 files changed, 42 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/Cuda.java b/java/src/main/java/ai/rapids/cudf/Cuda.java
index 7cc3d30a9cf..36559f5aa0e 100755
--- a/java/src/main/java/ai/rapids/cudf/Cuda.java
+++ b/java/src/main/java/ai/rapids/cudf/Cuda.java
@@ -270,6 +270,16 @@ public static CudaComputeMode getComputeMode() {
     return CudaComputeMode.fromNative(Cuda.getNativeComputeMode());
   }
 
+  /**
+   * Gets the GPU UUID of the current device.
+   *
+   * @return UUID of the current device as a byte array.
+   */
+  public static byte[] getGpuUuid() {
+    return getNativeGpuUuid();
+  }
+
+
   /**
    * Mapping: cudaMemGetInfo(size_t *free, size_t *total)
    */
@@ -400,6 +410,14 @@ static void asyncMemcpy(long dst, long src, long count, CudaMemcpyKind kind) {
    */
   static native int getNativeComputeMode() throws CudaException;
 
+  /**
+   * Gets the GPU UUID of the current device.
+   *
+   * @return UUID of the current device as a byte array.
+   * @throws CudaException on any error
+   */
+  static native byte[] getNativeGpuUuid() throws CudaException;
+
   /**
    * Gets the major CUDA compute capability of the current device.
* diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index c5359c821ae..8b212d671cd 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -216,6 +216,20 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv* env CATCH_STD(env, -2); } +JNIEXPORT jbyteArray JNICALL Java_ai_rapids_cudf_Cuda_getNativeGpuUuid(JNIEnv* env, jclass) +{ + try { + cudf::jni::auto_set_device(env); + int device; + CUDF_CUDA_TRY(cudaGetDevice(&device)); + cudaDeviceProp device_prop; + CUDF_CUDA_TRY(cudaGetDeviceProperties(&device_prop, device)); + cudf::jni::native_jbyteArray jbytes{env, (jbyte*)device_prop.uuid.bytes, 16}; + return jbytes.get_jArray(); + } + CATCH_STD(env, nullptr); +} + JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMajor(JNIEnv* env, jclass) { try { diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java index a741b0a5e31..4e55bb8f903 100644 --- a/java/src/test/java/ai/rapids/cudf/CudaTest.java +++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java @@ -20,6 +20,7 @@ import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertThrows; public class CudaTest { @@ -32,6 +33,15 @@ public void testGetCudaRuntimeInfo() { assert Cuda.getDriverVersion() >= 1000; assert Cuda.getRuntimeVersion() >= 1000; assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId); + + // test UUID + byte[] uuid = Cuda.getGpuUuid(); + assertEquals(uuid.length, 16); + long v = 0; + for (int i = 0; i < uuid.length; i++) { + v += uuid[i]; + } + assertNotEquals(0, v); } @Tag("noSanitizer") From b39b1f21cf69a17c68ada6d90be04ac41818a236 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Aug 2025 10:02:44 -0700 Subject: [PATCH 134/366] Move test_factorize/drop_duplicates.py to new cudf classic test directory (#19681) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19681 --- .../methods/test_drop_duplicates.py} | 130 ++++--------- .../tests/general_functions/test_factorize.py | 59 ++++++ .../indexes/index/methods/test_factorize.py | 42 ++++ .../series/methods/test_drop_duplicates.py | 33 ++++ .../tests/series/methods/test_factorize.py | 86 +++++++++ python/cudf/cudf/tests/test_factorize.py | 179 ------------------ 6 files changed, 261 insertions(+), 268 deletions(-) rename python/cudf/cudf/tests/{test_duplicates.py => dataframe/methods/test_drop_duplicates.py} (86%) create mode 100644 python/cudf/cudf/tests/general_functions/test_factorize.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_factorize.py create mode 100644 python/cudf/cudf/tests/series/methods/test_drop_duplicates.py delete mode 100644 python/cudf/cudf/tests/test_factorize.py diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/dataframe/methods/test_drop_duplicates.py similarity index 86% rename from python/cudf/cudf/tests/test_duplicates.py rename to python/cudf/cudf/tests/dataframe/methods/test_drop_duplicates.py index ce7e80e0dba..dd6479ab949 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ 
b/python/cudf/cudf/tests/dataframe/methods/test_drop_duplicates.py @@ -1,7 +1,5 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. -import itertools -import random import numpy as np import pandas as pd @@ -12,8 +10,6 @@ from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal -# most tests are similar to pandas drop_duplicates - @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) def test_duplicated_with_misspelled_column_name(subset): @@ -28,29 +24,19 @@ def test_duplicated_with_misspelled_column_name(subset): ) -@pytest.mark.parametrize("keep", ["first", "last", False]) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize( - "data", - [ - [1, 2, 4, 5, 6, 6], - [], - ["a", "b", "s", "sd", "a", "b"], - pd.Series(["aaa"] * 10, dtype="object"), - ], -) -def test_drop_duplicates_series(data, keep, ignore_index): - pds = pd.Series(data) - gds = cudf.from_pandas(pds) - - assert_eq( - pds.drop_duplicates(keep=keep, ignore_index=ignore_index), - gds.drop_duplicates(keep=keep, ignore_index=ignore_index), +@pytest.mark.xfail(reason="cudf does not support duplicate column names yet") +def test_drop_duplicates_with_duplicate_column_names(): + df = pd.DataFrame( + [[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"] ) + df = cudf.DataFrame.from_pandas(df) - pds.drop_duplicates(keep=keep, inplace=True, ignore_index=ignore_index) - gds.drop_duplicates(keep=keep, inplace=True, ignore_index=ignore_index) - assert_eq(pds, gds) + result0 = df.drop_duplicates() + assert_eq(result0, df) + + result1 = df.drop_duplicates("a") + expected1 = df[:2] + assert_eq(result1, expected1) def test_drop_duplicates(): @@ -127,20 +113,28 @@ def test_drop_duplicates(): expected = pdf.drop_duplicates("E", keep="last") assert_eq(result, expected) + +def test_drop_duplicates_integers(): pdf = pd.DataFrame( {"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]} ) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) + +def test_drop_duplicates_integers_positive(): pdf = pd.DataFrame([[1, 0], [0, 2]]) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) + +def test_drop_duplicates_integers_negative(): pdf = pd.DataFrame([[-2, 0], [0, -4]]) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) + +def test_drop_duplicates_integers_max(): x = np.iinfo(np.int64).max / 3 * 2 pdf = pd.DataFrame([[-x, x], [0, x + 4]]) gdf = cudf.DataFrame.from_pandas(pdf) @@ -150,28 +144,17 @@ def test_drop_duplicates(): gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) + +def test_drop_duplicates_integers_unique(): pdf = pd.DataFrame([i] * 9 for i in range(16)) pdf = pd.concat([pdf, pd.DataFrame([[1] + [0] * 8])], ignore_index=True) gdf = cudf.DataFrame.from_pandas(pdf) assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) -@pytest.mark.xfail(reason="cudf does not support duplicate column names yet") -def test_drop_duplicates_with_duplicate_column_names(): - df = pd.DataFrame( - [[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"] - ) - df = cudf.DataFrame.from_pandas(df) - - result0 = df.drop_duplicates() - assert_eq(result0, df) - - result1 = df.drop_duplicates("a") - expected1 = df[:2] - assert_eq(result1, expected1) - - -def test_drop_duplicates_for_take_all(): +@pytest.mark.parametrize("subset", ["AAA", ["AAA", "B"]]) +@pytest.mark.parametrize("keep", 
["first", "last", False]) +def test_drop_duplicates_for_take_all(subset, keep): pdf = pd.DataFrame( { "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"], @@ -181,30 +164,8 @@ def test_drop_duplicates_for_take_all(): } ) gdf = cudf.DataFrame.from_pandas(pdf) - # single column - result = gdf.drop_duplicates("AAA") - expected = pdf.drop_duplicates("AAA") - assert_eq(result, expected) - - result = gdf.drop_duplicates("AAA", keep="last") - expected = pdf.drop_duplicates("AAA", keep="last") - assert_eq(result, expected) - - result = gdf.drop_duplicates("AAA", keep=False) - expected = pdf.drop_duplicates("AAA", keep=False) - assert_eq(result, expected) - - # multiple columns - result = gdf.drop_duplicates(["AAA", "B"]) - expected = pdf.drop_duplicates(["AAA", "B"]) - assert_eq(result, expected) - - result = gdf.drop_duplicates(["AAA", "B"], keep="last") - expected = pdf.drop_duplicates(["AAA", "B"], keep="last") - assert_eq(result, expected) - - result = gdf.drop_duplicates(["AAA", "B"], keep=False) - expected = pdf.drop_duplicates(["AAA", "B"], keep=False) + result = gdf.drop_duplicates(subset, keep=keep) + expected = pdf.drop_duplicates(subset, keep=keep) assert_eq(result, expected) @@ -268,21 +229,15 @@ def test_drop_duplicates_empty(df): def test_dataframe_drop_duplicates_numeric_method(): - num_columns = 3 - comb = list(itertools.permutations(range(num_columns), num_columns)) - shuf = list(comb) - random.Random(num_columns).shuffle(shuf) - - def get_pdf(n_dup): - # create dataframe with n_dup duplicate rows - rows = comb + shuf[:n_dup] - random.Random(n_dup).shuffle(rows) - return pd.DataFrame(rows) - - for i in range(5): - pdf = get_pdf(i) - gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) + pdf = pd.DataFrame( + { + "A": [1, 1, 1, 4, 5], + "B": [1, 1, 1, 4, 5], + "C": [1, 1, 1, 4, 5], + } + ) + gdf = cudf.DataFrame.from_pandas(pdf) + assert_eq(gdf.drop_duplicates(), pdf.drop_duplicates()) # subset columns, single columns assert_eq( @@ -299,16 +254,13 @@ def get_pdf(n_dup): ) # subset columns shuffled - cols = list(pdf.columns) - random.Random(3).shuffle(cols) + cols = ["B", "C", "A"] assert_eq(gdf.drop_duplicates(cols), pdf.drop_duplicates(cols)) - random.Random(3).shuffle(cols) assert_eq(gdf.drop_duplicates(cols[:-1]), pdf.drop_duplicates(cols[:-1])) - random.Random(3).shuffle(cols) assert_eq(gdf.drop_duplicates(cols[-1]), pdf.drop_duplicates(cols[-1])) assert_eq( - gdf.drop_duplicates(cols, keep="last"), - pdf.drop_duplicates(cols, keep="last"), + gdf.drop_duplicates(pdf.columns, keep="last"), + pdf.drop_duplicates(pdf.columns, keep="last"), ) @@ -597,12 +549,12 @@ def test_drop_duplicates_multi_index(): expected = pdf.drop_duplicates() result = gdf.drop_duplicates() - assert_eq(result.to_pandas(), expected) + assert_eq(result, expected) # FIXME: to_pandas needed until sort_index support for MultiIndex for col in gdf.columns: assert_eq( - gdf[col].drop_duplicates().to_pandas(), + gdf[col].drop_duplicates(), pdf[col].drop_duplicates(), ) diff --git a/python/cudf/cudf/tests/general_functions/test_factorize.py b/python/cudf/cudf/tests/general_functions/test_factorize.py new file mode 100644 index 00000000000..4f1df7a359a --- /dev/null +++ b/python/cudf/cudf/tests/general_functions/test_factorize.py @@ -0,0 +1,59 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_cudf_factorize_array(): + data = [1, 2, 3, 4, 5] + + parr = np.array(data) + garr = cp.array(data) + + expect = pd.factorize(parr) + got = cudf.factorize(garr) + + assert len(expect) == len(got) + + np.testing.assert_array_equal(expect[0], got[0].get()) + np.testing.assert_array_equal(expect[1], got[1].get()) + + +@pytest.mark.parametrize("pandas_compatibility", [True, False]) +def test_factorize_code_pandas_compatibility(pandas_compatibility): + psr = pd.Series([1, 2, 3, 4, 5]) + gsr = cudf.from_pandas(psr) + + expect = pd.factorize(psr) + with cudf.option_context("mode.pandas_compatible", pandas_compatibility): + got = cudf.factorize(gsr) + assert_eq(got[0], expect[0]) + assert_eq(got[1], expect[1]) + if pandas_compatibility: + assert got[0].dtype == expect[0].dtype + else: + assert got[0].dtype == cudf.dtype("int8") + + +def test_factorize_result_classes(): + data = [1, 2, 3] + + labels, cats = cudf.factorize(cudf.Series(data)) + + assert isinstance(labels, cp.ndarray) + assert isinstance(cats, cudf.Index) + + labels, cats = cudf.factorize(cudf.Index(data)) + + assert isinstance(labels, cp.ndarray) + assert isinstance(cats, cudf.Index) + + labels, cats = cudf.factorize(cp.array(data)) + + assert isinstance(labels, cp.ndarray) + assert isinstance(cats, cp.ndarray) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_factorize.py b/python/cudf/cudf/tests/indexes/index/methods/test_factorize.py new file mode 100644 index 00000000000..1379d0ec9d3 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_factorize.py @@ -0,0 +1,42 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import cupy as cp +import numpy as np +import pandas as pd + +import cudf +from cudf import Index + + +def test_factorize_index_obj(): + rng = np.random.default_rng(seed=0) + + arr = rng.integers(2, size=10, dtype=np.int32) + ser = cudf.Index(arr) + + uvals, labels = ser.factorize() + unique_values, indices = np.unique(arr, return_index=True) + expected_values = unique_values[np.argsort(indices)] + + np.testing.assert_array_equal(labels.values.get(), expected_values) + assert isinstance(uvals, cp.ndarray) + assert isinstance(labels, Index) + + encoder = {labels[idx]: idx for idx in range(len(labels))} + handcoded = [encoder[v] for v in arr] + np.testing.assert_array_equal(uvals.get(), handcoded) + + +def test_cudf_factorize_index(): + data = [1, 2, 3, 4, 5] + + pi = pd.Index(data) + gi = cudf.Index(data) + + expect = pd.factorize(pi) + got = cudf.factorize(gi) + + assert len(expect) == len(got) + + np.testing.assert_array_equal(expect[0], got[0].get()) + np.testing.assert_array_equal(expect[1], got[1].values.get()) diff --git a/python/cudf/cudf/tests/series/methods/test_drop_duplicates.py b/python/cudf/cudf/tests/series/methods/test_drop_duplicates.py new file mode 100644 index 00000000000..68f40879c66 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_drop_duplicates.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("ignore_index", [True, False]) +@pytest.mark.parametrize( + "data", + [ + [1, 2, 4, 5, 6, 6], + [], + ["a", "b", "s", "sd", "a", "b"], + pd.Series(["aaa"] * 10, dtype="object"), + ], +) +def test_drop_duplicates_series(data, keep, ignore_index): + pds = pd.Series(data) + gds = cudf.from_pandas(pds) + + assert_eq( + pds.drop_duplicates(keep=keep, ignore_index=ignore_index), + gds.drop_duplicates(keep=keep, ignore_index=ignore_index), + ) + + pds.drop_duplicates(keep=keep, inplace=True, ignore_index=ignore_index) + gds.drop_duplicates(keep=keep, inplace=True, ignore_index=ignore_index) + assert_eq(pds, gds) diff --git a/python/cudf/cudf/tests/series/methods/test_factorize.py b/python/cudf/cudf/tests/series/methods/test_factorize.py index 98cc8187fce..b64ad0fbf63 100644 --- a/python/cudf/cudf/tests/series/methods/test_factorize.py +++ b/python/cudf/cudf/tests/series/methods/test_factorize.py @@ -1,5 +1,8 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import cupy as cp +import numpy as np +import pandas as pd import pytest import cudf @@ -29,3 +32,86 @@ def test_series_factorize_use_na_sentinel(data, use_na_sentinel, sort): ) assert_eq(expected_labels, actual_labels.get()) assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) + + +def test_factorize_series_obj(): + rng = np.random.default_rng(seed=0) + + arr = rng.integers(2, size=10, dtype=np.int32) + ser = cudf.Series(arr) + + uvals, labels = ser.factorize() + unique_values, indices = np.unique(arr, return_index=True) + expected_values = unique_values[np.argsort(indices)] + + np.testing.assert_array_equal(labels.to_numpy(), expected_values) + assert isinstance(uvals, cp.ndarray) + assert isinstance(labels, cudf.Index) + + encoder = {labels[idx]: idx for idx in range(len(labels))} + handcoded = [encoder[v] for v in arr] + np.testing.assert_array_equal(uvals.get(), handcoded) + + +@pytest.mark.parametrize( + "index", + [ + None, + [ + 2992443.0, + 2992447.0, + 2992466.0, + 2992440.0, + 2992441.0, + 2992442.0, + 2992444.0, + 2992445.0, + 2992446.0, + 2992448.0, + ], + ], +) +def test_factorize_series_index(index): + data = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"] + ser = cudf.Series(data, index=index) + pser = pd.Series(data, index=index) + result_unique, result_labels = ser.factorize() + expected_unique, expected_labels = pser.factorize() + assert_eq(result_unique.get(), expected_unique) + assert_eq( + result_labels.to_pandas().values, + expected_labels.values, + ) + + +def test_cudf_factorize_series(): + data = [1, 2, 3, 4, 5] + + psr = pd.Series(data) + gsr = cudf.Series(data) + + expect = pd.factorize(psr) + got = cudf.factorize(gsr) + + assert len(expect) == len(got) + + np.testing.assert_array_equal(expect[0], got[0].get()) + np.testing.assert_array_equal(expect[1], got[1].values.get()) + + +@pytest.mark.parametrize( + "data", + [ + ["abc", "def", "abc", "a", "def", None], + [10, 20, 100, -10, 0, 1, None, 10, 100], + ], +) +def test_category_dtype_factorize(data): + gs = cudf.Series(data, dtype="category") + ps = gs.to_pandas() + + actual_codes, actual_uniques = gs.factorize() + expected_codes, expected_uniques = ps.factorize() + + assert_eq(actual_codes, expected_codes) + assert_eq(actual_uniques, expected_uniques) diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py deleted file mode 100644 index 
d16e725088f..00000000000 --- a/python/cudf/cudf/tests/test_factorize.py +++ /dev/null @@ -1,179 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import DataFrame, Index -from cudf.testing import assert_eq - - -def test_factorize_series_obj(): - df = DataFrame() - rng = np.random.default_rng(seed=0) - - # initialize data frame - df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32) - - uvals, labels = df["cats"].factorize() - unique_values, indices = np.unique(arr, return_index=True) - expected_values = unique_values[np.argsort(indices)] - - np.testing.assert_array_equal(labels.to_numpy(), expected_values) - assert isinstance(uvals, cp.ndarray) - assert isinstance(labels, Index) - - encoder = {labels[idx]: idx for idx in range(len(labels))} - handcoded = [encoder[v] for v in arr] - np.testing.assert_array_equal(uvals.get(), handcoded) - - -def test_factorize_index_obj(): - df = DataFrame() - rng = np.random.default_rng(seed=0) - - # initialize data frame - df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32) - df = df.set_index("cats") - - uvals, labels = df.index.factorize() - unique_values, indices = np.unique(arr, return_index=True) - expected_values = unique_values[np.argsort(indices)] - - np.testing.assert_array_equal(labels.values.get(), expected_values) - assert isinstance(uvals, cp.ndarray) - assert isinstance(labels, Index) - - encoder = {labels[idx]: idx for idx in range(len(labels))} - handcoded = [encoder[v] for v in arr] - np.testing.assert_array_equal(uvals.get(), handcoded) - - -def test_factorize_series_index(): - df = DataFrame() - df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"] - df["col2"] = [ - 2992443.0, - 2992447.0, - 2992466.0, - 2992440.0, - 2992441.0, - 2992442.0, - 2992444.0, - 2992445.0, - 2992446.0, - 2992448.0, - ] - assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0]) - assert_eq( - df.col1.factorize()[1].to_pandas().values, - df.to_pandas().col1.factorize()[1].values, - ) - - df = df.set_index("col2") - - assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0]) - assert_eq( - df.col1.factorize()[1].to_pandas().values, - df.to_pandas().col1.factorize()[1].values, - ) - - -def test_cudf_factorize_series(): - data = [1, 2, 3, 4, 5] - - psr = pd.Series(data) - gsr = cudf.Series(data) - - expect = pd.factorize(psr) - got = cudf.factorize(gsr) - - assert len(expect) == len(got) - - np.testing.assert_array_equal(expect[0], got[0].get()) - np.testing.assert_array_equal(expect[1], got[1].values.get()) - - -def test_cudf_factorize_index(): - data = [1, 2, 3, 4, 5] - - pi = pd.Index(data) - gi = cudf.Index(data) - - expect = pd.factorize(pi) - got = cudf.factorize(gi) - - assert len(expect) == len(got) - - np.testing.assert_array_equal(expect[0], got[0].get()) - np.testing.assert_array_equal(expect[1], got[1].values.get()) - - -def test_cudf_factorize_array(): - data = [1, 2, 3, 4, 5] - - parr = np.array(data) - garr = cp.array(data) - - expect = pd.factorize(parr) - got = cudf.factorize(garr) - - assert len(expect) == len(got) - - np.testing.assert_array_equal(expect[0], got[0].get()) - np.testing.assert_array_equal(expect[1], got[1].get()) - - -@pytest.mark.parametrize("pandas_compatibility", [True, False]) -def test_factorize_code_pandas_compatibility(pandas_compatibility): - psr = pd.Series([1, 2, 3, 4, 5]) - gsr = cudf.from_pandas(psr) - - expect = pd.factorize(psr) - with 
cudf.option_context("mode.pandas_compatible", pandas_compatibility): - got = cudf.factorize(gsr) - assert_eq(got[0], expect[0]) - assert_eq(got[1], expect[1]) - if pandas_compatibility: - assert got[0].dtype == expect[0].dtype - else: - assert got[0].dtype == cudf.dtype("int8") - - -def test_factorize_result_classes(): - data = [1, 2, 3] - - labels, cats = cudf.factorize(cudf.Series(data)) - - assert isinstance(labels, cp.ndarray) - assert isinstance(cats, cudf.Index) - - labels, cats = cudf.factorize(cudf.Index(data)) - - assert isinstance(labels, cp.ndarray) - assert isinstance(cats, cudf.Index) - - labels, cats = cudf.factorize(cp.array(data)) - - assert isinstance(labels, cp.ndarray) - assert isinstance(cats, cp.ndarray) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "def", "abc", "a", "def", None], - [10, 20, 100, -10, 0, 1, None, 10, 100], - ], -) -def test_category_dtype_factorize(data): - gs = cudf.Series(data, dtype="category") - ps = gs.to_pandas() - - actual_codes, actual_uniques = gs.factorize() - expected_codes, expected_uniques = ps.factorize() - - assert_eq(actual_codes, expected_codes) - assert_eq(actual_uniques, expected_uniques) From c1d37301bb46b8d5d967a79b32de17123acffb22 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Aug 2025 10:09:12 -0700 Subject: [PATCH 135/366] Move test_groupby to new cudf classic directory structure (#19688) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19688 --- python/cudf/cudf/tests/groupby/test_agg.py | 53 + python/cudf/cudf/tests/groupby/test_apply.py | 889 +++++++- .../cudf/tests/groupby/test_attributes.py | 49 + .../cudf/tests/groupby/test_cummulative.py | 13 +- python/cudf/cudf/tests/groupby/test_diff.py | 12 + python/cudf/cudf/tests/groupby/test_ffill.py | 22 + .../cudf/cudf/tests/groupby/test_get_group.py | 99 +- .../cudf/cudf/tests/groupby/test_head_tail.py | 106 + python/cudf/cudf/tests/groupby/test_ngroup.py | 34 + python/cudf/cudf/tests/groupby/test_nth.py | 34 + .../cudf/tests/groupby/test_pct_change.py | 98 + .../cudf/tests/groupby/test_reductions.py | 311 ++- python/cudf/cudf/tests/groupby/test_sample.py | 101 + python/cudf/cudf/tests/groupby/test_shift.py | 24 +- python/cudf/cudf/tests/groupby/test_size.py | 19 + .../cudf/cudf/tests/groupby/test_transform.py | 39 +- .../cudf/tests/groupby/test_value_counts.py | 77 + python/cudf/cudf/tests/test_groupby.py | 1917 ----------------- 18 files changed, 1972 insertions(+), 1925 deletions(-) create mode 100644 python/cudf/cudf/tests/groupby/test_ffill.py create mode 100644 python/cudf/cudf/tests/groupby/test_head_tail.py create mode 100644 python/cudf/cudf/tests/groupby/test_ngroup.py create mode 100644 python/cudf/cudf/tests/groupby/test_pct_change.py create mode 100644 python/cudf/cudf/tests/groupby/test_sample.py create mode 100644 python/cudf/cudf/tests/groupby/test_size.py create mode 100644 python/cudf/cudf/tests/groupby/test_value_counts.py delete mode 100644 python/cudf/cudf/tests/test_groupby.py diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py index 42c74f967bb..07d069ef8ff 100644 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ b/python/cudf/cudf/tests/groupby/test_agg.py @@ -633,3 +633,56 @@ def test_groupby_mix_agg_scan(): 
         gb.agg(func[1:])
     with pytest.raises(NotImplementedError, match=err_msg):
         gb.agg(func)
+
+
+@pytest.mark.parametrize(
+    "op", ["cummax", "cummin", "cumprod", "cumsum", "mean", "median"]
+)
+def test_group_by_raises_string_error(op):
+    df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": ["a", "b", "c", "d", "e"]})
+
+    with pytest.raises(TypeError):
+        df.groupby(df.a).agg(op)
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        "cummax",
+        "cummin",
+        "cumprod",
+        "cumsum",
+        "mean",
+        "median",
+        "prod",
+        "sum",
+        list,
+    ],
+)
+def test_group_by_raises_category_error(op):
+    df = cudf.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5],
+            "b": cudf.Series(["a", "b", "c", "d", "e"], dtype="category"),
+        }
+    )
+
+    with pytest.raises(TypeError):
+        df.groupby(df.a).agg(op)
+
+
+def test_agg_duplicate_aggs_pandas_compat_raises():
+    agg = {"b": ["mean", "mean"]}
+    dfgb = cudf.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]}).groupby(["a"])
+    with cudf.option_context("mode.pandas_compatible", True):
+        with pytest.raises(NotImplementedError):
+            dfgb.agg(agg)
+
+    with pytest.warns(UserWarning):
+        result = dfgb.agg(agg)
+    expected = cudf.DataFrame(
+        [4.5, 6.0],
+        index=cudf.Index([1, 2], name="a"),
+        columns=pd.MultiIndex.from_tuples([("b", "mean")]),
+    )
+    assert_groupby_results_equal(result, expected)
diff --git a/python/cudf/cudf/tests/groupby/test_apply.py b/python/cudf/cudf/tests/groupby/test_apply.py
index 1c340a34e2a..8458d3069d0 100644
--- a/python/cudf/cudf/tests/groupby/test_apply.py
+++ b/python/cudf/cudf/tests/groupby/test_apply.py
@@ -1,16 +1,25 @@
 # Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
+import textwrap
+from functools import partial
+
 import numpy as np
 import pandas as pd
 import pytest
 from numba import cuda
 
 import cudf
+from cudf import DataFrame
 from cudf.core._compat import (
     PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_GE_220,
     PANDAS_VERSION,
 )
-from cudf.testing import assert_groupby_results_equal
+from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops
+from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES
+from cudf.core.udf.utils import UDFError, precompiled
+from cudf.testing import assert_eq, assert_groupby_results_equal
+from cudf.testing._utils import expect_warning_if
 
 
 @pytest.fixture(params=["cudf", "jit"])
@@ -139,3 +148,881 @@ def foo(key1, val1, com1, com2):
     expect["com2"] = np.zeros(20, dtype=np.int32)
 
     assert_groupby_results_equal(expect, got)
+
+
+@pytest.fixture
+def groupby_jit_data_small():
+    """
+    Return a small dataset for testing JIT GroupBy apply. The dataframe
+    contains 4 groups of size 1, 2, 3, 4 as well as an additional key
+    column that can be used to test subgroups within groups. This data
+    is useful for smoke testing basic numeric results.
+    """
+    rng = np.random.default_rng(42)
+    df = DataFrame(
+        {
+            "key1": [1] + [2] * 2 + [3] * 3 + [4] * 4,
+            "key2": [1, 2] * 5,
+            "val1": rng.integers(0, 10, 10),
+            "val2": rng.integers(0, 10, 10),
+        }
+    )
+    # randomly permute data
+    df = df.sample(frac=1, random_state=1, ignore_index=True)
+    return df
+
+
+@pytest.fixture
+def groupby_jit_data_large(groupby_jit_data_small):
+    """
+    Larger version of groupby_jit_data_small which contains enough data
+    to require more than one block per group. This data is useful for
+    testing whether JIT GroupBy algorithms scale to larger datasets without
+    manifesting numerical issues such as overflow.
+ """ + max_tpb = 1024 + factor = ( + max_tpb + 1 + ) # bigger than a block but not always an exact multiple + df = cudf.concat([groupby_jit_data_small] * factor) + + return df + + +@pytest.fixture +def groupby_jit_data_nans(groupby_jit_data_small): + """ + Returns a modified version of groupby_jit_data_small which contains + nan values. + """ + + df = groupby_jit_data_small.sort_values(["key1", "key2"]) + df["val1"] = df["val1"].astype("float64") + df.loc[df.index[::2], "val1"] = np.nan + df = df.sample(frac=1, random_state=1, ignore_index=True) + return df + + +@pytest.fixture +def groupby_jit_datasets( + groupby_jit_data_small, groupby_jit_data_large, groupby_jit_data_nans +): + return { + "small": groupby_jit_data_small, + "large": groupby_jit_data_large, + "nans": groupby_jit_data_nans, + } + + +def run_groupby_apply_jit_test(data, func, keys, *args): + expect_groupby_obj = data.to_pandas().groupby(keys) + got_groupby_obj = data.groupby(keys) + + # compare cuDF jit to pandas + cudf_jit_result = got_groupby_obj.apply( + func, *args, engine="jit", include_groups=False + ) + pandas_result = expect_groupby_obj.apply(func, *args, include_groups=False) + assert_groupby_results_equal(cudf_jit_result, pandas_result) + + +def groupby_apply_jit_reductions_test_inner(func, data, dtype): + # ideally we'd just have: + # lambda group: getattr(group, func)() + # but the current kernel caching mechanism relies on pickle which + # does not play nice with local functions. What's below uses + # exec as a workaround to write the test functions dynamically + + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + + run_groupby_apply_jit_test(data, func, ["key1"]) + + +# test unary reductions +@pytest.mark.parametrize( + "dtype", + SUPPORTED_GROUPBY_NUMPY_TYPES, + ids=[str(t) for t in SUPPORTED_GROUPBY_NUMPY_TYPES], +) +@pytest.mark.parametrize( + "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] +) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) +def test_groupby_apply_jit_unary_reductions( + request, func, dtype, dataset, groupby_jit_datasets +): + request.applymarker( + pytest.mark.xfail( + condition=( + ( + dataset == "nans" + and func in {"var", "std", "mean"} + and str(dtype) in {"int64", "float32", "float64"} + ) + or ( + dataset == "nans" + and func in {"idxmax", "idxmin", "sum"} + and dtype.kind == "f" + ) + ), + reason=("https://github.com/rapidsai/cudf/issues/14860"), + ) + ) + warn_condition = ( + dataset == "nans" + and func in {"idxmax", "idxmin"} + and dtype.kind == "f" + ) + dataset = groupby_jit_datasets[dataset].copy(deep=True) + with expect_warning_if(warn_condition, FutureWarning): + groupby_apply_jit_reductions_test_inner(func, dataset, dtype) + + +# test unary reductions for special values +def groupby_apply_jit_reductions_special_vals_inner( + func, data, dtype, special_val +): + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + data["val1"] = special_val + data["val2"] = special_val + + run_groupby_apply_jit_test(data, func, ["key1"]) + + +# test 
unary index reductions for special values +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def groupby_apply_jit_idx_reductions_special_vals_inner( + func, data, dtype, special_val +): + funcstr = textwrap.dedent( + f""" + def func(df): + return df['val1'].{func}() + """ + ) + lcl = {} + exec(funcstr, lcl) + func = lcl["func"] + + data["val1"] = data["val1"].astype(dtype) + data["val2"] = data["val2"].astype(dtype) + data["val1"] = special_val + data["val2"] = special_val + + run_groupby_apply_jit_test(data, func, ["key1"]) + + +@pytest.mark.parametrize("dtype", ["float64", "float32"]) +@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) +@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) +def test_groupby_apply_jit_reductions_special_vals( + func, dtype, dataset, groupby_jit_datasets, special_val +): + dataset = groupby_jit_datasets[dataset].copy(deep=True) + with expect_warning_if( + func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning + ): + groupby_apply_jit_reductions_special_vals_inner( + func, dataset, dtype, special_val + ) + + +@pytest.mark.parametrize("func", ["idxmax", "idxmin"]) +@pytest.mark.parametrize( + "special_val", + [ + pytest.param( + np.nan, + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13832" + ), + ), + np.inf, + -np.inf, + ], +) +@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="include_groups keyword new in pandas 2.2", +) +def test_groupby_apply_jit_idx_reductions_special_vals( + func, dataset, groupby_jit_datasets, special_val +): + dataset = groupby_jit_datasets[dataset].copy(deep=True) + groupby_apply_jit_idx_reductions_special_vals_inner( + func, dataset, "float64", special_val + ) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_jit_sum_integer_overflow(): + max = np.iinfo("int32").max + + data = DataFrame( + { + "a": [0, 0, 0], + "b": [max, max, max], + } + ) + + def func(group): + return group["b"].sum() + + run_groupby_apply_jit_test(data, func, ["a"]) + + +@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) +@pytest.mark.parametrize( + "dataset", + [ + pytest.param( + "small", + marks=[ + pytest.mark.filterwarnings( + "ignore:Degrees of Freedom <= 0 for slice" + ), + pytest.mark.filterwarnings( + "ignore:divide by zero encountered in divide" + ), + ], + ), + "large", + ], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): + dataset = groupby_jit_datasets[dataset].copy(deep=True) + + dataset["val1"] = dataset["val1"].astype(dtype) + dataset["val2"] = dataset["val2"].astype(dtype) + + keys = ["key1"] + + def func(group): + return group["val1"].corr(group["val2"]) + + if np.dtype(dtype).kind == "f": + # Correlation of floating types is not yet supported: + # https://github.com/rapidsai/cudf/issues/13839 + m = ( + f"Series.corr\\(Series\\) is not " + f"supported for \\({dtype}, {dtype}\\)" + ) + with 
pytest.raises(UDFError, match=m): + run_groupby_apply_jit_test(dataset, func, keys) + return + with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): + run_groupby_apply_jit_test(dataset, func, keys) + + +@pytest.mark.parametrize("dtype", ["int32", "int64"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_jit_correlation_zero_variance(dtype): + # pearson correlation is undefined when the variance of either + # variable is zero. This test ensures that the jit implementation + # returns the same result as pandas in this case. + data = DataFrame( + {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} + ) + + def func(group): + return group["b"].corr(group["c"]) + + with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): + run_groupby_apply_jit_test(data, func, ["a"]) + + +@pytest.mark.parametrize("op", unary_ops) +def test_groupby_apply_jit_invalid_unary_ops_error(groupby_jit_data_small, op): + keys = ["key1"] + + def func(group): + return op(group["val1"]) + + with pytest.raises( + UDFError, + match=f"{op.__name__}\\(Series\\) is not supported by JIT GroupBy", + ): + run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) + + +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_groupby_apply_jit_invalid_binary_ops_error( + groupby_jit_data_small, op +): + keys = ["key1"] + + def func(group): + return op(group["val1"], group["val2"]) + + with pytest.raises( + UDFError, + match=f"{op.__name__}\\(Series, Series\\) is not supported", + ): + run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) + + +def test_groupby_apply_jit_no_df_ops(groupby_jit_data_small): + # DataFrame level operations are not yet supported. 
+ def func(group): + return group.sum() + + with pytest.raises( + UDFError, + match="JIT GroupBy.apply\\(\\) does not support DataFrame.sum\\(\\)", + ): + run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1"]) + + +@pytest.mark.parametrize("dtype", ["uint8", "str"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_unsupported_dtype(dtype): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + df["b"] = df["b"].astype(dtype) + + # a UDAF that doesn't actually use the input column + # with the unsupported dtype should still succeed + def func(group): + return group["c"].sum() + + run_groupby_apply_jit_test(df, func, ["a"]) + + # however a UDAF that does use the unsupported dtype + # should fail + def func(group): + return group["b"].sum() + + with pytest.raises(UDFError, match="Only columns of the following dtypes"): + run_groupby_apply_jit_test(df, func, ["a"]) + + +@pytest.mark.parametrize( + "func", + [ + lambda df: df["val1"].max() + df["val2"].min(), + lambda df: df["val1"].sum() + df["val2"].var(), + lambda df: df["val1"].mean() + df["val2"].std(), + ], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_jit_basic(func, groupby_jit_data_small): + run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) + + +def f1(df, k): + return df["val1"].max() + df["val2"].min() + k + + +def f2(df, k, L): + return df["val1"].sum() - df["val2"].var() + (k / L) + + +def f3(df, k, L, m): + return ((k * df["val1"].mean()) + (L * df["val2"].std())) / m + + +@pytest.mark.parametrize( + "func,args", [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): + run_groupby_apply_jit_test( + groupby_jit_data_small, func, ["key1", "key2"], *args + ) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_jit_block_divergence(): + # https://github.com/rapidsai/cudf/issues/12686 + df = cudf.DataFrame( + { + "a": [0, 0, 0, 1, 1, 1], + "b": [1, 1, 1, 2, 3, 4], + } + ) + + def diverging_block(grp_df): + if grp_df["b"].mean() > 1: + return grp_df["b"].mean() + return 0 + + run_groupby_apply_jit_test(df, diverging_block, ["a"]) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_caching(): + # Make sure similar functions that differ + # by simple things like constants actually + # recompile + + # begin with a clear cache + precompiled.clear() + assert precompiled.currsize == 0 + + data = cudf.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 4, 5, 6]}) + + def f(group): + return group["b"].mean() * 2 + + # a single run should result in a cache size of 1 + run_groupby_apply_jit_test(data, f, ["a"]) + assert precompiled.currsize == 1 + + # a second run with f should not increase the count + run_groupby_apply_jit_test(data, f, ["a"]) + assert precompiled.currsize == 1 + + # changing a constant value inside the UDF should miss + def f(group): + return group["b"].mean() * 3 + + run_groupby_apply_jit_test(data, f, ["a"]) + assert precompiled.currsize == 2 + + # changing the dtypes of the columns 
should miss + data["b"] = data["b"].astype("float64") + run_groupby_apply_jit_test(data, f, ["a"]) + + assert precompiled.currsize == 3 + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_no_bytecode_fallback(): + # tests that a function which contains no bytecode + # attribute, but would still be executable using + # the iterative groupby apply approach, still works. + + gdf = cudf.DataFrame({"a": [0, 1, 1], "b": [1, 2, 3]}) + pdf = gdf.to_pandas() + + def f(group): + return group.sum() + + part = partial(f) + + expect = pdf.groupby("a").apply(part, include_groups=False) + got = gdf.groupby("a").apply(part, engine="auto", include_groups=False) + assert_groupby_results_equal(expect, got) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_return_col_from_df(): + # tests a UDF that consists of purely colwise + # ops, such as `lambda group: group.x + group.y` + # which returns a column + df = cudf.DataFrame( + { + "id": range(10), + "x": range(10), + "y": range(10), + } + ) + pdf = df.to_pandas() + + def func(df): + return df.x + df.y + + got = df.groupby("id").apply(func, include_groups=False) + expect = pdf.groupby("id").apply(func, include_groups=False) + # pandas seems to erroneously add an extra MI level of ids + # TODO: Figure out how pandas groupby.apply determines the columns + expect = pd.DataFrame(expect.droplevel(1), columns=got.columns) + assert_groupby_results_equal(expect, got) + + +@pytest.mark.parametrize("func", [lambda group: group.sum()]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_return_df(func): + # tests a UDF that reduces over a dataframe + # and produces a series with the original column names + # as its index, such as lambda group: group.sum() + group.min() + df = cudf.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, 4]}) + pdf = df.to_pandas() + + expect = pdf.groupby("a").apply(func, include_groups=False) + got = df.groupby("a").apply(func, include_groups=False) + assert_groupby_results_equal(expect, got) + + +@pytest.mark.parametrize("as_index", [True, False]) +def test_groupby_apply_return_reindexed_series(as_index): + def gdf_func(df): + return cudf.Series([df["a"].sum(), df["b"].min(), df["c"].max()]) + + def pdf_func(df): + return pd.Series([df["a"].sum(), df["b"].min(), df["c"].max()]) + + df = cudf.DataFrame( + { + "key": [0, 0, 1, 1, 2, 2], + "a": [1, 2, 3, 4, 5, 6], + "b": [7, 8, 9, 10, 11, 12], + "c": [13, 14, 15, 16, 17, 18], + } + ) + pdf = df.to_pandas() + + kwargs = {} + if PANDAS_GE_220: + kwargs["include_groups"] = False + + expect = pdf.groupby("key", as_index=as_index).apply(pdf_func, **kwargs) + got = df.groupby("key", as_index=as_index).apply(gdf_func, **kwargs) + assert_groupby_results_equal(expect, got) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) +def test_groupby_apply_noempty_group(): + pdf = pd.DataFrame( + {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} + ) + gdf = cudf.from_pandas(pdf) + + expect = ( + pdf.groupby("a", group_keys=False) + .apply(lambda x: x.iloc[[0, 1]], include_groups=False) + .reset_index(drop=True) + ) + got = ( + gdf.groupby("a") + .apply(lambda x: x.iloc[[0, 1]], include_groups=False) + .reset_index(drop=True) + ) 
+ assert_groupby_results_equal(expect, got) + + +def create_test_groupby_apply_return_scalars_params(): + def f0(x): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = ticker / 10 + return full + + def f1(x, k): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = ticker / k + return full + + def f2(x, k, L): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = L * (ticker / k) + return full + + def f3(x, k, L, m): + x = x[~x["B"].isna()] + ticker = x.shape[0] + full = L * (ticker / k) % m + return full + + return [(f0, ()), (f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] + + +@pytest.mark.parametrize( + "func,args", create_test_groupby_apply_return_scalars_params() +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_groupby_apply_return_scalars(func, args): + pdf = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], + "B": [ + 0.01, + np.nan, + 0.03, + 0.04, + np.nan, + 0.06, + 0.07, + 0.08, + 0.09, + 1.0, + ], + } + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby("A").apply(func, *args, include_groups=False) + actual = gdf.groupby("A").apply(func, *args, include_groups=False) + + assert_groupby_results_equal(expected, actual) + + +def create_test_groupby_apply_return_series_dataframe_params(): + def f0(x): + return x - x.max() + + def f1(x): + return x.min() - x.max() + + def f2(x): + return x.min() + + def f3(x, k): + return x - x.max() + k + + def f4(x, k, L): + return x.min() - x.max() + (k / L) + + def f5(x, k, L, m): + return m * x.min() + (k / L) + + return [ + (f0, ()), + (f1, ()), + (f2, ()), + (f3, (42,)), + (f4, (42, 119)), + (f5, (41, 119, 212.1)), + ] + + +@pytest.mark.parametrize( + "func,args", create_test_groupby_apply_return_series_dataframe_params() +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Include groups missing on old versions of pandas", +) +def test_groupby_apply_return_series_dataframe(func, args): + pdf = pd.DataFrame( + {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby(["key"], group_keys=False).apply( + func, *args, include_groups=False + ) + actual = gdf.groupby(["key"]).apply(func, *args, include_groups=False) + + assert_groupby_results_equal(expected, actual) + + +@pytest.mark.parametrize( + "pdf", + [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], +) +def test_groupby_apply_no_keys(pdf): + gdf = cudf.from_pandas(pdf) + if isinstance(pdf, pd.DataFrame): + kwargs = {"check_column_type": False} + else: + kwargs = {} + assert_groupby_results_equal( + pdf.groupby([], group_keys=False).apply(lambda x: x.max()), + gdf.groupby([]).apply(lambda x: x.max()), + check_index_type=False, # Int64 v/s Float64 + **kwargs, + ) + + +@pytest.mark.parametrize( + "data", + [ + {"Speed": [380.0, 370.0, 24.0, 26.0], "Score": [50, 30, 90, 80]}, + { + "Speed": [380.0, 370.0, 24.0, 26.0], + "Score": [50, 30, 90, 80], + "Other": [10, 20, 30, 40], + }, + ], +) +@pytest.mark.parametrize("group", ["Score", "Speed"]) +def test_groupby_describe(data, group): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + got = gdf.groupby(group).describe() + expect = pdf.groupby(group).describe() + + assert_groupby_results_equal(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [], "b": []}, + {"a": [2, 1, 2, 1, 1, 3], "b": [None, 1, 2, None, 2, None]}, + {"a": [None], "b": [None]}, + {"a": [2, 1, 1], "b": 
[None, 1, 0], "c": [None, 0, 1]}, + ], +) +@pytest.mark.parametrize("agg", ["first", "last", ["first", "last"]]) +def test_groupby_first(data, agg): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + expect = pdf.groupby("a").agg(agg) + got = gdf.groupby("a").agg(agg) + assert_groupby_results_equal(expect, got, check_dtype=False) + + +def test_groupby_apply_series(): + def foo(x): + return x.sum() + + pdf = pd.DataFrame( + { + "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "val": np.arange(10, dtype="float64"), + } + ) + gdf = cudf.from_pandas(pdf) + + got = gdf.groupby("x").y.apply(foo) + expect = pdf.groupby("x").y.apply(foo) + + assert_groupby_results_equal(expect, got) + + +@pytest.mark.parametrize( + "func,args", + [ + (lambda x, k: x + k, (42,)), + (lambda x, k, L: x + k - L, (42, 191)), + (lambda x, k, L, m: (x + k) / (L * m), (42, 191, 99.9)), + ], +) +def test_groupby_apply_series_args(func, args): + pdf = pd.DataFrame( + { + "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "y": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "val": np.arange(10, dtype="float64"), + } + ) + gdf = cudf.from_pandas(pdf) + + got = gdf.groupby("x").y.apply(func, *args) + expect = pdf.groupby("x", group_keys=False).y.apply(func, *args) + + assert_groupby_results_equal(expect, got) + + +@pytest.mark.parametrize("group_keys", [None, True, False]) +@pytest.mark.parametrize("by", ["A", ["A", "B"]]) +def test_groupby_group_keys(group_keys, by): + gdf = cudf.DataFrame( + { + "A": "a a a a b b".split(), + "B": [1, 1, 2, 2, 3, 3], + "C": [4, 6, 5, 9, 8, 7], + } + ) + pdf = gdf.to_pandas() + + g_group = gdf.groupby(by, group_keys=group_keys) + p_group = pdf.groupby(by, group_keys=group_keys) + + actual = g_group[["B", "C"]].apply(lambda x: x / x.sum()) + expected = p_group[["B", "C"]].apply(lambda x: x / x.sum()) + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "dtype", + ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"], +) +@pytest.mark.parametrize( + "apply_op", + ["sum", "min", "max", "idxmax"], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_group_by_empty_apply(request, dtype, apply_op): + request.applymarker( + pytest.mark.xfail( + condition=(dtype == "datetime64[ns]" and apply_op == "sum"), + reason=("sum isn't supported for datetime64[ns]"), + ) + ) + + gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) + pdf = gdf.to_pandas() + + gg = gdf.groupby("a")["c"] + pg = pdf.groupby("a")["c"] + + assert_eq( + gg.apply(apply_op), + pg.apply(apply_op), + check_dtype=True, + check_index_type=True, + ) diff --git a/python/cudf/cudf/tests/groupby/test_attributes.py b/python/cudf/cudf/tests/groupby/test_attributes.py index 16e6741229d..eb95d915e03 100644 --- a/python/cudf/cudf/tests/groupby/test_attributes.py +++ b/python/cudf/cudf/tests/groupby/test_attributes.py @@ -4,6 +4,10 @@ import pytest import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) from cudf.testing import assert_eq @@ -124,3 +128,48 @@ def test_grouping(grouper): ): assert pdf_group[0] == gdf_group[0] assert_eq(pdf_group[1], gdf_group[1]) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize( + "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] +) +def test_groupby_dtypes(groups): + df = cudf.DataFrame( + {"a": [1, 2, 3, 3], "b": 
["x", "y", "z", "a"], "c": [10, 11, 12, 12]} + ) + pdf = df.to_pandas() + with pytest.warns(FutureWarning): + expected = pdf.groupby(groups).dtypes + with pytest.warns(FutureWarning): + actual = df.groupby(groups).dtypes + + assert_eq(expected, actual) + + +def test_ngroups(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ngroups == ggb.ngroups + assert len(pgb) == len(ggb) + + +def test_ndim(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ndim == ggb.ndim + + pser = pd.Series(range(3)) + gser = cudf.Series.from_pandas(pser) + pgb = pser.groupby([0, 0, 1]) + ggb = gser.groupby(cudf.Series([0, 0, 1])) + assert pgb.ndim == ggb.ndim diff --git a/python/cudf/cudf/tests/groupby/test_cummulative.py b/python/cudf/cudf/tests/groupby/test_cummulative.py index 1eab8a1b317..0149fe7a008 100644 --- a/python/cudf/cudf/tests/groupby/test_cummulative.py +++ b/python/cudf/cudf/tests/groupby/test_cummulative.py @@ -4,7 +4,7 @@ import pytest import cudf -from cudf.testing import assert_groupby_results_equal +from cudf.testing import assert_eq, assert_groupby_results_equal @pytest.mark.parametrize("index", [None, [1, 2, 3, 4]]) @@ -93,3 +93,14 @@ def test_groupby_scan_null_keys(with_nan, dropna, duplicate_index): expect = df.groupby("key", dropna=dropna).cumsum() got = cdf.groupby("key", dropna=dropna).cumsum() assert_groupby_results_equal(expect, got) + + +@pytest.mark.parametrize("op", ["cumsum", "cumprod", "cummin", "cummax"]) +def test_scan_int_null_pandas_compatible(op): + data = {"a": [1, 2, None, 3], "b": ["x"] * 4} + df_pd = pd.DataFrame(data) + df_cudf = cudf.DataFrame(data) + expected = getattr(df_pd.groupby("b")["a"], op)() + with cudf.option_context("mode.pandas_compatible", True): + result = getattr(df_cudf.groupby("b")["a"], op)() + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/groupby/test_diff.py b/python/cudf/cudf/tests/groupby/test_diff.py index a9e60e0f4b5..43d46f9feec 100644 --- a/python/cudf/cudf/tests/groupby/test_diff.py +++ b/python/cudf/cudf/tests/groupby/test_diff.py @@ -192,3 +192,15 @@ def test_groupby_fillna_multi_value_df(): got = gdf.groupby(key_col).fillna(value=fill_values) assert_groupby_results_equal(expect[value_cols], got[value_cols]) + + +def test_groupby_select_then_diff(): + pdf = pd.DataFrame( + {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5], "c": [3, 4, 5, 6, 7]} + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby("a")["c"].diff(1) + actual = gdf.groupby("a")["c"].diff(1) + + assert_groupby_results_equal(expected, actual) diff --git a/python/cudf/cudf/tests/groupby/test_ffill.py b/python/cudf/cudf/tests/groupby/test_ffill.py new file mode 100644 index 00000000000..60fdd265b38 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_ffill.py @@ -0,0 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd + +import cudf +from cudf.testing import assert_groupby_results_equal + + +def test_groupby_select_then_ffill(): + pdf = pd.DataFrame( + { + "a": [1, 1, 1, 2, 2], + "b": [1, None, None, 2, None], + "c": [3, None, None, 4, None], + } + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby("a")["c"].ffill() + actual = gdf.groupby("a")["c"].ffill() + + assert_groupby_results_equal(expected, actual) diff --git a/python/cudf/cudf/tests/groupby/test_get_group.py b/python/cudf/cudf/tests/groupby/test_get_group.py index 43b6183fca5..6fb62c63b09 100644 --- a/python/cudf/cudf/tests/groupby/test_get_group.py +++ b/python/cudf/cudf/tests/groupby/test_get_group.py @@ -1,6 +1,16 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + import cudf -from cudf.testing import assert_eq +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) +from cudf.testing import assert_eq, assert_groupby_results_equal +from cudf.testing._utils import expect_warning_if def test_rank_return_type_compatible_mode(): @@ -10,3 +20,88 @@ def test_rank_return_type_compatible_mode(): expect = pdf.groupby("b").get_group(0) result = df.groupby("b").get_group(0) assert_eq(expect, result) + + +@pytest.mark.parametrize( + "pdf, group, name, obj", + [ + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "X", + "A", + None, + ), + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "X", + "B", + None, + ), + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "X", + "A", + pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}), + ), + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "Y", + 1, + pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}), + ), + ( + pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), + "Y", + 3, + pd.DataFrame({"a": [1, 2, 0, 11]}), + ), + ], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warnings only given on newer versions.", +) +def test_groupby_get_group(pdf, group, name, obj): + gdf = cudf.from_pandas(pdf) + + if isinstance(obj, pd.DataFrame): + gobj = cudf.from_pandas(obj) + else: + gobj = obj + + pgb = pdf.groupby(group) + ggb = gdf.groupby(group) + with expect_warning_if(obj is not None): + expected = pgb.get_group(name=name, obj=obj) + with expect_warning_if(obj is not None): + actual = ggb.get_group(name=name, obj=gobj) + + assert_groupby_results_equal(expected, actual) + + expected = pdf.iloc[pgb.indices.get(name)] + actual = gdf.iloc[ggb.indices.get(name)] + + assert_eq(expected, actual) + + +@pytest.mark.skipif( + not PANDAS_GE_220, reason="pandas behavior applicable in >=2.2" +) +def test_get_group_list_like(): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df.groupby(["a"]).get_group((1,)) + expected = df.to_pandas().groupby(["a"]).get_group((1,)) + assert_eq(result, expected) + + with pytest.raises(KeyError): + df.groupby(["a"]).get_group((1, 2)) + + with pytest.raises(KeyError): + df.groupby(["a"]).get_group([1]) + + +def test_get_group_list_like_len_2(): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [3, 2, 1]}) + result = df.groupby(["a", "b"]).get_group((1, 4)) + expected = df.to_pandas().groupby(["a", "b"]).get_group((1, 4)) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/groupby/test_head_tail.py b/python/cudf/cudf/tests/groupby/test_head_tail.py new file mode 100644 index 
00000000000..6457efb043e --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_head_tail.py @@ -0,0 +1,106 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import itertools +import operator + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture(params=[-3, -1, 0, 1, 2]) +def n(request): + return request.param + + +@pytest.fixture( + params=[False, True], ids=["no-preserve-order", "preserve-order"] +) +def preserve_order(request): + return request.param + + +@pytest.fixture +def df(): + return cudf.DataFrame( + { + "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3], + "b": [0, 1, 2, 4, 3, 5, 6, 7, 9, 8, 10], + } + ) + + +@pytest.fixture(params=[True, False], ids=["head", "tail"]) +def take_head(request): + return request.param + + +@pytest.fixture +def expected(df, n, take_head, preserve_order): + if n == 0: + # We'll get an empty dataframe in this case + return df._empty_like(keep_index=True) + else: + if preserve_order: + # Should match pandas here + g = df.to_pandas().groupby("a") + if take_head: + return g.head(n=n) + else: + return g.tail(n=n) + else: + # We groupby "a" which is the first column. This + # possibly relies on an implementation detail that for + # integer group keys, cudf produces groups in sorted + # (ascending) order. + keyfunc = operator.itemgetter(0) + if take_head or n == 0: + # Head does group[:n] as does tail for n == 0 + slicefunc = operator.itemgetter(slice(None, n)) + else: + # Tail does group[-n:] except when n == 0 + slicefunc = operator.itemgetter( + slice(-n, None) if n else slice(0) + ) + values_to_sort = np.hstack( + [df.values_host, np.arange(len(df)).reshape(-1, 1)] + ) + expect_a, expect_b, index = zip( + *itertools.chain.from_iterable( + slicefunc(list(group)) + for _, group in itertools.groupby( + sorted(values_to_sort.tolist(), key=keyfunc), + key=keyfunc, + ) + ), + strict=True, + ) + return cudf.DataFrame({"a": expect_a, "b": expect_b}, index=index) + + +def test_head_tail(df, n, take_head, expected, preserve_order): + if take_head: + actual = df.groupby("a").head(n=n, preserve_order=preserve_order) + else: + actual = df.groupby("a").tail(n=n, preserve_order=preserve_order) + assert_eq(actual, expected) + + +def test_head_tail_empty(): + # GH #13397 + + values = [1, 2, 3] + pdf = pd.DataFrame({}, index=values) + df = cudf.DataFrame({}, index=values) + + expected = pdf.groupby(pd.Series(values)).head() + got = df.groupby(cudf.Series(values)).head() + assert_eq(expected, got, check_column_type=False) + + expected = pdf.groupby(pd.Series(values)).tail() + got = df.groupby(cudf.Series(values)).tail() + + assert_eq(expected, got, check_column_type=False) diff --git a/python/cudf/cudf/tests/groupby/test_ngroup.py b/python/cudf/cudf/tests/groupby/test_ngroup.py new file mode 100644 index 00000000000..4125eb01abd --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_ngroup.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "by",
+    [
+        lambda: "a",
+        lambda: "b",
+        lambda: ["a", "b"],
+        lambda: "c",
+        lambda: pd.Series([1, 2, 1, 2, 1, 2]),
+        lambda: pd.Series(["x", "y", "y", "x", "z", "x"]),
+    ],
+)
+def test_groupby_ngroup(by, ascending):
+    df_ngroup = cudf.DataFrame(
+        {
+            "a": [2, 2, 1, 1, 2, 3],
+            "b": [1, 2, 1, 2, 1, 2],
+            "c": ["a", "a", "b", "c", "d", "c"],
+        },
+        index=[1, 3, 5, 7, 4, 2],
+    )
+    df_ngroup.index.name = "foo"
+    by = by()  # materialize "by"; "ascending" is a shared conftest fixture
+    expected = df_ngroup.to_pandas().groupby(by).ngroup(ascending=ascending)
+    actual = df_ngroup.groupby(by).ngroup(ascending=ascending)
+    assert_eq(expected, actual, check_dtype=False)
diff --git a/python/cudf/cudf/tests/groupby/test_nth.py b/python/cudf/cudf/tests/groupby/test_nth.py
index 1fb9d32f535..fd93cc2e208 100644
--- a/python/cudf/cudf/tests/groupby/test_nth.py
+++ b/python/cudf/cudf/tests/groupby/test_nth.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -23,3 +24,37 @@ def test_groupby_nth(n, by):
 
     got = gdf.groupby(by).nth(n)
     assert_groupby_results_equal(expect, got, check_dtype=False)
+
+
+def test_groupby_consecutive_operations():
+    df = cudf.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
+    pdf = df.to_pandas()
+
+    gg = df.groupby("A")
+    pg = pdf.groupby("A")
+
+    actual = gg.nth(-1)
+    expected = pg.nth(-1)
+
+    assert_groupby_results_equal(actual, expected, check_dtype=False)
+
+    actual = gg.nth(0)
+    expected = pg.nth(0)
+
+    assert_groupby_results_equal(actual, expected, check_dtype=False)
+
+    actual = gg.cumsum()
+    expected = pg.cumsum()
+
+    assert_groupby_results_equal(actual, expected, check_dtype=False)
+
+    actual = gg.cumcount()
+    expected = pg.cumcount()
+
+    assert_groupby_results_equal(actual, expected, check_dtype=False)
+
+    # Repeat cumsum to verify the reused GroupBy object still agrees.
+    actual = gg.cumsum()
+    expected = pg.cumsum()
+
+    assert_groupby_results_equal(actual, expected, check_dtype=False)
diff --git a/python/cudf/cudf/tests/groupby/test_pct_change.py b/python/cudf/cudf/tests/groupby/test_pct_change.py
new file mode 100644
index 00000000000..a2335afc1ba
--- /dev/null
+++ b/python/cudf/cudf/tests/groupby/test_pct_change.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
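+# Tests for GroupBy.pct_change. The `no_default` sentinel marks
+# "fill_method not passed"; explicit "ffill"/"bfill" triggers a FutureWarning.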
+ + +import pytest + +import cudf +from cudf.api.extensions import no_default +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import ( + expect_warning_if, +) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize( + "data, gkey", + [ + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id"], + ), + ( + { + "id": [0, 0, 0, 0, 1, 1, 1], + "a": [1, 3, 4, 2.0, -3.0, 9.0, 10.0], + "b": [10.0, 23, -4.0, 2, -3.0, None, 19.0], + }, + ["id", "a"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val1": [None, None, None, None, None, None], + }, + ["id"], + ), + ], +) +@pytest.mark.parametrize("periods", [-2, 0, 5]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", no_default, None]) +def test_groupby_pct_change(data, gkey, periods, fill_method): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + with expect_warning_if(fill_method not in (no_default, None)): + actual = gdf.groupby(gkey).pct_change( + periods=periods, fill_method=fill_method + ) + with expect_warning_if( + ( + fill_method not in (no_default, None) + or (fill_method is not None and pdf.isna().any().any()) + ) + ): + expected = pdf.groupby(gkey).pct_change( + periods=periods, fill_method=fill_method + ) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("periods", [-5, 5]) +def test_groupby_pct_change_multiindex_dataframe(periods): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2, 2], + "b": [1, 1, 2, 3], + "c": [2, 3, 4, 5], + "d": [6, 8, 9, 1], + } + ).set_index(["a", "b"]) + + actual = gdf.groupby(level=["a", "b"]).pct_change(periods) + expected = gdf.to_pandas().groupby(level=["a", "b"]).pct_change(periods) + + assert_eq(expected, actual) + + +def test_groupby_pct_change_empty_columns(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + + actual = gdf.groupby("id").pct_change() + expected = pdf.groupby("id").pct_change() + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py b/python/cudf/cudf/tests/groupby/test_reductions.py index adbc5af309f..f54c0a79337 100644 --- a/python/cudf/cudf/tests/groupby/test_reductions.py +++ b/python/cudf/cudf/tests/groupby/test_reductions.py @@ -5,7 +5,11 @@ import pytest import cudf -from cudf.testing import assert_groupby_results_equal +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_eq, assert_groupby_results_equal from cudf.testing._utils import assert_exceptions_equal @@ -676,3 +680,308 @@ def test_groupby_no_keys(pdf): check_index_type=False, # Int64 v/s Float64 **kwargs, ) + + +@pytest.mark.parametrize("label", [None, "left", "right"]) +@pytest.mark.parametrize("closed", [None, "left", "right"]) +def test_groupby_freq_week(label, closed): + pdf = pd.DataFrame( + { + "Publish date": [ + pd.Timestamp("2000-01-03"), + pd.Timestamp("2000-01-01"), + pd.Timestamp("2000-01-09"), + pd.Timestamp("2000-01-02"), + pd.Timestamp("2000-01-07"), + pd.Timestamp("2000-01-16"), + ], + "ID": [0, 1, 2, 3, 4, 5], + "Price": [10, 20, 30, 40, 50, 60], + } + ) + gdf = cudf.from_pandas(pdf) + expect = pdf.groupby( + pd.Grouper(key="Publish date", freq="1W", label=label, closed=closed) + ).mean() + got = 
gdf.groupby( + cudf.Grouper(key="Publish date", freq="1W", label=label, closed=closed) + ).mean() + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize("label", [None, "left", "right"]) +@pytest.mark.parametrize("closed", [None, "left", "right"]) +def test_groupby_freq_day(label, closed): + pdf = pd.DataFrame( + { + "Publish date": [ + pd.Timestamp("2000-01-03"), + pd.Timestamp("2000-01-01"), + pd.Timestamp("2000-01-09"), + pd.Timestamp("2000-01-02"), + pd.Timestamp("2000-01-07"), + pd.Timestamp("2000-01-16"), + ], + "ID": [0, 1, 2, 3, 4, 5], + "Price": [10, 20, 30, 40, 50, 60], + } + ) + gdf = cudf.from_pandas(pdf) + expect = pdf.groupby( + pd.Grouper(key="Publish date", freq="3D", label=label, closed=closed) + ).mean() + got = gdf.groupby( + cudf.Grouper(key="Publish date", freq="3D", label=label, closed=closed) + ).mean() + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize("label", [None, "left", "right"]) +@pytest.mark.parametrize("closed", [None, "left", "right"]) +def test_groupby_freq_min(label, closed): + pdf = pd.DataFrame( + { + "Publish date": [ + pd.Timestamp("2000-01-01 12:01:00"), + pd.Timestamp("2000-01-01 12:05:00"), + pd.Timestamp("2000-01-01 15:30:00"), + pd.Timestamp("2000-01-02 00:00:00"), + pd.Timestamp("2000-01-01 23:47:00"), + pd.Timestamp("2000-01-02 00:05:00"), + ], + "ID": [0, 1, 2, 3, 4, 5], + "Price": [10, 20, 30, 40, 50, 60], + } + ) + gdf = cudf.from_pandas(pdf) + expect = pdf.groupby( + pd.Grouper(key="Publish date", freq="1h", label=label, closed=closed) + ).mean() + got = gdf.groupby( + cudf.Grouper(key="Publish date", freq="1h", label=label, closed=closed) + ).mean() + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize("label", [None, "left", "right"]) +@pytest.mark.parametrize("closed", [None, "left", "right"]) +def test_groupby_freq_s(label, closed): + pdf = pd.DataFrame( + { + "Publish date": [ + pd.Timestamp("2000-01-01 00:00:02"), + pd.Timestamp("2000-01-01 00:00:07"), + pd.Timestamp("2000-01-01 00:00:02"), + pd.Timestamp("2000-01-02 00:00:15"), + pd.Timestamp("2000-01-01 00:00:05"), + pd.Timestamp("2000-01-02 00:00:09"), + ], + "ID": [0, 1, 2, 3, 4, 5], + "Price": [10, 20, 30, 40, 50, 60], + } + ) + gdf = cudf.from_pandas(pdf) + expect = pdf.groupby( + pd.Grouper(key="Publish date", freq="3s", label=label, closed=closed) + ).mean() + got = gdf.groupby( + cudf.Grouper(key="Publish date", freq="3s", label=label, closed=closed) + ).mean() + assert_eq( + expect, + got, + check_like=True, + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize("index_names", ["a", "b", "c", ["b", "c"]]) +def test_groupby_by_index_names(index_names): + gdf = cudf.DataFrame( + {"a": [1, 2, 3, 4], "b": ["a", "b", "a", "a"], "c": [1, 1, 2, 1]} + ).set_index(index_names) + pdf = gdf.to_pandas() + + assert_groupby_results_equal( + pdf.groupby(index_names).min(), gdf.groupby(index_names).min() + ) + + +@pytest.mark.parametrize( + "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] +) +def test_group_by_pandas_compat(groups): + with cudf.option_context("mode.pandas_compatible", True): + df = cudf.DataFrame( + { + "a": [1, 3, 2, 3, 3], + "b": ["x", "a", "y", "z", "a"], + "c": [10, 13, 11, 12, 12], + } + ) + pdf = df.to_pandas() + + assert_eq(pdf.groupby(groups).max(), df.groupby(groups).max()) + + +@pytest.mark.parametrize( + 
"groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] +) +@pytest.mark.parametrize("sort", [True, False]) +def test_group_by_pandas_sort_order(groups, sort): + with cudf.option_context("mode.pandas_compatible", True): + df = cudf.DataFrame( + { + "a": [10, 1, 10, 3, 2, 1, 3, 3], + "b": [5, 6, 7, 1, 2, 3, 4, 9], + "c": [20, 20, 10, 11, 13, 11, 12, 12], + } + ) + pdf = df.to_pandas() + + assert_eq( + pdf.groupby(groups, sort=sort).sum(), + df.groupby(groups, sort=sort).sum(), + ) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_group_by_empty_reduction( + all_supported_types_as_str, groupby_reduction_methods, request +): + request.applymarker( + pytest.mark.xfail( + condition=all_supported_types_as_str == "category" + and groupby_reduction_methods + in {"min", "max", "idxmin", "idxmax", "first", "last"}, + reason=f"cuDF doesn't support {groupby_reduction_methods} on {all_supported_types_as_str}", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition=all_supported_types_as_str == "str" + and groupby_reduction_methods in {"idxmin", "idxmax"}, + reason=f"cuDF doesn't support {groupby_reduction_methods} on {all_supported_types_as_str}", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition="int" in all_supported_types_as_str + and groupby_reduction_methods == "mean", + reason=f"{all_supported_types_as_str} returns incorrect result type with {groupby_reduction_methods}", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition="timedelta" in all_supported_types_as_str + and groupby_reduction_methods == "prod", + raises=RuntimeError, + reason=f"{all_supported_types_as_str} raises libcudf error with {groupby_reduction_methods}", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition="datetime" in all_supported_types_as_str + and groupby_reduction_methods in {"mean", "prod", "sum"}, + raises=RuntimeError, + reason=f"{all_supported_types_as_str} raises libcudf error with {groupby_reduction_methods}", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition=all_supported_types_as_str in {"str", "category"} + and groupby_reduction_methods in {"sum", "prod", "mean"}, + raises=TypeError, + reason=f"{all_supported_types_as_str} raises TypeError with {groupby_reduction_methods}", + ) + ) + request.applymarker( + pytest.mark.xfail( + condition=all_supported_types_as_str == "bool" + and groupby_reduction_methods in {"sum", "prod", "mean"}, + reason=f"{all_supported_types_as_str} returns incorrect result type with {groupby_reduction_methods}", + ) + ) + gdf = cudf.DataFrame( + {"a": [], "b": [], "c": []}, dtype=all_supported_types_as_str + ) + pdf = gdf.to_pandas() + + gg = gdf.groupby("a")["c"] + pg = pdf.groupby("a", observed=True)["c"] + + assert_eq( + getattr(gg, groupby_reduction_methods)(), + getattr(pg, groupby_reduction_methods)(), + check_dtype=True, + ) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning only given on newer versions.", +) +def test_categorical_grouping_pandas_compatibility(): + gdf = cudf.DataFrame( + { + "key": cudf.Series([2, 1, 3, 1, 1], dtype="category"), + "a": [0, 1, 3, 2, 3], + } + ) + pdf = gdf.to_pandas() + + with cudf.option_context("mode.pandas_compatible", True): + actual = gdf.groupby("key", sort=False).sum() + with pytest.warns(FutureWarning): + # observed param deprecation. 
+ expected = pdf.groupby("key", sort=False).sum() + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "by,data", + [ + ("a", {"a": [1, 2, 3]}), + (["a", "id"], {"id": [0, 0, 1], "a": [1, 2, 3]}), + ("a", {"a": [1, 2, 3], "b": ["A", "B", "C"]}), + ("id", {"id": [0, 0, 1], "a": [1, 2, 3], "b": ["A", "B", "C"]}), + (["b", "id"], {"id": [0, 0, 1], "b": ["A", "B", "C"]}), + ("b", {"b": ["A", "B", "C"]}), + ], +) +def test_group_by_reduce_numeric_only(by, data, groupby_reduction_methods): + # Test that simple groupby reductions support numeric_only=True + if groupby_reduction_methods == "count": + pytest.skip( + f"{groupby_reduction_methods} doesn't support numeric_only" + ) + df = cudf.DataFrame(data) + expected = getattr( + df.to_pandas().groupby(by, sort=True), groupby_reduction_methods + )(numeric_only=True) + result = getattr(df.groupby(by, sort=True), groupby_reduction_methods)( + numeric_only=True + ) + assert_eq(expected, result) diff --git a/python/cudf/cudf/tests/groupby/test_sample.py b/python/cudf/cudf/tests/groupby/test_sample.py new file mode 100644 index 00000000000..66104e81396 --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_sample.py @@ -0,0 +1,101 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import collections +import itertools +import string + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.fixture(params=["default", "rangeindex", "intindex", "strindex"]) +def index(request): + n = 12 + if request.param == "rangeindex": + return cudf.RangeIndex(2, n + 2) + elif request.param == "intindex": + return cudf.Index( + [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32" + ) + elif request.param == "strindex": + return cudf.Index(list(string.ascii_lowercase[:n])) + elif request.param == "default": + return None + + +@pytest.fixture( + params=[ + ["a", "a", "b", "b", "c", "c", "c", "d", "d", "d", "d", "d"], + [1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4], + ], + ids=["str-group", "int-group"], +) +def df(index, request): + return cudf.DataFrame( + {"a": request.param, "b": request.param, "v": request.param}, + index=index, + ) + + +@pytest.fixture(params=["a", ["a", "b"]], ids=["single-col", "two-col"]) +def by(request): + return request.param + + +def expected(df, *, n=None, frac=None): + value_counts = collections.Counter(df.a.values_host) + if n is not None: + values = list( + itertools.chain.from_iterable( + itertools.repeat(v, n) for v in value_counts.keys() + ) + ) + elif frac is not None: + values = list( + itertools.chain.from_iterable( + itertools.repeat(v, round(count * frac)) + for v, count in value_counts.items() + ) + ) + else: + raise ValueError("Must provide either n or frac") + values = cudf.Series(sorted(values), dtype=df.a.dtype) + return cudf.DataFrame({"a": values, "b": values, "v": values}) + + +@pytest.mark.parametrize("n", [None, 0, 1, 2]) +def test_constant_n_no_replace(df, by, n): + result = df.groupby(by).sample(n=n).sort_values("a") + n = 1 if n is None else n + assert_eq(expected(df, n=n), result.reset_index(drop=True)) + + +def test_constant_n_no_replace_too_large_raises(df): + with pytest.raises(ValueError): + df.groupby("a").sample(n=3) + + +@pytest.mark.parametrize("n", [1, 2, 3]) +def test_constant_n_replace(df, by, n): + result = df.groupby(by).sample(n=n, replace=True).sort_values("a") + assert_eq(expected(df, n=n), result.reset_index(drop=True)) + + +def test_invalid_arguments(df): + with pytest.raises(ValueError): + df.groupby("a").sample(n=1, frac=0.1) + + +def test_not_implemented_arguments(df): + 
with pytest.raises(NotImplementedError): + # These are valid weights, but we don't implement this yet. + df.groupby("a").sample(n=1, weights=[1 / len(df)] * len(df)) + + +@pytest.mark.parametrize("frac", [0, 1 / 3, 1 / 2, 2 / 3, 1]) +@pytest.mark.parametrize("replace", [False, True]) +def test_fraction_rounding(df, by, frac, replace): + result = df.groupby(by).sample(frac=frac, replace=replace).sort_values("a") + assert_eq(expected(df, frac=frac), result.reset_index(drop=True)) diff --git a/python/cudf/cudf/tests/groupby/test_shift.py b/python/cudf/cudf/tests/groupby/test_shift.py index edb48826138..47a5a2b16e8 100644 --- a/python/cudf/cudf/tests/groupby/test_shift.py +++ b/python/cudf/cudf/tests/groupby/test_shift.py @@ -6,7 +6,7 @@ import pytest import cudf -from cudf.testing import assert_groupby_results_equal +from cudf.testing import assert_eq, assert_groupby_results_equal from cudf.testing.dataset_generator import rand_dataframe @@ -202,3 +202,25 @@ def test_groupby_shift_row_zero_shift(fill_value): assert_groupby_results_equal( expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] ) + + +def test_groupby_select_then_shift(): + pdf = pd.DataFrame( + {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5], "c": [3, 4, 5, 6, 7]} + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.groupby("a")["c"].shift(1) + actual = gdf.groupby("a")["c"].shift(1) + + assert_groupby_results_equal(expected, actual) + + +def test_groupby_shift_series_multiindex(): + idx = cudf.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["f", "s"] + ) + ser = cudf.Series(range(4), index=idx) + result = ser.groupby(level=0).shift(1) + expected = ser.to_pandas().groupby(level=0).shift(1) + assert_eq(expected, result) diff --git a/python/cudf/cudf/tests/groupby/test_size.py b/python/cudf/cudf/tests/groupby/test_size.py new file mode 100644 index 00000000000..3b83f1e68bc --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_size.py @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import pandas as pd + +import cudf +from cudf.testing import assert_groupby_results_equal + + +def test_size_as_index_false(): + df = pd.DataFrame({"a": [1, 2, 1], "b": [1, 2, 3]}, columns=["a", "b"]) + expected = df.groupby("a", as_index=False).size() + result = cudf.from_pandas(df).groupby("a", as_index=False).size() + assert_groupby_results_equal(result, expected, as_index=False, by="a") + + +def test_size_series_with_name(): + ser = pd.Series(range(3), name="foo") + expected = ser.groupby(ser).size() + result = cudf.from_pandas(ser).groupby(ser).size() + assert_groupby_results_equal(result, expected) diff --git a/python/cudf/cudf/tests/groupby/test_transform.py b/python/cudf/cudf/tests/groupby/test_transform.py index f7138036ddf..4c007393a64 100644 --- a/python/cudf/cudf/tests/groupby/test_transform.py +++ b/python/cudf/cudf/tests/groupby/test_transform.py @@ -1,10 +1,11 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
 import itertools
 
+import pandas as pd
 import pytest
 
 import cudf
-from cudf.testing import assert_eq
+from cudf.testing import assert_eq, assert_groupby_results_equal
 
 
@@ -41,3 +42,37 @@ def test_transform_invalid():
     df = cudf.DataFrame({"key": [1, 1], "values": [4, 5]})
     with pytest.raises(TypeError):
         df.groupby("key").transform({"values": "cumprod"})
+
+
+@pytest.mark.parametrize(
+    "by",
+    [
+        "a",
+        ["a", "b"],
+        pd.Series([2, 1, 1, 2, 2]),
+        pd.Series(["b", "a", "a", "b", "b"]),
+    ],
+)
+@pytest.mark.parametrize("agg", ["sum", lambda df: df.mean()])
+def test_groupby_transform_aggregation(by, agg):
+    gdf = cudf.DataFrame(
+        {"a": [2, 2, 1, 2, 1], "b": [1, 1, 1, 2, 2], "c": [1, 2, 3, 4, 5]}
+    )
+    pdf = gdf.to_pandas()
+
+    expected = pdf.groupby(by).transform(agg)
+    actual = gdf.groupby(by).transform(agg)
+
+    assert_groupby_results_equal(expected, actual)
+
+
+@pytest.mark.parametrize("by", ["a", ["a", "b"], pd.Series([1, 2, 1, 3])])
+def test_groupby_transform_maintain_index(by):
+    # test that we maintain the index after a groupby transform
+    gdf = cudf.DataFrame(
+        {"a": [1, 1, 1, 2], "b": [1, 2, 1, 2]}, index=[3, 2, 1, 0]
+    )
+    pdf = gdf.to_pandas()
+    assert_groupby_results_equal(
+        pdf.groupby(by).transform("max"), gdf.groupby(by).transform("max")
+    )
diff --git a/python/cudf/cudf/tests/groupby/test_value_counts.py b/python/cudf/cudf/tests/groupby/test_value_counts.py
new file mode 100644
index 00000000000..4534d150287
--- /dev/null
+++ b/python/cudf/cudf/tests/groupby/test_value_counts.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
+
+
+import numpy as np
+import pytest
+
+import cudf
+from cudf.testing import assert_groupby_results_equal
+
+
+@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index):
+    # From Issue#12789
+    df = cudf.DataFrame(
+        {
+            "gender": ["male", "male", "female", "male", "female", "male"],
+            "education": ["low", "medium", np.nan, "low", "high", "low"],
+            "country": ["US", "FR", "US", "FR", "FR", "FR"],
+        }
+    )
+    pdf = df.to_pandas()
+
+    actual = df.groupby("gender", as_index=as_index).value_counts(
+        normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
+    )
+    expected = pdf.groupby("gender", as_index=as_index).value_counts(
+        normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
+    )
+
+    # Index types can differ here, so skip the index-type check.
+    assert_groupby_results_equal(
+        actual,
+        expected,
+        check_index_type=False,
+        as_index=as_index,
+        by=["gender", "education"],
+        sort=sort,
+    )
+
+
+def test_group_by_value_counts_subset():
+    # From Issue#12789
+    df = cudf.DataFrame(
+        {
+            "gender": ["male", "male", "female", "male", "female", "male"],
+            "education": ["low", "medium", "high", "low", "high", "low"],
+            "country": ["US", "FR", "US", "FR", "FR", "FR"],
+        }
+    )
+    pdf = df.to_pandas()
+
+    actual = df.groupby("gender").value_counts(["education"])
+    expected = pdf.groupby("gender").value_counts(["education"])
+
+    # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0`
+    assert_groupby_results_equal(
+        actual, expected, check_names=False, check_index_type=False
+    )
+
+
+def test_group_by_value_counts_clash_with_subset():
+    df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]})
+    with pytest.raises(ValueError):
+        df.groupby("a").value_counts(["a"])
+
+
+def 
test_group_by_value_counts_subset_not_exists(): + df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a").value_counts(["c"]) + + +def test_group_by_value_counts_with_count_column(): + df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]}) + with pytest.raises(ValueError): + df.groupby("a", as_index=False).value_counts() diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py deleted file mode 100644 index 5cab96d3db9..00000000000 --- a/python/cudf/cudf/tests/test_groupby.py +++ /dev/null @@ -1,1917 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. - -import collections -import itertools -import operator -import string -import textwrap -from functools import partial - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import DataFrame, Series -from cudf.api.extensions import no_default -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops -from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES -from cudf.core.udf.utils import UDFError, precompiled -from cudf.testing import assert_eq -from cudf.testing._utils import ( - expect_warning_if, -) - -_now = np.datetime64("now") -_tomorrow = _now + np.timedelta64(1, "D") -_now = np.int64(_now.astype("datetime64[ns]")) -_tomorrow = np.int64(_tomorrow.astype("datetime64[ns]")) -_index_type_aggs = {"count", "idxmin", "idxmax", "cumcount"} - - -def assert_groupby_results_equal( - expect, got, sort=True, as_index=True, by=None, **kwargs -): - # Because we don't sort by index by default in groupby, - # sort expect and got by index before comparing. - if sort: - if as_index: - expect = expect.sort_index() - got = got.sort_index() - else: - assert by is not None - if isinstance(expect, (pd.DataFrame, cudf.DataFrame)): - expect = expect.sort_values(by=by).reset_index(drop=True) - else: - expect = expect.sort_values(by=by).reset_index(drop=True) - - if isinstance(got, cudf.DataFrame): - got = got.sort_values(by=by).reset_index(drop=True) - else: - got = got.sort_values(by=by).reset_index(drop=True) - - assert_eq(expect, got, **kwargs) - - -def make_frame( - dataframe_class, - nelem, - seed=0, - extra_levels=(), - extra_vals=(), - with_datetime=False, -): - rng = np.random.default_rng(seed=seed) - - df = dataframe_class() - - df["x"] = rng.integers(0, 5, nelem) - df["y"] = rng.integers(0, 3, nelem) - for lvl in extra_levels: - df[lvl] = rng.integers(0, 2, nelem) - - df["val"] = rng.random(nelem) - for val in extra_vals: - df[val] = rng.random(nelem) - - if with_datetime: - df["datetime"] = rng.integers( - _now, _tomorrow, nelem, dtype=np.int64 - ).astype("datetime64[ns]") - - return df - - -@pytest.fixture -def gdf(): - return DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) - - -@pytest.fixture -def pdf(gdf): - return gdf.to_pandas() - - -@pytest.fixture(scope="module") -def groupby_jit_data_small(): - """ - Return a small dataset for testing JIT Groupby Apply. The dataframe - contains 4 groups of size 1, 2, 3, 4 as well as an additional key - column that can be used to test subgroups within groups. 
This data - is useful for smoke testing basic numeric results - """ - rng = np.random.default_rng(42) - df = DataFrame() - key1 = [1] + [2] * 2 + [3] * 3 + [4] * 4 - key2 = [1, 2] * 5 - df["key1"] = key1 - df["key2"] = key2 - - df["val1"] = rng.integers(0, 10, len(key1)) - df["val2"] = rng.integers(0, 10, len(key1)) - - # randomly permute data - df = df.sample(frac=1, ignore_index=True) - return df - - -@pytest.fixture(scope="module") -def groupby_jit_data_large(groupby_jit_data_small): - """ - Larger version of groupby_jit_data_small which contains enough data - to require more than one block per group. This data is useful for - testing if JIT GroupBy algorithms scale to larger dastasets without - manifesting numerical issues such as overflow. - """ - max_tpb = 1024 - factor = ( - max_tpb + 1 - ) # bigger than a block but not always an exact multiple - df = cudf.concat([groupby_jit_data_small] * factor) - - return df - - -@pytest.fixture(scope="module") -def groupby_jit_data_nans(groupby_jit_data_small): - """ - Returns a modified version of groupby_jit_data_small which contains - nan values. - """ - - df = groupby_jit_data_small.sort_values(["key1", "key2"]) - df["val1"] = df["val1"].astype("float64") - df["val1"][::2] = np.nan - df = df.sample(frac=1, ignore_index=True) - return df - - -@pytest.fixture(scope="module") -def groupby_jit_datasets( - groupby_jit_data_small, groupby_jit_data_large, groupby_jit_data_nans -): - return { - "small": groupby_jit_data_small, - "large": groupby_jit_data_large, - "nans": groupby_jit_data_nans, - } - - -def run_groupby_apply_jit_test(data, func, keys, *args): - expect_groupby_obj = data.to_pandas().groupby(keys) - got_groupby_obj = data.groupby(keys) - - # compare cuDF jit to pandas - cudf_jit_result = got_groupby_obj.apply( - func, *args, engine="jit", include_groups=False - ) - pandas_result = expect_groupby_obj.apply(func, *args, include_groups=False) - assert_groupby_results_equal(cudf_jit_result, pandas_result) - - -def groupby_apply_jit_reductions_test_inner(func, data, dtype): - # ideally we'd just have: - # lambda group: getattr(group, func)() - # but the current kernel caching mechanism relies on pickle which - # does not play nice with local functions. 
What's below uses - # exec as a workaround to write the test functions dynamically - - funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - data["val1"] = data["val1"].astype(dtype) - data["val2"] = data["val2"].astype(dtype) - - run_groupby_apply_jit_test(data, func, ["key1"]) - - -# test unary reductions -@pytest.mark.parametrize( - "dtype", - SUPPORTED_GROUPBY_NUMPY_TYPES, - ids=[str(t) for t in SUPPORTED_GROUPBY_NUMPY_TYPES], -) -@pytest.mark.parametrize( - "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] -) -@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_jit_unary_reductions( - request, func, dtype, dataset, groupby_jit_datasets -): - request.applymarker( - pytest.mark.xfail( - condition=( - ( - dataset == "nans" - and func in {"var", "std", "mean"} - and str(dtype) in {"int64", "float32", "float64"} - ) - or ( - dataset == "nans" - and func in {"idxmax", "idxmin", "sum"} - and dtype.kind == "f" - ) - ), - reason=("https://github.com/rapidsai/cudf/issues/14860"), - ) - ) - warn_condition = ( - dataset == "nans" - and func in {"idxmax", "idxmin"} - and dtype.kind == "f" - ) - dataset = groupby_jit_datasets[dataset].copy(deep=True) - with expect_warning_if(warn_condition, FutureWarning): - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) - - -# test unary reductions for special values -def groupby_apply_jit_reductions_special_vals_inner( - func, data, dtype, special_val -): - funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - data["val1"] = data["val1"].astype(dtype) - data["val2"] = data["val2"].astype(dtype) - data["val1"] = special_val - data["val2"] = special_val - - run_groupby_apply_jit_test(data, func, ["key1"]) - - -# test unary index reductions for special values -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def groupby_apply_jit_idx_reductions_special_vals_inner( - func, data, dtype, special_val -): - funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - data["val1"] = data["val1"].astype(dtype) - data["val2"] = data["val2"].astype(dtype) - data["val1"] = special_val - data["val2"] = special_val - - run_groupby_apply_jit_test(data, func, ["key1"]) - - -@pytest.mark.parametrize("dtype", ["float64", "float32"]) -@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) -@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) -@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_jit_reductions_special_vals( - func, dtype, dataset, groupby_jit_datasets, special_val -): - dataset = groupby_jit_datasets[dataset].copy(deep=True) - with expect_warning_if( - func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning - ): - groupby_apply_jit_reductions_special_vals_inner( - func, dataset, dtype, special_val - ) - - -@pytest.mark.parametrize("func", ["idxmax", "idxmin"]) -@pytest.mark.parametrize( - 
"special_val", - [ - pytest.param( - np.nan, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/13832" - ), - ), - np.inf, - -np.inf, - ], -) -@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="include_groups keyword new in pandas 2.2", -) -def test_groupby_apply_jit_idx_reductions_special_vals( - func, dataset, groupby_jit_datasets, special_val -): - dataset = groupby_jit_datasets[dataset].copy(deep=True) - groupby_apply_jit_idx_reductions_special_vals_inner( - func, dataset, "float64", special_val - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_sum_integer_overflow(): - max = np.iinfo("int32").max - - data = DataFrame( - { - "a": [0, 0, 0], - "b": [max, max, max], - } - ) - - def func(group): - return group["b"].sum() - - run_groupby_apply_jit_test(data, func, ["a"]) - - -@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) -@pytest.mark.parametrize( - "dataset", - [ - pytest.param( - "small", - marks=[ - pytest.mark.filterwarnings( - "ignore:Degrees of Freedom <= 0 for slice" - ), - pytest.mark.filterwarnings( - "ignore:divide by zero encountered in divide" - ), - ], - ), - "large", - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): - dataset = groupby_jit_datasets[dataset].copy(deep=True) - - dataset["val1"] = dataset["val1"].astype(dtype) - dataset["val2"] = dataset["val2"].astype(dtype) - - keys = ["key1"] - - def func(group): - return group["val1"].corr(group["val2"]) - - if np.dtype(dtype).kind == "f": - # Correlation of floating types is not yet supported: - # https://github.com/rapidsai/cudf/issues/13839 - m = ( - f"Series.corr\\(Series\\) is not " - f"supported for \\({dtype}, {dtype}\\)" - ) - with pytest.raises(UDFError, match=m): - run_groupby_apply_jit_test(dataset, func, keys) - return - with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): - run_groupby_apply_jit_test(dataset, func, keys) - - -@pytest.mark.parametrize("dtype", ["int32", "int64"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_correlation_zero_variance(dtype): - # pearson correlation is undefined when the variance of either - # variable is zero. This test ensures that the jit implementation - # returns the same result as pandas in this case. 
- data = DataFrame( - {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} - ) - - def func(group): - return group["b"].corr(group["c"]) - - with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): - run_groupby_apply_jit_test(data, func, ["a"]) - - -@pytest.mark.parametrize("op", unary_ops) -def test_groupby_apply_jit_invalid_unary_ops_error(groupby_jit_data_small, op): - keys = ["key1"] - - def func(group): - return op(group["val1"]) - - with pytest.raises( - UDFError, - match=f"{op.__name__}\\(Series\\) is not supported by JIT GroupBy", - ): - run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) - - -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_groupby_apply_jit_invalid_binary_ops_error( - groupby_jit_data_small, op -): - keys = ["key1"] - - def func(group): - return op(group["val1"], group["val2"]) - - with pytest.raises( - UDFError, - match=f"{op.__name__}\\(Series, Series\\) is not supported", - ): - run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) - - -def test_groupby_apply_jit_no_df_ops(groupby_jit_data_small): - # DataFrame level operations are not yet supported. - def func(group): - return group.sum() - - with pytest.raises( - UDFError, - match="JIT GroupBy.apply\\(\\) does not support DataFrame.sum\\(\\)", - ): - run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1"]) - - -@pytest.mark.parametrize("dtype", ["uint8", "str"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_unsupported_dtype(dtype): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - df["b"] = df["b"].astype(dtype) - - # a UDAF that doesn't actually use the input column - # with the unsupported dtype should still succeed - def func(group): - return group["c"].sum() - - run_groupby_apply_jit_test(df, func, ["a"]) - - # however a UDAF that does use the unsupported dtype - # should fail - def func(group): - return group["b"].sum() - - with pytest.raises(UDFError, match="Only columns of the following dtypes"): - run_groupby_apply_jit_test(df, func, ["a"]) - - -@pytest.mark.parametrize( - "func", - [ - lambda df: df["val1"].max() + df["val2"].min(), - lambda df: df["val1"].sum() + df["val2"].var(), - lambda df: df["val1"].mean() + df["val2"].std(), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_basic(func, groupby_jit_data_small): - run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) - - -def f1(df, k): - return df["val1"].max() + df["val2"].min() + k - - -def f2(df, k, L): - return df["val1"].sum() - df["val2"].var() + (k / L) - - -def f3(df, k, L, m): - return ((k * df["val1"].mean()) + (L * df["val2"].std())) / m - - -@pytest.mark.parametrize( - "func,args", [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): - run_groupby_apply_jit_test( - groupby_jit_data_small, func, ["key1", "key2"], *args - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_block_divergence(): - # https://github.com/rapidsai/cudf/issues/12686 - df = cudf.DataFrame( - { - "a": [0, 0, 0, 1, 1, 1], - "b": [1, 
1, 1, 2, 3, 4], - } - ) - - def diverging_block(grp_df): - if grp_df["b"].mean() > 1: - return grp_df["b"].mean() - return 0 - - run_groupby_apply_jit_test(df, diverging_block, ["a"]) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_caching(): - # Make sure similar functions that differ - # by simple things like constants actually - # recompile - - # begin with a clear cache - precompiled.clear() - assert precompiled.currsize == 0 - - data = cudf.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 4, 5, 6]}) - - def f(group): - return group["b"].mean() * 2 - - # a single run should result in a cache size of 1 - run_groupby_apply_jit_test(data, f, ["a"]) - assert precompiled.currsize == 1 - - # a second run with f should not increase the count - run_groupby_apply_jit_test(data, f, ["a"]) - assert precompiled.currsize == 1 - - # changing a constant value inside the UDF should miss - def f(group): - return group["b"].mean() * 3 - - run_groupby_apply_jit_test(data, f, ["a"]) - assert precompiled.currsize == 2 - - # changing the dtypes of the columns should miss - data["b"] = data["b"].astype("float64") - run_groupby_apply_jit_test(data, f, ["a"]) - - assert precompiled.currsize == 3 - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_no_bytecode_fallback(): - # tests that a function which contains no bytecode - # attribute, but would still be executable using - # the iterative groupby apply approach, still works. - - gdf = cudf.DataFrame({"a": [0, 1, 1], "b": [1, 2, 3]}) - pdf = gdf.to_pandas() - - def f(group): - return group.sum() - - part = partial(f) - - expect = pdf.groupby("a").apply(part, include_groups=False) - got = gdf.groupby("a").apply(part, engine="auto", include_groups=False) - assert_groupby_results_equal(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_return_col_from_df(): - # tests a UDF that consists of purely colwise - # ops, such as `lambda group: group.x + group.y` - # which returns a column - df = cudf.DataFrame( - { - "id": range(10), - "x": range(10), - "y": range(10), - } - ) - pdf = df.to_pandas() - - def func(df): - return df.x + df.y - - got = df.groupby("id").apply(func, include_groups=False) - expect = pdf.groupby("id").apply(func, include_groups=False) - # pandas seems to erroneously add an extra MI level of ids - # TODO: Figure out how pandas groupby.apply determines the columns - expect = pd.DataFrame(expect.droplevel(1), columns=got.columns) - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize("func", [lambda group: group.sum()]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_return_df(func): - # tests a UDF that reduces over a dataframe - # and produces a series with the original column names - # as its index, such as lambda group: group.sum() + group.min() - df = cudf.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, 4]}) - pdf = df.to_pandas() - - expect = pdf.groupby("a").apply(func, include_groups=False) - got = df.groupby("a").apply(func, include_groups=False) - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize("as_index", [True, False]) -def test_groupby_apply_return_reindexed_series(as_index): - def 
gdf_func(df): - return cudf.Series([df["a"].sum(), df["b"].min(), df["c"].max()]) - - def pdf_func(df): - return pd.Series([df["a"].sum(), df["b"].min(), df["c"].max()]) - - df = cudf.DataFrame( - { - "key": [0, 0, 1, 1, 2, 2], - "a": [1, 2, 3, 4, 5, 6], - "b": [7, 8, 9, 10, 11, 12], - "c": [13, 14, 15, 16, 17, 18], - } - ) - pdf = df.to_pandas() - - kwargs = {} - if PANDAS_GE_220: - kwargs["include_groups"] = False - - expect = pdf.groupby("key", as_index=as_index).apply(pdf_func, **kwargs) - got = df.groupby("key", as_index=as_index).apply(gdf_func, **kwargs) - assert_groupby_results_equal(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_noempty_group(): - pdf = pd.DataFrame( - {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} - ) - gdf = cudf.from_pandas(pdf) - - expect = ( - pdf.groupby("a", group_keys=False) - .apply(lambda x: x.iloc[[0, 1]], include_groups=False) - .reset_index(drop=True) - ) - got = ( - gdf.groupby("a") - .apply(lambda x: x.iloc[[0, 1]], include_groups=False) - .reset_index(drop=True) - ) - assert_groupby_results_equal(expect, got) - - -def create_test_groupby_apply_return_scalars_params(): - def f0(x): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = ticker / 10 - return full - - def f1(x, k): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = ticker / k - return full - - def f2(x, k, L): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = L * (ticker / k) - return full - - def f3(x, k, L, m): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = L * (ticker / k) % m - return full - - return [(f0, ()), (f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] - - -@pytest.mark.parametrize( - "func,args", create_test_groupby_apply_return_scalars_params() -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_return_scalars(func, args): - pdf = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], - "B": [ - 0.01, - np.nan, - 0.03, - 0.04, - np.nan, - 0.06, - 0.07, - 0.08, - 0.09, - 1.0, - ], - } - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("A").apply(func, *args, include_groups=False) - actual = gdf.groupby("A").apply(func, *args, include_groups=False) - - assert_groupby_results_equal(expected, actual) - - -def create_test_groupby_apply_return_series_dataframe_params(): - def f0(x): - return x - x.max() - - def f1(x): - return x.min() - x.max() - - def f2(x): - return x.min() - - def f3(x, k): - return x - x.max() + k - - def f4(x, k, L): - return x.min() - x.max() + (k / L) - - def f5(x, k, L, m): - return m * x.min() + (k / L) - - return [ - (f0, ()), - (f1, ()), - (f2, ()), - (f3, (42,)), - (f4, (42, 119)), - (f5, (41, 119, 212.1)), - ] - - -@pytest.mark.parametrize( - "func,args", create_test_groupby_apply_return_series_dataframe_params() -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_return_series_dataframe(func, args): - pdf = pd.DataFrame( - {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby(["key"], group_keys=False).apply( - func, *args, include_groups=False - ) - actual = gdf.groupby(["key"]).apply(func, *args, include_groups=False) - - assert_groupby_results_equal(expected, actual) - - 
-@pytest.mark.parametrize( - "pdf", - [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], -) -def test_groupby_apply_no_keys(pdf): - gdf = cudf.from_pandas(pdf) - if isinstance(pdf, pd.DataFrame): - kwargs = {"check_column_type": False} - else: - kwargs = {} - assert_groupby_results_equal( - pdf.groupby([], group_keys=False).apply(lambda x: x.max()), - gdf.groupby([]).apply(lambda x: x.max()), - check_index_type=False, # Int64 v/s Float64 - **kwargs, - ) - - -@pytest.mark.parametrize( - "data", - [ - {"Speed": [380.0, 370.0, 24.0, 26.0], "Score": [50, 30, 90, 80]}, - { - "Speed": [380.0, 370.0, 24.0, 26.0], - "Score": [50, 30, 90, 80], - "Other": [10, 20, 30, 40], - }, - ], -) -@pytest.mark.parametrize("group", ["Score", "Speed"]) -def test_groupby_describe(data, group): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - got = gdf.groupby(group).describe() - expect = pdf.groupby(group).describe() - - assert_groupby_results_equal(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [], "b": []}, - {"a": [2, 1, 2, 1, 1, 3], "b": [None, 1, 2, None, 2, None]}, - {"a": [None], "b": [None]}, - {"a": [2, 1, 1], "b": [None, 1, 0], "c": [None, 0, 1]}, - ], -) -@pytest.mark.parametrize("agg", ["first", "last", ["first", "last"]]) -def test_groupby_first(data, agg): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - expect = pdf.groupby("a").agg(agg) - got = gdf.groupby("a").agg(agg) - assert_groupby_results_equal(expect, got, check_dtype=False) - - -def test_groupby_apply_series(): - def foo(x): - return x.sum() - - got = make_frame(DataFrame, 100).groupby("x").y.apply(foo) - expect = make_frame(pd.DataFrame, 100).groupby("x").y.apply(foo) - - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize( - "func,args", - [ - (lambda x, k: x + k, (42,)), - (lambda x, k, L: x + k - L, (42, 191)), - (lambda x, k, L, m: (x + k) / (L * m), (42, 191, 99.9)), - ], -) -def test_groupby_apply_series_args(func, args): - got = make_frame(DataFrame, 100).groupby("x").y.apply(func, *args) - expect = ( - make_frame(pd.DataFrame, 100) - .groupby("x", group_keys=False) - .y.apply(func, *args) - ) - - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize("label", [None, "left", "right"]) -@pytest.mark.parametrize("closed", [None, "left", "right"]) -def test_groupby_freq_week(label, closed): - pdf = pd.DataFrame( - { - "Publish date": [ - pd.Timestamp("2000-01-03"), - pd.Timestamp("2000-01-01"), - pd.Timestamp("2000-01-09"), - pd.Timestamp("2000-01-02"), - pd.Timestamp("2000-01-07"), - pd.Timestamp("2000-01-16"), - ], - "ID": [0, 1, 2, 3, 4, 5], - "Price": [10, 20, 30, 40, 50, 60], - } - ) - gdf = cudf.from_pandas(pdf) - expect = pdf.groupby( - pd.Grouper(key="Publish date", freq="1W", label=label, closed=closed) - ).mean() - got = gdf.groupby( - cudf.Grouper(key="Publish date", freq="1W", label=label, closed=closed) - ).mean() - assert_eq( - expect, - got, - check_like=True, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize("label", [None, "left", "right"]) -@pytest.mark.parametrize("closed", [None, "left", "right"]) -def test_groupby_freq_day(label, closed): - pdf = pd.DataFrame( - { - "Publish date": [ - pd.Timestamp("2000-01-03"), - pd.Timestamp("2000-01-01"), - pd.Timestamp("2000-01-09"), - pd.Timestamp("2000-01-02"), - pd.Timestamp("2000-01-07"), - pd.Timestamp("2000-01-16"), - ], - "ID": [0, 1, 2, 3, 4, 5], - "Price": [10, 20, 30, 40, 50, 60], - } - ) - gdf = 
cudf.from_pandas(pdf) - expect = pdf.groupby( - pd.Grouper(key="Publish date", freq="3D", label=label, closed=closed) - ).mean() - got = gdf.groupby( - cudf.Grouper(key="Publish date", freq="3D", label=label, closed=closed) - ).mean() - assert_eq( - expect, - got, - check_like=True, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize("label", [None, "left", "right"]) -@pytest.mark.parametrize("closed", [None, "left", "right"]) -def test_groupby_freq_min(label, closed): - pdf = pd.DataFrame( - { - "Publish date": [ - pd.Timestamp("2000-01-01 12:01:00"), - pd.Timestamp("2000-01-01 12:05:00"), - pd.Timestamp("2000-01-01 15:30:00"), - pd.Timestamp("2000-01-02 00:00:00"), - pd.Timestamp("2000-01-01 23:47:00"), - pd.Timestamp("2000-01-02 00:05:00"), - ], - "ID": [0, 1, 2, 3, 4, 5], - "Price": [10, 20, 30, 40, 50, 60], - } - ) - gdf = cudf.from_pandas(pdf) - expect = pdf.groupby( - pd.Grouper(key="Publish date", freq="1h", label=label, closed=closed) - ).mean() - got = gdf.groupby( - cudf.Grouper(key="Publish date", freq="1h", label=label, closed=closed) - ).mean() - assert_eq( - expect, - got, - check_like=True, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize("label", [None, "left", "right"]) -@pytest.mark.parametrize("closed", [None, "left", "right"]) -def test_groupby_freq_s(label, closed): - pdf = pd.DataFrame( - { - "Publish date": [ - pd.Timestamp("2000-01-01 00:00:02"), - pd.Timestamp("2000-01-01 00:00:07"), - pd.Timestamp("2000-01-01 00:00:02"), - pd.Timestamp("2000-01-02 00:00:15"), - pd.Timestamp("2000-01-01 00:00:05"), - pd.Timestamp("2000-01-02 00:00:09"), - ], - "ID": [0, 1, 2, 3, 4, 5], - "Price": [10, 20, 30, 40, 50, 60], - } - ) - gdf = cudf.from_pandas(pdf) - expect = pdf.groupby( - pd.Grouper(key="Publish date", freq="3s", label=label, closed=closed) - ).mean() - got = gdf.groupby( - cudf.Grouper(key="Publish date", freq="3s", label=label, closed=closed) - ).mean() - assert_eq( - expect, - got, - check_like=True, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "pdf, group, name, obj", - [ - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "X", - "A", - None, - ), - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "X", - "B", - None, - ), - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "X", - "A", - pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}), - ), - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "Y", - 1, - pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}), - ), - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "Y", - 3, - pd.DataFrame({"a": [1, 2, 0, 11]}), - ), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warnings only given on newer versions.", -) -def test_groupby_get_group(pdf, group, name, obj): - gdf = cudf.from_pandas(pdf) - - if isinstance(obj, pd.DataFrame): - gobj = cudf.from_pandas(obj) - else: - gobj = obj - - pgb = pdf.groupby(group) - ggb = gdf.groupby(group) - with expect_warning_if(obj is not None): - expected = pgb.get_group(name=name, obj=obj) - with expect_warning_if(obj is not None): - actual = ggb.get_group(name=name, obj=gobj) - - assert_groupby_results_equal(expected, actual) - - expected = pdf.iloc[pgb.indices.get(name)] - actual = gdf.iloc[ggb.indices.get(name)] - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "by", - [ - "a", - ["a", "b"], - pd.Series([2, 1, 1, 2, 2]), - pd.Series(["b", "a", "a", "b", 
"b"]), - ], -) -@pytest.mark.parametrize("agg", ["sum", "mean", lambda df: df.mean()]) -def test_groupby_transform_aggregation(by, agg): - gdf = cudf.DataFrame( - {"a": [2, 2, 1, 2, 1], "b": [1, 1, 1, 2, 2], "c": [1, 2, 3, 4, 5]} - ) - pdf = gdf.to_pandas() - - expected = pdf.groupby(by).transform(agg) - actual = gdf.groupby(by).transform(agg) - - assert_groupby_results_equal(expected, actual) - - -def test_groupby_select_then_ffill(): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": [1, None, None, 2, None], - "c": [3, None, None, 4, None], - } - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("a")["c"].ffill() - actual = gdf.groupby("a")["c"].ffill() - - assert_groupby_results_equal(expected, actual) - - -def test_groupby_select_then_shift(): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5], "c": [3, 4, 5, 6, 7]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("a")["c"].shift(1) - actual = gdf.groupby("a")["c"].shift(1) - - assert_groupby_results_equal(expected, actual) - - -def test_groupby_select_then_diff(): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5], "c": [3, 4, 5, 6, 7]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("a")["c"].diff(1) - actual = gdf.groupby("a")["c"].diff(1) - - assert_groupby_results_equal(expected, actual) - - -# TODO: Add a test including datetime64[ms] column in input data - - -@pytest.mark.parametrize("by", ["a", ["a", "b"], pd.Series([1, 2, 1, 3])]) -def test_groupby_transform_maintain_index(by): - # test that we maintain the index after a groupby transform - gdf = cudf.DataFrame( - {"a": [1, 1, 1, 2], "b": [1, 2, 1, 2]}, index=[3, 2, 1, 0] - ) - pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby(by).transform("max"), gdf.groupby(by).transform("max") - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data, gkey", - [ - ( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - }, - ["id"], - ), - ( - { - "id": [0, 0, 0, 0, 1, 1, 1], - "a": [1, 3, 4, 2.0, -3.0, 9.0, 10.0], - "b": [10.0, 23, -4.0, 2, -3.0, None, 19.0], - }, - ["id", "a"], - ), - ( - { - "id": ["a", "a", "b", "b", "c", "c"], - "val1": [None, None, None, None, None, None], - }, - ["id"], - ), - ], -) -@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill", no_default, None]) -def test_groupby_pct_change(data, gkey, periods, fill_method): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - with expect_warning_if(fill_method not in (no_default, None)): - actual = gdf.groupby(gkey).pct_change( - periods=periods, fill_method=fill_method - ) - with expect_warning_if( - ( - fill_method not in (no_default, None) - or (fill_method is not None and pdf.isna().any().any()) - ) - ): - expected = pdf.groupby(gkey).pct_change( - periods=periods, fill_method=fill_method - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("periods", [-5, 5]) -def test_groupby_pct_change_multiindex_dataframe(periods): - gdf = cudf.DataFrame( - { - "a": [1, 1, 2, 2], - "b": [1, 1, 2, 3], - "c": [2, 3, 4, 5], - "d": [6, 8, 9, 1], - } - ).set_index(["a", "b"]) - - actual = gdf.groupby(level=["a", "b"]).pct_change(periods) - expected = gdf.to_pandas().groupby(level=["a", "b"]).pct_change(periods) - - 
assert_eq(expected, actual) - - -def test_groupby_pct_change_empty_columns(): - gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) - pdf = gdf.to_pandas() - - actual = gdf.groupby("id").pct_change() - expected = pdf.groupby("id").pct_change() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("group_keys", [None, True, False]) -@pytest.mark.parametrize("by", ["A", ["A", "B"]]) -def test_groupby_group_keys(group_keys, by): - gdf = cudf.DataFrame( - { - "A": "a a a a b b".split(), - "B": [1, 1, 2, 2, 3, 3], - "C": [4, 6, 5, 9, 8, 7], - } - ) - pdf = gdf.to_pandas() - - g_group = gdf.groupby(by, group_keys=group_keys) - p_group = pdf.groupby(by, group_keys=group_keys) - - actual = g_group[["B", "C"]].apply(lambda x: x / x.sum()) - expected = p_group[["B", "C"]].apply(lambda x: x / x.sum()) - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "by", - [ - lambda: "a", - lambda: "b", - lambda: ["a", "b"], - lambda: "c", - lambda: pd.Series([1, 2, 1, 2, 1, 2]), - lambda: pd.Series(["x", "y", "y", "x", "z", "x"]), - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -def test_groupby_ngroup(by, ascending): - df_ngroup = cudf.DataFrame( - { - "a": [2, 2, 1, 1, 2, 3], - "b": [1, 2, 1, 2, 1, 2], - "c": ["a", "a", "b", "c", "d", "c"], - }, - index=[1, 3, 5, 7, 4, 2], - ) - df_ngroup.index.name = "foo" - by = by() - expected = df_ngroup.to_pandas().groupby(by).ngroup(ascending=ascending) - actual = df_ngroup.groupby(by).ngroup(ascending=ascending) - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] -) -def test_groupby_dtypes(groups): - df = cudf.DataFrame( - {"a": [1, 2, 3, 3], "b": ["x", "y", "z", "a"], "c": [10, 11, 12, 12]} - ) - pdf = df.to_pandas() - with pytest.warns(FutureWarning): - expected = pdf.groupby(groups).dtypes - with pytest.warns(FutureWarning): - actual = df.groupby(groups).dtypes - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("index_names", ["a", "b", "c", ["b", "c"]]) -def test_groupby_by_index_names(index_names): - gdf = cudf.DataFrame( - {"a": [1, 2, 3, 4], "b": ["a", "b", "a", "a"], "c": [1, 1, 2, 1]} - ).set_index(index_names) - pdf = gdf.to_pandas() - - assert_groupby_results_equal( - pdf.groupby(index_names).min(), gdf.groupby(index_names).min() - ) - - -@pytest.mark.parametrize( - "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] -) -def test_group_by_pandas_compat(groups): - with cudf.option_context("mode.pandas_compatible", True): - df = cudf.DataFrame( - { - "a": [1, 3, 2, 3, 3], - "b": ["x", "a", "y", "z", "a"], - "c": [10, 13, 11, 12, 12], - } - ) - pdf = df.to_pandas() - - assert_eq(pdf.groupby(groups).max(), df.groupby(groups).max()) - - -class TestSample: - @pytest.fixture(params=["default", "rangeindex", "intindex", "strindex"]) - def index(self, request): - n = 12 - if request.param == "rangeindex": - return cudf.RangeIndex(2, n + 2) - elif request.param == "intindex": - return cudf.Index( - [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32" - ) - elif request.param == "strindex": - return cudf.Index(list(string.ascii_lowercase[:n])) - elif request.param == "default": - return None - - @pytest.fixture( - params=[ - ["a", "a", "b", "b", "c", "c", "c", "d", "d", "d", "d", "d"], - [1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4], - ], - ids=["str-group", "int-group"], - ) - def df(self, index, request): - 
return cudf.DataFrame( - {"a": request.param, "b": request.param, "v": request.param}, - index=index, - ) - - @pytest.fixture(params=["a", ["a", "b"]], ids=["single-col", "two-col"]) - def by(self, request): - return request.param - - def expected(self, df, *, n=None, frac=None): - value_counts = collections.Counter(df.a.values_host) - if n is not None: - values = list( - itertools.chain.from_iterable( - itertools.repeat(v, n) for v in value_counts.keys() - ) - ) - elif frac is not None: - values = list( - itertools.chain.from_iterable( - itertools.repeat(v, round(count * frac)) - for v, count in value_counts.items() - ) - ) - else: - raise ValueError("Must provide either n or frac") - values = cudf.Series(sorted(values), dtype=df.a.dtype) - return cudf.DataFrame({"a": values, "b": values, "v": values}) - - @pytest.mark.parametrize("n", [None, 0, 1, 2]) - def test_constant_n_no_replace(self, df, by, n): - result = df.groupby(by).sample(n=n).sort_values("a") - n = 1 if n is None else n - assert_eq(self.expected(df, n=n), result.reset_index(drop=True)) - - def test_constant_n_no_replace_too_large_raises(self, df): - with pytest.raises(ValueError): - df.groupby("a").sample(n=3) - - @pytest.mark.parametrize("n", [1, 2, 3]) - def test_constant_n_replace(self, df, by, n): - result = df.groupby(by).sample(n=n, replace=True).sort_values("a") - assert_eq(self.expected(df, n=n), result.reset_index(drop=True)) - - def test_invalid_arguments(self, df): - with pytest.raises(ValueError): - df.groupby("a").sample(n=1, frac=0.1) - - def test_not_implemented_arguments(self, df): - with pytest.raises(NotImplementedError): - # These are valid weights, but we don't implement this yet. - df.groupby("a").sample(n=1, weights=[1 / len(df)] * len(df)) - - @pytest.mark.parametrize("frac", [0, 1 / 3, 1 / 2, 2 / 3, 1]) - @pytest.mark.parametrize("replace", [False, True]) - def test_fraction_rounding(self, df, by, frac, replace): - result = ( - df.groupby(by).sample(frac=frac, replace=replace).sort_values("a") - ) - assert_eq(self.expected(df, frac=frac), result.reset_index(drop=True)) - - -class TestHeadTail: - @pytest.fixture(params=[-3, -2, -1, 0, 1, 2, 3], ids=lambda n: f"{n=}") - def n(self, request): - return request.param - - @pytest.fixture( - params=[False, True], ids=["no-preserve-order", "preserve-order"] - ) - def preserve_order(self, request): - return request.param - - @pytest.fixture - def df(self): - return cudf.DataFrame( - { - "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3], - "b": [0, 1, 2, 4, 3, 5, 6, 7, 9, 8, 10], - } - ) - - @pytest.fixture(params=[True, False], ids=["head", "tail"]) - def take_head(self, request): - return request.param - - @pytest.fixture - def expected(self, df, n, take_head, preserve_order): - if n == 0: - # We'll get an empty dataframe in this case - return df._empty_like(keep_index=True) - else: - if preserve_order: - # Should match pandas here - g = df.to_pandas().groupby("a") - if take_head: - return g.head(n=n) - else: - return g.tail(n=n) - else: - # We groupby "a" which is the first column. This - # possibly relies on an implementation detail that for - # integer group keys, cudf produces groups in sorted - # (ascending) order. 
- keyfunc = operator.itemgetter(0) - if take_head or n == 0: - # Head does group[:n] as does tail for n == 0 - slicefunc = operator.itemgetter(slice(None, n)) - else: - # Tail does group[-n:] except when n == 0 - slicefunc = operator.itemgetter( - slice(-n, None) if n else slice(0) - ) - values_to_sort = np.hstack( - [df.values_host, np.arange(len(df)).reshape(-1, 1)] - ) - expect_a, expect_b, index = zip( - *itertools.chain.from_iterable( - slicefunc(list(group)) - for _, group in itertools.groupby( - sorted(values_to_sort.tolist(), key=keyfunc), - key=keyfunc, - ) - ), - strict=True, - ) - return cudf.DataFrame( - {"a": expect_a, "b": expect_b}, index=index - ) - - def test_head_tail(self, df, n, take_head, expected, preserve_order): - if take_head: - actual = df.groupby("a").head(n=n, preserve_order=preserve_order) - else: - actual = df.groupby("a").tail(n=n, preserve_order=preserve_order) - assert_eq(actual, expected) - - -def test_head_tail_empty(): - # GH #13397 - - values = [1, 2, 3] - pdf = pd.DataFrame({}, index=values) - df = cudf.DataFrame({}, index=values) - - expected = pdf.groupby(pd.Series(values)).head() - got = df.groupby(cudf.Series(values)).head() - assert_eq(expected, got, check_column_type=False) - - expected = pdf.groupby(pd.Series(values)).tail() - got = df.groupby(cudf.Series(values)).tail() - - assert_eq(expected, got, check_column_type=False) - - -@pytest.mark.parametrize( - "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] -) -@pytest.mark.parametrize("sort", [True, False]) -def test_group_by_pandas_sort_order(groups, sort): - with cudf.option_context("mode.pandas_compatible", True): - df = cudf.DataFrame( - { - "a": [10, 1, 10, 3, 2, 1, 3, 3], - "b": [5, 6, 7, 1, 2, 3, 4, 9], - "c": [20, 20, 10, 11, 13, 11, 12, 12], - } - ) - pdf = df.to_pandas() - - assert_eq( - pdf.groupby(groups, sort=sort).sum(), - df.groupby(groups, sort=sort).sum(), - ) - - -@pytest.mark.parametrize( - "dtype", - ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"], -) -@pytest.mark.parametrize( - "reduce_op", - [ - "min", - "max", - "idxmin", - "idxmax", - "first", - "last", - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_group_by_empty_reduction(dtype, reduce_op): - gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) - pdf = gdf.to_pandas() - - gg = gdf.groupby("a")["c"] - pg = pdf.groupby("a")["c"] - - assert_eq( - getattr(gg, reduce_op)(), getattr(pg, reduce_op)(), check_dtype=True - ) - - -@pytest.mark.parametrize( - "dtype", - ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"], -) -@pytest.mark.parametrize( - "apply_op", - ["sum", "min", "max", "idxmax"], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_group_by_empty_apply(request, dtype, apply_op): - request.applymarker( - pytest.mark.xfail( - condition=(dtype == "datetime64[ns]" and apply_op == "sum"), - reason=("sum isn't supported for datetime64[ns]"), - ) - ) - - gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) - pdf = gdf.to_pandas() - - gg = gdf.groupby("a")["c"] - pg = pdf.groupby("a")["c"] - - assert_eq( - gg.apply(apply_op), - pg.apply(apply_op), - check_dtype=True, - check_index_type=True, - ) - - -def test_groupby_consecutive_operations(): - df = cudf.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) - pdf = df.to_pandas() - - gg = df.groupby("A") - pg = 
pdf.groupby("A") - - actual = gg.nth(-1) - expected = pg.nth(-1) - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - actual = gg.nth(0) - expected = pg.nth(0) - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - actual = gg.cumsum() - expected = pg.cumsum() - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - actual = gg.cumcount() - expected = pg.cumcount() - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - actual = gg.cumsum() - expected = pg.cumsum() - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warning only given on newer versions.", -) -def test_categorical_grouping_pandas_compatibility(): - gdf = cudf.DataFrame( - { - "key": cudf.Series([2, 1, 3, 1, 1], dtype="category"), - "a": [0, 1, 3, 2, 3], - } - ) - pdf = gdf.to_pandas() - - with cudf.option_context("mode.pandas_compatible", True): - actual = gdf.groupby("key", sort=False).sum() - with pytest.warns(FutureWarning): - # observed param deprecation. - expected = pdf.groupby("key", sort=False).sum() - assert_eq(actual, expected) - - -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("as_index", [True, False]) -def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): - # From Issue#12789 - df = cudf.DataFrame( - { - "gender": ["male", "male", "female", "male", "female", "male"], - "education": ["low", "medium", np.nan, "low", "high", "low"], - "country": ["US", "FR", "US", "FR", "FR", "FR"], - } - ) - pdf = df.to_pandas() - - actual = df.groupby("gender", as_index=as_index).value_counts( - normalize=normalize, sort=sort, ascending=ascending, dropna=dropna - ) - expected = pdf.groupby("gender", as_index=as_index).value_counts( - normalize=normalize, sort=sort, ascending=ascending, dropna=dropna - ) - - # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` - assert_groupby_results_equal( - actual, - expected, - check_index_type=False, - as_index=as_index, - by=["gender", "education"], - sort=sort, - ) - - -def test_group_by_value_counts_subset(): - # From Issue#12789 - df = cudf.DataFrame( - { - "gender": ["male", "male", "female", "male", "female", "male"], - "education": ["low", "medium", "high", "low", "high", "low"], - "country": ["US", "FR", "US", "FR", "FR", "FR"], - } - ) - pdf = df.to_pandas() - - actual = df.groupby("gender").value_counts(["education"]) - expected = pdf.groupby("gender").value_counts(["education"]) - - # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` - assert_groupby_results_equal( - actual, expected, check_names=False, check_index_type=False - ) - - -def test_group_by_value_counts_clash_with_subset(): - df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) - with pytest.raises(ValueError): - df.groupby("a").value_counts(["a"]) - - -def test_group_by_value_counts_subset_not_exists(): - df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) - with pytest.raises(ValueError): - df.groupby("a").value_counts(["c"]) - - -def test_group_by_value_counts_with_count_column(): - df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]}) - with pytest.raises(ValueError): - df.groupby("a", as_index=False).value_counts() - - -def test_groupby_internal_groups_empty(gdf): - # test that we 
don't segfault when calling the internal - # .groups() method with an empty list: - gb = gdf.groupby("y") - _, _, grouped_vals = gb._groups([]) - assert grouped_vals == [] - - -def test_groupby_shift_series_multiindex(): - idx = cudf.MultiIndex.from_tuples( - [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["f", "s"] - ) - ser = Series(range(4), index=idx) - result = ser.groupby(level=0).shift(1) - expected = ser.to_pandas().groupby(level=0).shift(1) - assert_eq(expected, result) - - -@pytest.mark.parametrize( - "func", ["min", "max", "sum", "mean", "idxmin", "idxmax"] -) -@pytest.mark.parametrize( - "by,data", - [ - ("a", {"a": [1, 2, 3]}), - (["a", "id"], {"id": [0, 0, 1], "a": [1, 2, 3]}), - ("a", {"a": [1, 2, 3], "b": ["A", "B", "C"]}), - ("id", {"id": [0, 0, 1], "a": [1, 2, 3], "b": ["A", "B", "C"]}), - (["b", "id"], {"id": [0, 0, 1], "b": ["A", "B", "C"]}), - ("b", {"b": ["A", "B", "C"]}), - ], -) -def test_group_by_reduce_numeric_only(by, data, func): - # Test that simple groupby reductions support numeric_only=True - df = cudf.DataFrame(data) - expected = getattr(df.to_pandas().groupby(by, sort=True), func)( - numeric_only=True - ) - result = getattr(df.groupby(by, sort=True), func)(numeric_only=True) - assert_eq(expected, result) - - -@pytest.mark.parametrize( - "op", ["cummax", "cummin", "cumprod", "cumsum", "mean", "median"] -) -def test_group_by_raises_string_error(op): - df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": ["a", "b", "c", "d", "e"]}) - - with pytest.raises(TypeError): - df.groupby(df.a).agg(op) - - -@pytest.mark.parametrize( - "op", - [ - "cummax", - "cummin", - "cumprod", - "cumsum", - "mean", - "median", - "prod", - "sum", - list, - ], -) -def test_group_by_raises_category_error(op): - df = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": cudf.Series(["a", "b", "c", "d", "e"], dtype="category"), - } - ) - - with pytest.raises(TypeError): - df.groupby(df.a).agg(op) - - -def test_ngroups(): - pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) - gdf = cudf.DataFrame.from_pandas(pdf) - - pgb = pdf.groupby("a") - ggb = gdf.groupby("a") - assert pgb.ngroups == ggb.ngroups - assert len(pgb) == len(ggb) - - -def test_ndim(): - pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) - gdf = cudf.DataFrame.from_pandas(pdf) - - pgb = pdf.groupby("a") - ggb = gdf.groupby("a") - assert pgb.ndim == ggb.ndim - - pser = pd.Series(range(3)) - gser = cudf.Series.from_pandas(pser) - pgb = pser.groupby([0, 0, 1]) - ggb = gser.groupby(cudf.Series([0, 0, 1])) - assert pgb.ndim == ggb.ndim - - -@pytest.mark.skipif( - not PANDAS_GE_220, reason="pandas behavior applicable in >=2.2" -) -def test_get_group_list_like(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - result = df.groupby(["a"]).get_group((1,)) - expected = df.to_pandas().groupby(["a"]).get_group((1,)) - assert_eq(result, expected) - - with pytest.raises(KeyError): - df.groupby(["a"]).get_group((1, 2)) - - with pytest.raises(KeyError): - df.groupby(["a"]).get_group([1]) - - -def test_get_group_list_like_len_2(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [3, 2, 1]}) - result = df.groupby(["a", "b"]).get_group((1, 4)) - expected = df.to_pandas().groupby(["a", "b"]).get_group((1, 4)) - assert_eq(result, expected) - - -def test_size_as_index_false(): - df = pd.DataFrame({"a": [1, 2, 1], "b": [1, 2, 3]}, columns=["a", "b"]) - expected = df.groupby("a", as_index=False).size() - result = cudf.from_pandas(df).groupby("a", as_index=False).size() - assert_groupby_results_equal(result, expected, 
as_index=False, by="a")
-
-
-def test_size_series_with_name():
-    ser = pd.Series(range(3), name="foo")
-    expected = ser.groupby(ser).size()
-    result = cudf.from_pandas(ser).groupby(ser).size()
-    assert_groupby_results_equal(result, expected)
-
-
-@pytest.mark.parametrize("op", ["cumsum", "cumprod", "cummin", "cummax"])
-def test_scan_int_null_pandas_compatible(op):
-    data = {"a": [1, 2, None, 3], "b": ["x"] * 4}
-    df_pd = pd.DataFrame(data)
-    df_cudf = cudf.DataFrame(data)
-    expected = getattr(df_pd.groupby("b")["a"], op)()
-    with cudf.option_context("mode.pandas_compatible", True):
-        result = getattr(df_cudf.groupby("b")["a"], op)()
-    assert_eq(result, expected)
-
-
-def test_agg_duplicate_aggs_pandas_compat_raises():
-    agg = {"b": ["mean", "mean"]}
-    dfgb = cudf.DataFrame({"a": [1, 1, 2], "b": [4, 5, 6]}).groupby(["a"])
-    with cudf.option_context("mode.pandas_compatible", True):
-        with pytest.raises(NotImplementedError):
-            dfgb.agg(agg)
-
-    with pytest.warns(UserWarning):
-        result = dfgb.agg(agg)
-    expected = cudf.DataFrame(
-        [4.5, 6.0],
-        index=cudf.Index([1, 2], name="a"),
-        columns=pd.MultiIndex.from_tuples([("b", "mean")]),
-    )
-    assert_groupby_results_equal(result, expected)

From df567254b81fc8d8c6d8c4f636d1c288717b77c2 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 15 Aug 2025 12:00:31 -0700
Subject: [PATCH 136/366] Add new xfails for xarray release (#19705)

We have two new failures in the pandas test suite that I started seeing
yesterday. I strongly suspect that these are due to [the new xarray
release](https://pypi.org/project/xarray/). Adding to the xfail list for
now to unblock our CI.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/19705
---
 python/cudf/cudf/pandas/scripts/conftest-patch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index 3869116496e..9a1051ec158 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -6867,6 +6867,8 @@ def pytest_unconfigure(config):
     "tests/generic/test_to_xarray.py::TestDataFrameToXArray::test_to_xarray_index_types[uint32]",
     "tests/generic/test_to_xarray.py::TestDataFrameToXArray::test_to_xarray_index_types[uint64]",
     "tests/generic/test_to_xarray.py::TestDataFrameToXArray::test_to_xarray_index_types[uint8]",
+    "tests/generic/test_to_xarray.py::TestSeriesToXArray::test_to_xarray_index_types[string-python]",
+    "tests/generic/test_to_xarray.py::TestSeriesToXArray::test_to_xarray_index_types[string-pyarrow]",
     "tests/groupby/aggregate/test_aggregate.py::test_agg_grouping_is_list_tuple",
     "tests/groupby/aggregate/test_aggregate.py::test_agg_multiple_with_as_index_false_subset_to_a_single_column",
     "tests/groupby/aggregate/test_aggregate.py::test_agg_str_with_kwarg_axis_1_raises[count]",

From ff1a803ecdf4ceefca29428e8a421e471387e915 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 15 Aug 2025 14:44:34 -0700
Subject: [PATCH 137/366] Preserve decimal precision in
 `cudf::interop::column_metadata` (#19587)

Towards https://github.com/rapidsai/cudf/issues/18863

To support decimal types in cudf_polars, we'll need to preserve/assign a
resulting precision when exporting `to_polars` based on the input decimal
precision from Polars.
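A minimal sketch of the intended round trip, assuming the new
`ColumnMetadata.precision` field introduced below (the values are
illustrative and mirror the pylibcudf tests added in this PR):

    import decimal

    import pyarrow as pa
    import pylibcudf as plc

    # A decimal column whose logical Arrow type is decimal128(3, 2).
    arr = pa.array([decimal.Decimal("1.23"), None], type=pa.decimal128(3, 2))
    col = plc.Column.from_arrow(arr)

    # Without metadata the exported schema falls back to the maximum
    # precision of the physical type (38 for decimal128); passing
    # ColumnMetadata(precision=3) records the original precision instead.
    result = plc.interop.to_arrow(
        col, metadata=plc.interop.ColumnMetadata(precision=3)
    )
    assert result.type == pa.decimal128(3, 2)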
This PR allows us to write out the intended precision to the Arrow schema
when utilizing `column_metadata` to interop between cuDF and Polars via
Arrow.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/19587
---
 cpp/include/cudf/interop.hpp                  |  4 +-
 cpp/src/interop/to_arrow_schema.cpp           | 22 +++++---
 .../pylibcudf/pylibcudf/_interop_helpers.pyx  |  5 +-
 .../pylibcudf/pylibcudf/libcudf/interop.pxd   |  3 ++
 python/pylibcudf/tests/test_interop.py        | 52 +++++++++++++++++++
 5 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 76c7fa1b614..2929b5cc601 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -113,12 +113,12 @@ DLManagedTensor* to_dlpack(
 /**
  * @brief Detailed metadata information for arrow array.
  *
- * As of now this contains only name in the hierarchy of children of cudf column,
- * but in future this can be updated as per requirement.
+ * This contains attributes of the column or type not natively supported by cudf.
  */
 struct column_metadata {
   std::string name;      ///< Name of the column
   std::string timezone;  ///< Timezone of the column
+  std::optional<int32_t> precision;  ///< Resulting decimal precision of the column
   std::vector<column_metadata> children_meta;  ///< Metadata of children of the column
 
   /**
diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp
index 2fe562dda94..bd6f7622a31 100644
--- a/cpp/src/interop/to_arrow_schema.cpp
+++ b/cpp/src/interop/to_arrow_schema.cpp
@@ -79,37 +79,43 @@ struct dispatch_to_arrow_type {
 template <typename DeviceType>
 int decimals_to_arrow(column_view input, int32_t precision, ArrowSchema* out)
 {
+  CUDF_EXPECTS(precision >= 1 and precision <= 38,
+               "Precision must be between 1 and 38 inclusive",
+               cudf::data_type_error);
   return ArrowSchemaSetTypeDecimal(
     out, id_to_arrow_type(input.type().id()), precision, -input.type().scale());
 }
 
 template <>
 int dispatch_to_arrow_type::operator()<numeric::decimal32>(column_view input,
-                                                           column_metadata const&,
+                                                           column_metadata const& metadata,
                                                            ArrowSchema* out)
 {
-  using DeviceType = int32_t;
-  return decimals_to_arrow<DeviceType>(input, cudf::detail::max_precision<DeviceType>(), out);
+  using DeviceType  = int32_t;
+  int32_t precision = metadata.precision.value_or(cudf::detail::max_precision<DeviceType>());
+  return decimals_to_arrow<DeviceType>(input, precision, out);
 }
 
 template <>
 int dispatch_to_arrow_type::operator()<numeric::decimal64>(column_view input,
-                                                           column_metadata const&,
+                                                           column_metadata const& metadata,
                                                            ArrowSchema* out)
 {
   using DeviceType = int64_t;
   // Arrow decimal 64 maxes at precision of 18, cudf::detail::max_precision<DeviceType>() produces 19.
   // decimal32 has precision 1 - 9, decimal64 has precision 10 - 18, decimal128 is 19 - 38
-  return decimals_to_arrow<DeviceType>(input, cudf::detail::max_precision<DeviceType>() - 1, out);
+  int32_t precision = metadata.precision.value_or(cudf::detail::max_precision<DeviceType>() - 1);
+  return decimals_to_arrow<DeviceType>(input, precision, out);
 }
 
 template <>
 int dispatch_to_arrow_type::operator()<numeric::decimal128>(column_view input,
-                                                            column_metadata const&,
+                                                            column_metadata const& metadata,
                                                             ArrowSchema* out)
 {
-  using DeviceType = __int128_t;
-  return decimals_to_arrow<DeviceType>(input, cudf::detail::max_precision<DeviceType>(), out);
+  using DeviceType  = __int128_t;
+  int32_t precision = metadata.precision.value_or(cudf::detail::max_precision<DeviceType>());
+  return decimals_to_arrow<DeviceType>(input, precision, out);
 }
 
 template <>
diff --git a/python/pylibcudf/pylibcudf/_interop_helpers.pyx b/python/pylibcudf/pylibcudf/_interop_helpers.pyx
index 122858d98b1..3e474426898 100644
--- a/python/pylibcudf/pylibcudf/_interop_helpers.pyx
+++ b/python/pylibcudf/pylibcudf/_interop_helpers.pyx
@@ -1,5 +1,5 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.
-
+from libc.stdint cimport int32_t
 from cpython.pycapsule cimport PyCapsule_GetPointer
 
 from pylibcudf.libcudf.interop cimport (
@@ -39,6 +39,7 @@ class ColumnMetadata:
     """
     name: str = ""
     timezone: str = ""
+    precision: int | None = None
     children_meta: list[ColumnMetadata] = field(default_factory=list)
 
 
@@ -78,6 +79,8 @@ cdef column_metadata _metadata_to_libcudf(metadata):
     cdef column_metadata c_metadata
     c_metadata.name = metadata.name.encode()
    c_metadata.timezone = metadata.timezone.encode()
+    if metadata.precision is not None:
+        c_metadata.precision = metadata.precision
     for child_meta in metadata.children_meta:
         c_metadata.children_meta.push_back(_metadata_to_libcudf(child_meta))
     return c_metadata
diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
index 257bdcea739..301ae41d667 100644
--- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
@@ -1,6 +1,8 @@
 # Copyright (c) 2020-2025, NVIDIA CORPORATION.
+from libc.stdint cimport int32_t
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
+from libcpp.optional cimport optional
 from libcpp.vector cimport vector
 from pylibcudf.exception_handler cimport libcudf_exception_handler
 from pylibcudf.libcudf.column.column cimport column
@@ -49,6 +51,7 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \
         column_metadata(string name_) except +libcudf_exception_handler
         string name
         string timezone
+        optional[int32_t] precision
         vector[column_metadata] children_meta
 
 
diff --git a/python/pylibcudf/tests/test_interop.py b/python/pylibcudf/tests/test_interop.py
index 35f9b73ebf3..171d70c2496 100644
--- a/python/pylibcudf/tests/test_interop.py
+++ b/python/pylibcudf/tests/test_interop.py
@@ -1,10 +1,13 @@
 # Copyright (c) 2024-2025, NVIDIA CORPORATION.
+import decimal
+
 import cupy as cp
 import nanoarrow
 import nanoarrow.device
 import numpy as np
 import pyarrow as pa
+import pyarrow.compute as pc
 import pytest
 from packaging.version import parse
 from utils import assert_column_eq, assert_table_eq
 
@@ -97,6 +100,55 @@ def test_decimal_other(data_type):
     assert arrow_type == pa.decimal128(precision, 0)
 
 
+@pytest.mark.parametrize(
+    "plc_type",
+    [plc.TypeId.DECIMAL128, plc.TypeId.DECIMAL64, plc.TypeId.DECIMAL32],
+)
+def test_decimal_respect_metadata_precision(plc_type, request):
+    request.node.add_marker(
+        pytest.mark.xfail(
+            parse(pa.__version__) < parse("19.0.0")
+            and plc_type in {plc.TypeId.DECIMAL64, plc.TypeId.DECIMAL32},
+            reason=(
+                "pyarrow does not interpret Arrow schema decimal type string correctly"
+            ),
+        )
+    )
+    precision, scale = 3, 2
+    expected = pa.array(
+        [decimal.Decimal("1.23"), None], type=pa.decimal128(precision, scale)
+    )
+    plc_column = plc.unary.cast(
+        plc.Column.from_arrow(expected), plc.DataType(plc_type, scale=-scale)
+    )
+    result = plc.interop.to_arrow(
+        plc_column, metadata=plc.interop.ColumnMetadata(precision=precision)
+    )
+    if parse(pa.__version__) >= parse("19.0.0"):
+        if plc_type == plc.TypeId.DECIMAL64:
+            expected = pc.cast(expected, pa.decimal64(precision, scale))
+        elif plc_type == plc.TypeId.DECIMAL32:
+            expected = pc.cast(expected, pa.decimal32(precision, scale))
+    assert result.equals(expected)
+
+
+@pytest.mark.parametrize("precision", [0, 39])
+def test_decimal_precision_metadata_out_of_range(precision):
+    scale = 2
+    expected = pa.array(
+        [decimal.Decimal("1.23"), None], type=pa.decimal128(3, scale)
+    )
+    plc_column = plc.unary.cast(
+        plc.Column.from_arrow(expected),
+        plc.DataType(plc.TypeId.DECIMAL128, scale=-scale),
+    )
+    with pytest.raises(TypeError):
+        plc.interop.to_arrow(
+            plc_column,
+            metadata=plc.interop.ColumnMetadata(precision=precision),
+        )
+
+
 def test_round_trip_dlpack_plc_table():
     expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]})
     plc_table = plc.Table.from_arrow(expected)

From 7271bf5a9623e8f02ce9d44305f04ed4a2a47313 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 15 Aug 2025 14:53:15 -0700
Subject: [PATCH 138/366] Move test_stats/reductions/quantile and misc to new
 cudf classic testing directory (#19675)

Towards https://github.com/rapidsai/cudf/issues/9999
Towards https://github.com/rapidsai/cudf/issues/15723

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19675
---
 python/cudf/cudf/tests/conftest.py            |   16 +
 .../tests/dataframe/methods/test_cov_corr.py  |   15 +
 .../dataframe/methods/test_reductions.py      |  183 +++
 .../tests/series/methods/test_cov_corr.py     |  159 +++
 .../tests/series/methods/test_pct_change.py   |   49 +
 .../tests/series/methods/test_reductions.py   | 1041 +++++++++++++++++
 .../cudf/tests/series/methods/test_unique.py  |   48 +
 .../cudf/cudf/tests/series/test_reductions.py |   17 -
 python/cudf/cudf/tests/test_datetime.py       |   56 -
 python/cudf/cudf/tests/test_quantiles.py      |  104 --
 python/cudf/cudf/tests/test_reductions.py     |  512 --------
 python/cudf/cudf/tests/test_stats.py          |  664 -----------
 python/cudf/cudf/tests/test_string.py         |  124 --
 python/cudf/cudf/tests/test_timedelta.py      |   79 +--
 14 files changed, 1512 insertions(+), 1555 deletions(-)
 create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_cov_corr.py
 create mode
100644 python/cudf/cudf/tests/series/methods/test_cov_corr.py create mode 100644 python/cudf/cudf/tests/series/methods/test_pct_change.py create mode 100644 python/cudf/cudf/tests/series/methods/test_reductions.py delete mode 100644 python/cudf/cudf/tests/series/test_reductions.py delete mode 100644 python/cudf/cudf/tests/test_quantiles.py delete mode 100644 python/cudf/cudf/tests/test_reductions.py delete mode 100644 python/cudf/cudf/tests/test_stats.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 0e193b57cd1..f05ec3dd247 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -276,6 +276,16 @@ def reduction_methods(request): return request.param +@pytest.fixture(params=["linear", "lower", "higher", "midpoint", "nearest"]) +def quantile_interpolation(request): + return request.param + + +@pytest.fixture(params=["spearman", "pearson"]) +def corr_method(request): + return request.param + + signed_integer_types = ["int8", "int16", "int32", "int64"] unsigned_integer_types = ["uint8", "uint16", "uint32", "uint64"] float_types = ["float32", "float64"] @@ -486,6 +496,12 @@ def dropna(request): return request.param +@pytest.fixture(params=[True, False]) +def skipna(request): + """Param for `skipna` argument""" + return request.param + + @pytest.fixture(params=[True, False, None]) def nan_as_null(request): """Param for `nan_as_null` argument""" diff --git a/python/cudf/cudf/tests/dataframe/methods/test_cov_corr.py b/python/cudf/cudf/tests/dataframe/methods/test_cov_corr.py new file mode 100644 index 00000000000..5e172baf92a --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_cov_corr.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np + +import cudf +from cudf.testing import assert_eq + + +def test_df_corr(corr_method): + gdf = cudf.DataFrame(np.random.default_rng(seed=0).normal(-100, 100, 10)) + pdf = gdf.to_pandas() + got = gdf.corr(corr_method) + expected = pdf.corr(corr_method) + assert_eq(got, expected) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_reductions.py b/python/cudf/cudf/tests/dataframe/methods/test_reductions.py new file mode 100644 index 00000000000..9b4134e5b3b --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_reductions.py @@ -0,0 +1,183 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
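+#
+# DataFrame-level reduction tests (kurtosis/skew, quantile, any/all)
+# gathered here from the deleted test_reductions.py, test_quantiles.py,
+# and test_stats.py modules.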
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.testing import assert_eq +from cudf.testing._utils import expect_warning_if + + +@pytest.mark.parametrize("null_flag", [False, True]) +def test_kurtosis_df(null_flag, numeric_only): + data = cudf.DataFrame( + { + "a": np.arange(10, dtype="float64"), + "b": np.arange(10, dtype="int64"), + "c": np.arange(10, dtype="float64"), + "d": ["a"] * 10, + } + ) + if not numeric_only: + data = data.select_dtypes(include="number") + pdata = data.to_pandas() + + if null_flag and len(data) > 2: + data.iloc[[0, 2]] = None + pdata.iloc[[0, 2]] = None + + got = data.kurtosis(numeric_only=numeric_only) + got = got if np.isscalar(got) else got.to_numpy() + + expected = pdata.kurtosis(numeric_only=numeric_only) + np.testing.assert_array_almost_equal(got, expected) + + got = data.kurt(numeric_only=numeric_only) + got = got if np.isscalar(got) else got.to_numpy() + + expected = pdata.kurt(numeric_only=numeric_only) + np.testing.assert_array_almost_equal(got, expected) + + +@pytest.mark.parametrize("null_flag", [False, True]) +def test_skew_df(null_flag, numeric_only): + data = cudf.DataFrame( + { + "a": np.arange(10, dtype="float64"), + "b": np.arange(10, dtype="int64"), + "c": np.arange(10, dtype="float64"), + "d": ["a"] * 10, + } + ) + if not numeric_only: + data = data.select_dtypes(include="number") + pdata = data.to_pandas() + + if null_flag and len(data) > 2: + data.iloc[[0, 2]] = None + pdata.iloc[[0, 2]] = None + + got = data.skew(numeric_only=numeric_only) + expected = pdata.skew(numeric_only=numeric_only) + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected) + + +def test_single_q(): + q = 0.5 + + pdf = pd.DataFrame({"a": [4, 24, 13, 8, 7]}) + gdf = cudf.from_pandas(pdf) + + pdf_q = pdf.quantile(q, interpolation="nearest") + gdf_q = gdf.quantile(q, interpolation="nearest", method="table") + + assert_eq(pdf_q, gdf_q, check_index_type=False) + + +def test_with_index(): + q = [0, 0.5, 1] + + pdf = pd.DataFrame({"a": [7, 4, 4, 9, 13]}, index=[0, 4, 3, 2, 7]) + gdf = cudf.from_pandas(pdf) + + pdf_q = pdf.quantile(q, interpolation="nearest") + gdf_q = gdf.quantile(q, interpolation="nearest", method="table") + + assert_eq(pdf_q, gdf_q, check_index_type=False) + + +def test_with_multiindex(): + q = [0, 0.5, 1] + + pdf = pd.DataFrame( + { + "index_1": [3, 1, 9, 7, 5], + "index_2": [2, 4, 3, 5, 1], + "a": [8, 4, 2, 3, 8], + } + ) + pdf.set_index(["index_1", "index_2"], inplace=True) + + gdf = cudf.from_pandas(pdf) + + pdf_q = pdf.quantile(q, interpolation="nearest") + gdf_q = gdf.quantile(q, interpolation="nearest", method="table") + + assert_eq(pdf_q, gdf_q, check_index_type=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [10, 11, 12]}, + {"a": [1, 0, 3], "b": [10, 11, 12]}, + {"a": [1, 2, 3], "b": [10, 11, None]}, + { + "a": [], + }, + {}, + ], +) +@pytest.mark.parametrize("op", ["all", "any"]) +def test_any_all_axis_none(data, op): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = getattr(pdf, op)(axis=None) + actual = getattr(gdf, op)(axis=None) + + assert expected == actual + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warning not given on older versions of pandas", +) +def test_reductions_axis_none_warning(request, reduction_methods): + if reduction_methods == "quantile": + pytest.skip(f"pandas 
{reduction_methods} doesn't support axis=None") + df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]}) + pdf = df.to_pandas() + with expect_warning_if( + reduction_methods in {"sum", "product", "std", "var"}, + FutureWarning, + ): + actual = getattr(df, reduction_methods)(axis=None) + with expect_warning_if( + reduction_methods in {"sum", "product", "std", "var"}, + FutureWarning, + ): + expected = getattr(pdf, reduction_methods)(axis=None) + assert_eq(expected, actual, check_dtype=False) + + +def test_dataframe_reduction_no_args(reduction_methods): + df = cudf.DataFrame({"a": range(10), "b": range(10)}) + pdf = df.to_pandas() + result = getattr(df, reduction_methods)() + expected = getattr(pdf, reduction_methods)() + assert_eq(result, expected) + + +def test_reduction_column_multiindex(): + idx = cudf.MultiIndex.from_tuples( + [("a", 1), ("a", 2)], names=["foo", "bar"] + ) + df = cudf.DataFrame(np.array([[1, 3], [2, 4]]), columns=idx) + result = df.mean() + expected = df.to_pandas().mean() + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")] +) +def test_dataframe_axis_0_preserve_column_type_in_index(columns): + pd_df = pd.DataFrame([[1, 2]], columns=columns) + cudf_df = cudf.DataFrame.from_pandas(pd_df) + result = cudf_df.sum(axis=0) + expected = pd_df.sum(axis=0) + assert_eq(result, expected, check_index_type=True) diff --git a/python/cudf/cudf/tests/series/methods/test_cov_corr.py b/python/cudf/cudf/tests/series/methods/test_cov_corr.py new file mode 100644 index 00000000000..332e9d68fe3 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_cov_corr.py @@ -0,0 +1,159 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.testing import assert_eq +from cudf.testing._utils import expect_warning_if + + +@pytest.mark.parametrize( + "data1", + [ + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), + np.zeros(100), + np.repeat(np.nan, 100), + np.array([1.123, 2.343, np.nan, 0.0]), + pa.array([5, 10, 53, None, np.nan, None]), + pd.Series([1.1, 2.32, 43.4], index=[0, 4, 3]), + np.array([], dtype="float64"), + np.array([-3]), + ], +) +@pytest.mark.parametrize( + "data2", + [ + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), + np.zeros(100), + np.repeat(np.nan, 100), + np.array([1.123, 2.343, np.nan, 0.0]), + pd.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), + np.array([5]), + ], +) +def test_cov1d(data1, data2): + gs1 = cudf.Series(data1) + gs2 = cudf.Series(data2) + + ps1 = gs1.to_pandas() + ps2 = gs2.to_pandas() + + got = gs1.cov(gs2) + ps1_align, ps2_align = ps1.align(ps2, join="inner") + with expect_warning_if( + (len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0) + or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0), + RuntimeWarning, + ): + expected = ps1.cov(ps2) + np.testing.assert_approx_equal(got, expected, significant=8) + + +@pytest.mark.parametrize( + "data1", + [ + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), + np.zeros(100), + np.repeat(np.nan, 100), + np.array([1.123, 2.343, np.nan, 0.0]), + pa.array([5, 10, 53, None, np.nan, None]), + pd.Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), + np.array([], 
dtype="float64"), + np.array([-3]), + ], +) +@pytest.mark.parametrize( + "data2", + [ + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), + np.zeros(100), + np.repeat(np.nan, 100), + np.array([1.123, 2.343, np.nan, 0.0]), + pd.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), + np.array([5]), + ], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Warnings missing on older pandas (scipy version seems unrelated?)", +) +def test_corr1d(data1, data2, corr_method): + if corr_method == "spearman": + # Pandas uses scipy.stats.spearmanr code-path + pytest.importorskip("scipy") + + gs1 = cudf.Series(data1) + gs2 = cudf.Series(data2) + + ps1 = gs1.to_pandas() + ps2 = gs2.to_pandas() + + got = gs1.corr(gs2, corr_method) + + ps1_align, ps2_align = ps1.align(ps2, join="inner") + + is_singular = ( + len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0 + ) or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0) + is_identical = ( + len(ps1_align.dropna().unique()) == 1 and len(ps2_align.dropna()) > 0 + ) or ( + len(ps2_align.dropna().unique()) == 1 and len(ps1_align.dropna()) > 0 + ) + + # Pearson correlation leads to division by 0 when either sample size is 1. + # Spearman allows for size 1 samples, but will error if all data in a + # sample is identical since the covariance is zero and so the correlation + # coefficient is not defined. + cond = ((is_singular or is_identical) and corr_method == "pearson") or ( + is_identical and not is_singular and corr_method == "spearman" + ) + if corr_method == "spearman": + # SciPy has shuffled around the warning it throws a couple of times. + # It's not worth the effort of conditionally importing the appropriate + # warning based on the scipy version, just catching a base Warning is + # good enough validation. + expected_warning = Warning + elif corr_method == "pearson": + expected_warning = RuntimeWarning + + with expect_warning_if(cond, expected_warning): + expected = ps1.corr(ps2, corr_method) + np.testing.assert_approx_equal(got, expected, significant=8) + + +@pytest.mark.parametrize( + "data1", + [ + [1, 2, 3, 4], + [10, 1, 3, 5], + ], +) +@pytest.mark.parametrize( + "data2", + [ + [1, 2, 3, 4], + [10, 1, 3, 5], + ], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_cov_corr_datetime_timedelta(data1, data2, temporal_types_as_str): + gsr1 = cudf.Series(data1, dtype=temporal_types_as_str) + gsr2 = cudf.Series(data2, dtype=temporal_types_as_str) + psr1 = gsr1.to_pandas() + psr2 = gsr2.to_pandas() + + assert_eq(psr1.corr(psr2), gsr1.corr(gsr2)) + assert_eq(psr1.cov(psr2), gsr1.cov(gsr2)) diff --git a/python/cudf/cudf/tests/series/methods/test_pct_change.py b/python/cudf/cudf/tests/series/methods/test_pct_change.py new file mode 100644 index 00000000000..47b1a81151b --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_pct_change.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
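+#
+# Series.pct_change tests moved here from the deleted test_stats.py; the
+# expect_warning_if guards cover pandas' FutureWarning for explicit
+# fill_method values.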
+ + +import numpy as np +import pytest + +import cudf +from cudf.api.extensions import no_default +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.testing._utils import expect_warning_if + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize( + "data", + [ + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), + np.zeros(100), + np.array([1.123, 2.343, np.nan, 0.0]), + np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), + np.array([], dtype="float64"), + np.array([-3]), + ], +) +@pytest.mark.parametrize("periods", [-5, 0, 5]) +@pytest.mark.parametrize( + "fill_method", ["ffill", "bfill", "pad", "backfill", no_default, None] +) +def test_series_pct_change(data, periods, fill_method): + cs = cudf.Series(data) + ps = cs.to_pandas() + + if np.abs(periods) <= len(cs): + with expect_warning_if(fill_method not in (no_default, None)): + got = cs.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if( + ( + fill_method not in (no_default, None) + or (fill_method is not None and ps.isna().any()) + ) + ): + expected = ps.pct_change(periods=periods, fill_method=fill_method) + np.testing.assert_array_almost_equal( + got.to_numpy(na_value=np.nan), expected + ) diff --git a/python/cudf/cudf/tests/series/methods/test_reductions.py b/python/cudf/cudf/tests/series/methods/test_reductions.py new file mode 100644 index 00000000000..3b636476651 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_reductions.py @@ -0,0 +1,1041 @@ +# Copyright (c) 2019-2025, NVIDIA CORPORATION. +import re +from concurrent.futures import ThreadPoolExecutor +from decimal import Decimal + +import cupy as cp +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest +from packaging.version import parse + +import cudf +from cudf.core._compat import PANDAS_GE_230 +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal, expect_warning_if + + +@pytest.mark.parametrize("data", [[], [1, 2, 3]]) +def test_series_pandas_methods(data, reduction_methods): + arr = np.array(data) + sr = cudf.Series(arr) + psr = pd.Series(arr) + np.testing.assert_equal( + getattr(sr, reduction_methods)(), getattr(psr, reduction_methods)() + ) + + +def test_series_reductions( + request, reduction_methods, numeric_types_as_str, skipna +): + request.applymarker( + pytest.mark.xfail( + reduction_methods == "quantile", + raises=TypeError, + reason=f"{reduction_methods} doesn't support skipna", + ) + ) + request.applymarker( + pytest.mark.xfail( + reduction_methods == "product" + and skipna + and numeric_types_as_str not in {"float32", "float64"}, + reason=f"{reduction_methods} incorrect with {skipna=}", + ) + ) + rng = np.random.default_rng(seed=0) + arr = rng.random(100) + if np.dtype(numeric_types_as_str).kind in "iu": + arr *= 100 + mask = arr > 10 + else: + mask = arr > 0.5 + + arr = arr.astype(numeric_types_as_str) + if numeric_types_as_str in ("float32", "float64"): + arr[[2, 5, 14, 19, 50, 70]] = np.nan + sr = cudf.Series(arr) + sr[~mask] = None + psr = sr.to_pandas() + psr[~mask] = np.nan + + def call_test(sr, skipna): + fn = getattr(sr, reduction_methods) + if reduction_methods in ["std", "var"]: + return fn(ddof=1, skipna=skipna) + else: + return fn(skipna=skipna) + + expect = call_test(psr, skipna=skipna) + got = call_test(sr, skipna=skipna) + + 
np.testing.assert_approx_equal(expect, got, significant=4) + + +def test_series_reductions_concurrency(reduction_methods): + rng = np.random.default_rng(seed=0) + srs = [cudf.Series(rng.random(100))] + + def call_test(sr): + fn = getattr(sr, reduction_methods) + if reduction_methods in ["std", "var"]: + return fn(ddof=1) + else: + return fn() + + def f(sr): + return call_test(sr + 1) + + with ThreadPoolExecutor(10) as e: + list(e.map(f, srs * 50)) + + +@pytest.mark.parametrize("ddof", range(3)) +def test_series_std(ddof): + rng = np.random.default_rng(seed=0) + arr = rng.random(100) - 0.5 + sr = cudf.Series(arr) + pd = sr.to_pandas() + got = sr.std(ddof=ddof) + expect = pd.std(ddof=ddof) + np.testing.assert_approx_equal(expect, got) + + +def test_series_scale(): + rng = np.random.default_rng(seed=0) + arr = pd.Series(rng.integers(low=-10, high=10, size=100)) + sr = cudf.Series(arr) + + vmin = arr.min() + vmax = arr.max() + scaled = (arr - vmin) / (vmax - vmin) + assert scaled.min() == 0 + assert scaled.max() == 1 + assert_eq(sr.scale(), scaled) + + +def test_exact_quantiles(quantile_interpolation): + arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) + quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] + + df = pd.DataFrame(arr) + gdf_series = cudf.Series(arr) + + q1 = gdf_series.quantile( + quant_values, interpolation=quantile_interpolation, exact=True + ) + + q2 = df.quantile(quant_values, interpolation=quantile_interpolation) + + np.testing.assert_allclose( + q1.to_pandas().values, np.array(q2.values).T.flatten(), rtol=1e-10 + ) + + +def test_exact_quantiles_int(quantile_interpolation): + arr = np.asarray([7, 0, 3, 4, 2, 1, -1, 1, 6]) + quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] + + df = pd.DataFrame(arr) + gdf_series = cudf.Series(arr) + + q1 = gdf_series.quantile( + quant_values, interpolation=quantile_interpolation, exact=True + ) + + q2 = df.quantile(quant_values, interpolation=quantile_interpolation) + + np.testing.assert_allclose( + q1.to_pandas().values, np.array(q2.values).T.flatten(), rtol=1e-10 + ) + + +def test_approx_quantiles(): + arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) + quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] + + gdf_series = cudf.Series(arr) + pdf_series = pd.Series(arr) + + q1 = gdf_series.quantile(quant_values, exact=False) + q2 = pdf_series.quantile(quant_values) + + assert_eq(q1, q2) + + +def test_approx_quantiles_int(): + arr = np.asarray([1, 2, 3]) + quant_values = [0.5] + approx_results = [2] + + gdf_series = cudf.Series(arr) + + q1 = gdf_series.quantile(quant_values, exact=False) + + assert approx_results == q1.to_pandas().values + + +@pytest.mark.parametrize("data", [[], [1, 2, 3, 10, 326497]]) +@pytest.mark.parametrize( + "q", + [ + [], + 0.5, + 1, + 0.234, + [0.345], + [0.243, 0.5, 1], + np.array([0.5, 1]), + cp.array([0.5, 1]), + ], +) +def test_misc_quantiles(data, q): + pdf_series = pd.Series(data, dtype="float64" if len(data) == 0 else None) + gdf_series = cudf.from_pandas(pdf_series) + + expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) + actual = gdf_series.quantile(q) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + {"data": np.random.default_rng(seed=0).normal(-100, 100, 1000)}, + {"data": np.random.default_rng(seed=0).integers(-50, 50, 1000)}, + {"data": (np.zeros(100))}, + {"data": np.repeat(np.nan, 100)}, + {"data": np.array([1.123, 2.343, np.nan, 0.0])}, + { + "data": [5, 10, 53, None, np.nan, None, 12, 43, -423], + "nan_as_null": False, + }, + 
{"data": [1.1032, 2.32, 43.4, 13, -312.0], "index": [0, 4, 3, 19, 6]}, + {"data": [], "dtype": "float64"}, + {"data": [-3]}, + ], +) +@pytest.mark.parametrize("null_flag", [False, True]) +def test_kurtosis_series(data, null_flag, numeric_only): + gs = cudf.Series(**data) + ps = gs.to_pandas() + + if null_flag and len(gs) > 2: + gs.iloc[[0, 2]] = None + ps.iloc[[0, 2]] = None + + got = gs.kurtosis(numeric_only=numeric_only) + expected = ps.kurtosis(numeric_only=numeric_only) + + assert_eq(got, expected) + + got = gs.kurt(numeric_only=numeric_only) + expected = ps.kurt(numeric_only=numeric_only) + + assert_eq(got, expected) + + +@pytest.mark.parametrize("op", ["skew", "kurt"]) +def test_kurt_skew_error(op): + gs = cudf.Series(["ab", "cd"]) + ps = gs.to_pandas() + + assert_exceptions_equal( + getattr(gs, op), + getattr(ps, op), + lfunc_args_and_kwargs=([], {"numeric_only": True}), + rfunc_args_and_kwargs=([], {"numeric_only": True}), + ) + + +@pytest.mark.parametrize( + "data, index, dtype, nan_as_null", + [ + [ + np.random.default_rng(seed=0).normal(-100, 100, 1000), + None, + None, + None, + ], + [ + np.random.default_rng(seed=0).integers(-50, 50, 1000), + None, + None, + None, + ], + [np.zeros(100), None, None, None], + [np.repeat(np.nan, 100), None, None, None], + [np.array([1.123, 2.343, np.nan, 0.0]), None, None, None], + [[5, 10, 53, None, np.nan, None, 12, 43, -423], None, None, False], + [[1.1032, 2.32, 43.4, 13, -312.0], [0, 4, 3, 19, 6], None, None], + [[], None, "float64", None], + [[-3], None, None, None], + ], +) +@pytest.mark.parametrize("null_flag", [False, True]) +def test_skew_series(data, index, dtype, nan_as_null, null_flag, numeric_only): + data = cudf.Series(data, index=index, dtype=dtype, nan_as_null=nan_as_null) + pdata = data.to_pandas() + + if null_flag and len(data) > 2: + data.iloc[[0, 2]] = None + pdata.iloc[[0, 2]] = None + + got = data.skew(numeric_only=numeric_only) + expected = pdata.skew(numeric_only=numeric_only) + + assert_eq(got, expected) + + +@pytest.mark.parametrize("num_na", [0, 50, 100]) +def test_series_median(numeric_types_as_str, num_na): + rng = np.random.default_rng(seed=0) + arr = rng.random(100) + dtype = np.dtype(numeric_types_as_str) + if dtype.kind in "iu": + arr *= 100 + mask = np.arange(100) >= num_na + + arr = arr.astype(dtype) + sr = cudf.Series(arr) + sr[~mask] = None + arr2 = arr[mask] + ps = pd.Series(arr2, dtype=dtype) + + actual = sr.median(skipna=True) + desired = ps.median(skipna=True) + + np.testing.assert_approx_equal(actual, desired) + + # only for float until integer null supported convert to pandas in cudf + # eg. 
pd.Int64Dtype + if dtype.kind == "f": + ps = sr.to_pandas() + actual = sr.median(skipna=False) + desired = ps.median(skipna=False) + np.testing.assert_approx_equal(actual, desired) + + +@pytest.mark.parametrize( + "data", + [ + [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], + [np.nan] * 3, + [1, 5, 3], + [], + ], +) +@pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning") +def test_nans_stats(data, reduction_methods, skipna, request): + request.applymarker( + pytest.mark.xfail( + reduction_methods == "quantile", + raises=TypeError, + reason=f"{reduction_methods} doesn't support skipna", + ) + ) + + psr = pd.Series(data, dtype="float64" if len(data) == 0 else None) + gsr = cudf.from_pandas(psr) + + assert_eq( + getattr(psr, reduction_methods)(skipna=skipna), + getattr(gsr, reduction_methods)(skipna=skipna), + ) + + gsr = cudf.Series( + data, dtype="float64" if len(data) == 0 else None, nan_as_null=False + ) + # Since there is no concept of `nan_as_null` in pandas, + # nulls will be returned in the operations. So only + # testing for `skipna=True` when `nan_as_null=False` + assert_eq( + getattr(psr, reduction_methods)(skipna=True), + getattr(gsr, reduction_methods)(skipna=True), + ) + + +@pytest.mark.parametrize( + "data", + [ + [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], + [np.nan] * 3, + [1, 5, 3], + ], +) +@pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 5, 10]) +def test_min_count_ops(data, request, reduction_methods, skipna, min_count): + request.applymarker( + pytest.mark.xfail( + reduction_methods == "quantile", + raises=TypeError, + reason=f"{reduction_methods} doesn't support skipna", + ) + ) + request.applymarker( + pytest.mark.xfail( + reduction_methods + in { + "skew", + "kurtosis", + "median", + "var", + "std", + "any", + "all", + "max", + "min", + }, + raises=TypeError, + reason=f"{reduction_methods} doesn't support min_count", + ) + ) + psr = pd.Series(data) + gsr = cudf.Series(data, nan_as_null=False) + + assert_eq( + getattr(psr, reduction_methods)(skipna=skipna, min_count=min_count), + getattr(gsr, reduction_methods)(skipna=skipna, min_count=min_count), + ) + + +@pytest.mark.parametrize("q", [2, [1, 2, 3]]) +def test_quantile_range_error(q): + ps = pd.Series([1, 2, 3]) + gs = cudf.from_pandas(ps) + assert_exceptions_equal( + lfunc=ps.quantile, + rfunc=gs.quantile, + lfunc_args_and_kwargs=([q],), + rfunc_args_and_kwargs=([q],), + ) + + +def test_quantile_q_type(): + gs = cudf.Series([1, 2, 3]) + with pytest.raises( + TypeError, + match=re.escape( + "q must be a scalar or array-like, got " + ), + ): + gs.quantile(cudf.DataFrame()) + + +def test_quantile_type_int_float(quantile_interpolation): + data = [1, 3, 4] + psr = pd.Series(data) + gsr = cudf.Series(data) + + expected = psr.quantile(0.5, interpolation=quantile_interpolation) + actual = gsr.quantile(0.5, interpolation=quantile_interpolation) + + assert expected == actual + assert type(expected) is type(actual) + + +@pytest.mark.parametrize("val", [0.9, float("nan")]) +def test_ignore_nans(val): + data = [float("nan"), float("nan"), val] + psr = pd.Series(data) + gsr = cudf.Series(data, nan_as_null=False) + + expected = gsr.quantile(0.9) + result = psr.quantile(0.9) + assert_eq(result, expected) + + +def test_sum(numeric_types_as_str): + data = np.arange(10, dtype=numeric_types_as_str) + sr = cudf.Series(data) + + got = sr.sum() + expect = data.sum() + significant = 4 if numeric_types_as_str == "float32" else 6 + np.testing.assert_approx_equal(expect, got, significant=significant) + 
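[Editor's note: an illustrative aside, not part of the patch. The test_sum above loosens the check to four significant digits for float32 because single-precision accumulation drifts sooner than double precision does. A minimal standalone sketch of the same comparison, assuming only numpy and cudf are installed:

    import numpy as np
    import cudf

    data = np.arange(10, dtype="float32")
    got = cudf.Series(data).sum()
    # float32 sums are compared loosely; float64 tolerates ~6 significant digits
    np.testing.assert_approx_equal(data.sum(), got, significant=4)
]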
+ +@pytest.mark.parametrize( + "middle, expected", + [ + ("there", "HellothereWorld"), + (None, "HelloWorld"), + ], +) +def test_sum_string(middle, expected): + s = cudf.Series(["Hello", middle, "World"]) + + assert s.sum() == expected + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param( + cudf.Decimal64Dtype(6, 3), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(10, 6), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(16, 7), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal32Dtype(6, 3), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal32 format string only supported in pyarrow >=19", + ), + ), + cudf.Decimal128Dtype(20, 7), + ], +) +def test_sum_decimal(dtype): + data = [str(x) for x in np.array([1, 11, 111]) / 100] + + expected = pd.Series([Decimal(x) for x in data]).sum() + got = cudf.Series(data).astype(dtype).sum() + + assert_eq(expected, got) + + +def test_product(numeric_types_as_str): + data = np.arange(10, dtype=numeric_types_as_str) + sr = cudf.Series(data) + + got = sr.product() + expect = pd.Series(data).product() + significant = 4 if numeric_types_as_str == "float32" else 6 + np.testing.assert_approx_equal(expect, got, significant=significant) + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param( + cudf.Decimal64Dtype(6, 2), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(8, 4), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(10, 5), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal32Dtype(6, 2), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal32 format string only supported in pyarrow >=19", + ), + ), + cudf.Decimal128Dtype(20, 5), + ], +) +def test_product_decimal(dtype): + data = [str(x) for x in np.array([1, 11, 111]) / 100] + + expected = pd.Series([Decimal(x) for x in data]).product() + got = cudf.Series(data).astype(dtype).product() + + assert_eq(expected, got) + + +def test_sum_of_squares(numeric_types_as_str): + accuracy_for_dtype = {"float64": 6, "float32": 5} + data = np.arange(10, dtype=numeric_types_as_str) + sr = cudf.Series(data) + df = cudf.DataFrame(sr) + + got = (sr**2).sum() + got_df = (df**2).sum() + expect = (data**2).sum() + + if "int" in numeric_types_as_str: + np.testing.assert_array_almost_equal(expect, got) + np.testing.assert_array_almost_equal(expect, got_df.iloc[0]) + else: + np.testing.assert_approx_equal( + expect, got, significant=accuracy_for_dtype[numeric_types_as_str] + ) + np.testing.assert_approx_equal( + expect, + got_df.iloc[0], + significant=accuracy_for_dtype[numeric_types_as_str], + ) + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param( + cudf.Decimal64Dtype(6, 2), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + 
reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(8, 4), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(10, 5), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + cudf.Decimal128Dtype(20, 7), + pytest.param( + cudf.Decimal32Dtype(6, 2), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal32 format string only supported in pyarrow >=19", + ), + ), + ], +) +def test_sum_of_squares_decimal(dtype): + data = [str(x) for x in np.array([1, 11, 111]) / 100] + + expected = pd.Series([Decimal(x) for x in data]).pow(2).sum() + got = (cudf.Series(data).astype(dtype) ** 2).sum() + + assert_eq(expected, got) + + +def test_min(numeric_types_as_str): + data = np.arange(10, dtype=numeric_types_as_str) + sr = cudf.Series(data) + + got = sr.min() + expect = getattr(np, numeric_types_as_str)(data.min()) + + assert expect == got + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param( + cudf.Decimal64Dtype(6, 3), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(10, 6), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(16, 7), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal32Dtype(6, 3), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal32 format string only supported in pyarrow >=19", + ), + ), + cudf.Decimal128Dtype(20, 7), + ], +) +def test_min_decimal(dtype): + data = [str(x) for x in np.array([1, 11, 111]) / 100] + + expected = pd.Series([Decimal(x) for x in data]).min() + got = cudf.Series(data).astype(dtype).min() + + assert_eq(expected, got) + + +def test_max(numeric_types_as_str): + data = np.arange(10, dtype=numeric_types_as_str) + sr = cudf.Series(data) + + got = sr.max() + expect = getattr(np, numeric_types_as_str)(data.max()) + + assert expect == got + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param( + cudf.Decimal64Dtype(6, 3), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(10, 6), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal64Dtype(16, 7), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal64 format string only supported in pyarrow >=19", + ), + ), + pytest.param( + cudf.Decimal32Dtype(6, 3), + marks=pytest.mark.skipif( + parse(pa.__version__) < parse("19.0.0"), + reason="decimal32 format string only supported in pyarrow >=19", + ), + ), + cudf.Decimal128Dtype(20, 7), + ], +) +def test_max_decimal(dtype): + data = [str(x) for x in np.array([1, 11, 111]) / 100] + + expected = pd.Series([Decimal(x) for x in data]).max() + got = cudf.Series(data).astype(dtype).max() + + assert_eq(expected, got) + + +def 
test_sum_masked(): + data = np.array([1.1, 1.2, np.nan], dtype="float64") + sr = cudf.Series(data, nan_as_null=True) + + got = sr.sum() + expected = np.nansum(data) + np.testing.assert_approx_equal(expected, got) + + +def test_sum_boolean(): + s = cudf.Series(np.arange(100000)) + got = (s > 1).sum() + expect = 99998 + + assert expect == got + + +def test_date_minmax(): + rng = np.random.default_rng(seed=0) + np_data = rng.normal(size=10) + gdf_data = cudf.Series(np_data) + + np_casted = np_data.astype("datetime64[ms]") + gdf_casted = gdf_data.astype("datetime64[ms]") + + np_min = np_casted.min() + gdf_min = gdf_casted.min() + assert np_min == gdf_min + + np_max = np_casted.max() + gdf_max = gdf_casted.max() + assert np_max == gdf_max + + +@pytest.mark.parametrize( + "op", + ["sum", "product", "var", "kurt", "kurtosis", "skew"], +) +def test_datetime_unsupported_reductions(op): + gsr = cudf.Series([1, 2, 3, None], dtype="datetime64[ns]") + psr = gsr.to_pandas() + + assert_exceptions_equal( + lfunc=getattr(psr, op), + rfunc=getattr(gsr, op), + ) + + +@pytest.mark.parametrize("op", ["product", "var", "kurt", "kurtosis", "skew"]) +def test_timedelta_unsupported_reductions(op): + gsr = cudf.Series([1, 2, 3, None], dtype="timedelta64[ns]") + psr = gsr.to_pandas() + + assert_exceptions_equal( + lfunc=getattr(psr, op), + rfunc=getattr(gsr, op), + ) + + +def test_categorical_reductions(request, reduction_methods): + request.applymarker( + pytest.mark.xfail( + reduction_methods in ["quantile", "all", "any"], + reason=f"{reduction_methods} didn't fail", + ) + ) + + gsr = cudf.Series([1, 2, 3, None], dtype="category") + psr = gsr.to_pandas() + + assert_exceptions_equal( + getattr(psr, reduction_methods), getattr(gsr, reduction_methods) + ) + + +@pytest.mark.parametrize( + "data_non_overflow", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +def test_timedelta_reduction_ops( + data_non_overflow, timedelta_types_as_str, reduction_methods +): + if reduction_methods not in ["sum", "mean", "median", "quantile"]: + pytest.skip(f"{reduction_methods} not supported for timedelta") + gsr = cudf.Series(data_non_overflow, dtype=timedelta_types_as_str) + psr = gsr.to_pandas() + + if len(psr) > 0 and psr.isnull().all() and reduction_methods == "median": + with pytest.warns(RuntimeWarning, match="Mean of empty slice"): + expected = getattr(psr, reduction_methods)() + else: + with expect_warning_if( + PANDAS_GE_230 + and reduction_methods == "quantile" + and len(data_non_overflow) == 0 + and timedelta_types_as_str != "timedelta64[ns]" + ): + expected = getattr(psr, reduction_methods)() + actual = getattr(gsr, reduction_methods)() + if pd.isna(expected) and pd.isna(actual): + pass + elif isinstance(expected, pd.Timedelta) and isinstance( + actual, pd.Timedelta + ): + assert ( + expected.round(gsr._column.time_unit).value + == actual.round(gsr._column.time_unit).value + ) + else: + assert_eq(expected, actual) + + +@pytest.mark.parametrize("data", [[1, 2, 3], [], [1, 20, 1000, None]]) +@pytest.mark.parametrize("ddof", [1, 2, 3]) +def test_timedelta_std(data, 
timedelta_types_as_str, ddof): + gsr = cudf.Series(data, dtype=timedelta_types_as_str) + psr = gsr.to_pandas() + + expected = psr.std(ddof=ddof) + actual = gsr.std(ddof=ddof) + + if np.isnat(expected.to_numpy()) and np.isnat(actual.to_numpy()): + assert True + else: + np.testing.assert_allclose( + expected.to_numpy().astype("float64"), + actual.to_numpy().astype("float64"), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.parametrize("op", ["max", "min"]) +@pytest.mark.parametrize( + "data", + [ + [], + [1, 2, 3, 100], + [10, None, 100, None, None], + [None, None, None], + [1231], + ], +) +def test_timedelta_reductions(data, op, timedelta_types_as_str): + sr = cudf.Series(data, dtype=timedelta_types_as_str) + psr = sr.to_pandas() + + actual = getattr(sr, op)() + expected = getattr(psr, op)() + + if np.isnat(expected.to_numpy()) and np.isnat(actual): + assert True + else: + assert_eq(expected.to_numpy(), actual) + + +@pytest.mark.parametrize( + "data", + [ + ["a", "b", "c", "d", "e"], + ["a", "z", ".", '"', "aa", "zz"], + ["aa", "zz"], + ["z", "a", "zz", "aa"], + ["1", "2", "3", "4", "5"], + [""], + ["a"], + ["hello"], + ["small text", "this is a larger text......"], + ["👋🏻", "🔥", "🥇"], + ["This is 💯", "here is a calendar", "📅"], + ["", ".", ";", "[", "]"], + ["\t", ".", "\n", "\n\t", "\t\n"], + ], +) +@pytest.mark.parametrize("op", ["min", "max", "sum"]) +def test_str_reductions_supported(data, op): + psr = pd.Series(data) + sr = cudf.Series(data) + + assert_eq(getattr(psr, op)(), getattr(sr, op)()) + + +def test_str_mean(): + sr = cudf.Series(["a", "b", "c", "d", "e"]) + + with pytest.raises(TypeError): + sr.mean() + + +def test_string_product(): + psr = pd.Series(["1", "2", "3", "4", "5"]) + sr = cudf.Series(["1", "2", "3", "4", "5"]) + + assert_exceptions_equal( + lfunc=psr.product, + rfunc=sr.product, + ) + + +def test_string_var(): + psr = pd.Series(["1", "2", "3", "4", "5"]) + sr = cudf.Series(["1", "2", "3", "4", "5"]) + + assert_exceptions_equal(lfunc=psr.var, rfunc=sr.var) + + +def test_string_std(): + psr = pd.Series(["1", "2", "3", "4", "5"]) + sr = cudf.Series(["1", "2", "3", "4", "5"]) + + assert_exceptions_equal(lfunc=psr.std, rfunc=sr.std) + + +def test_string_reduction_error(): + s = cudf.Series([None, None], dtype="str") + ps = s.to_pandas(nullable=True) + assert_exceptions_equal( + s.any, + ps.any, + lfunc_args_and_kwargs=([], {"skipna": False}), + rfunc_args_and_kwargs=([], {"skipna": False}), + ) + + assert_exceptions_equal( + s.all, + ps.all, + lfunc_args_and_kwargs=([], {"skipna": False}), + rfunc_args_and_kwargs=([], {"skipna": False}), + ) + + +@pytest.mark.parametrize("data", [[1, 2, 3], [], [1, 20, 1000, None]]) +def test_datetime_stats(data, datetime_types_as_str, reduction_methods): + if reduction_methods not in ["mean", "quantile"]: + pytest.skip(f"{reduction_methods} not applicable for test") + gsr = cudf.Series(data, dtype=datetime_types_as_str) + psr = gsr.to_pandas() + + with expect_warning_if( + PANDAS_GE_230 + and reduction_methods == "quantile" + and len(data) == 0 + and datetime_types_as_str != "datetime64[ns]" + ): + expected = getattr(psr, reduction_methods)() + actual = getattr(gsr, reduction_methods)() + + if len(data) == 0: + assert np.isnat(expected.to_numpy()) and np.isnat(actual.to_numpy()) + else: + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + [], + [1, 2, 3, 100], + [10, None, 100, None, None], + [None, None, None], + [1231], + ], +) +def test_datetime_reductions(data, reduction_methods, datetime_types_as_str): + if 
reduction_methods not in ["max", "min", "std", "median"]: + pytest.skip(f"{reduction_methods} not applicable for test") + sr = cudf.Series(data, dtype=datetime_types_as_str) + psr = sr.to_pandas() + + actual = getattr(sr, reduction_methods)() + with expect_warning_if( + psr.size > 0 and psr.isnull().all() and reduction_methods == "median", + RuntimeWarning, + ): + expected = getattr(psr, reduction_methods)() + + if ( + expected is pd.NaT + and actual is pd.NaT + or (np.isnat(expected.to_numpy()) and np.isnat(actual)) + ): + assert True + else: + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/series/methods/test_unique.py b/python/cudf/cudf/tests/series/methods/test_unique.py index 2aa4b78b039..8346388ad78 100644 --- a/python/cudf/cudf/tests/series/methods/test_unique.py +++ b/python/cudf/cudf/tests/series/methods/test_unique.py @@ -36,3 +36,51 @@ def test_datetime_unique(data, nulls): pd.Series(expected).sort_values(ignore_index=True), got.sort_values(ignore_index=True).to_pandas(), ) + + +def test_series_unique(): + rng = np.random.default_rng(seed=0) + size = 100 + arr = rng.integers(low=-1, high=10, size=size) + mask = arr != -1 + sr = cudf.Series(arr) + sr[~mask] = None + assert set(arr[mask]) == set(sr.unique().dropna().to_numpy()) + assert len(set(arr[mask])) == sr.nunique() + + +def test_series_nunique(request, nan_as_null, dropna): + # We remove nulls as opposed to NaNs using the dropna parameter, + # so to test against pandas we replace NaN with another discrete value + request.applymarker( + pytest.mark.xfail( + nan_as_null is None, + reason=f"{nan_as_null=} returns wrong result", + ) + ) + cudf_series = cudf.Series([1, 2, 2, 3, 3], nan_as_null=nan_as_null) + pd_series = pd.Series([1, 2, 2, 3, 3]) + expect = pd_series.nunique(dropna=dropna) + got = cudf_series.nunique(dropna=dropna) + assert expect == got + + cudf_series = cudf.Series( + [1.0, 2.0, 3.0, np.nan, None], nan_as_null=nan_as_null + ) + if nan_as_null is True: + pd_series = pd.Series([1.0, 2.0, 3.0, np.nan, None]) + else: + pd_series = pd.Series([1.0, 2.0, 3.0, -1.0, None]) + + expect = pd_series.nunique(dropna=dropna) + got = cudf_series.nunique(dropna=dropna) + assert expect == got + + cudf_series = cudf.Series([1.0, np.nan, np.nan], nan_as_null=nan_as_null) + if nan_as_null is True: + pd_series = pd.Series([1.0, np.nan, np.nan]) + else: + pd_series = pd.Series([1.0, -1.0, -1.0]) + expect = pd_series.nunique(dropna=dropna) + got = cudf_series.nunique(dropna=dropna) + assert expect == got diff --git a/python/cudf/cudf/tests/series/test_reductions.py b/python/cudf/cudf/tests/series/test_reductions.py deleted file mode 100644 index 1c924444c1e..00000000000 --- a/python/cudf/cudf/tests/series/test_reductions.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
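[Editor's note: an illustrative aside on the nunique tests added above, not part of the patch. For float data pandas folds None into NaN, which is why the test substitutes a discrete value when nan_as_null is False: with dropna=False both nulls collapse to a single distinct value. A minimal pandas-only sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0, np.nan, None])  # None becomes NaN here
    assert s.nunique(dropna=True) == 3
    assert s.nunique(dropna=False) == 4  # NaN counts exactly once
]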
- -import numpy as np -import pandas as pd -import pytest - -from cudf import Series - - -@pytest.mark.parametrize("data", [[], [1, 2, 3]]) -def test_series_pandas_methods(data, reduction_methods): - arr = np.array(data) - sr = Series(arr) - psr = pd.Series(arr) - np.testing.assert_equal( - getattr(sr, reduction_methods)(), getattr(psr, reduction_methods)() - ) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 1bc6fe19d02..b1678889eff 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -12,14 +12,12 @@ from cudf import Series from cudf.core._compat import ( PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_230, PANDAS_VERSION, ) from cudf.testing import assert_eq from cudf.testing._utils import ( DATETIME_TYPES, assert_exceptions_equal, - expect_warning_if, ) @@ -384,60 +382,6 @@ def test_datetime_invalid_ops(): ) -@pytest.mark.parametrize("data", [[1, 2, 3], [], [1, 20, 1000, None]]) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize("stat", ["mean", "quantile"]) -def test_datetime_stats(data, dtype, stat): - gsr = cudf.Series(data, dtype=dtype) - psr = gsr.to_pandas() - - with expect_warning_if( - PANDAS_GE_230 - and stat == "quantile" - and len(data) == 0 - and dtype != "datetime64[ns]" - ): - expected = getattr(psr, stat)() - actual = getattr(gsr, stat)() - - if len(data) == 0: - assert np.isnat(expected.to_numpy()) and np.isnat(actual.to_numpy()) - else: - assert_eq(expected, actual) - - -@pytest.mark.parametrize("op", ["max", "min", "std", "median"]) -@pytest.mark.parametrize( - "data", - [ - [], - [1, 2, 3, 100], - [10, None, 100, None, None], - [None, None, None], - [1231], - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_datetime_reductions(data, op, dtype): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - actual = getattr(sr, op)() - with expect_warning_if( - psr.size > 0 and psr.isnull().all() and op == "median", RuntimeWarning - ): - expected = getattr(psr, op)() - - if ( - expected is pd.NaT - and actual is pd.NaT - or (np.isnat(expected.to_numpy()) and np.isnat(actual)) - ): - assert True - else: - assert_eq(expected, actual) - - def test_datetime_binop_tz_timestamp(op): s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py deleted file mode 100644 index 8b2f5acb3e1..00000000000 --- a/python/cudf/cudf/tests/test_quantiles.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
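[Editor's note: an illustrative aside, not part of the patch. The datetime reduction tests removed from test_datetime.py above (and re-added in the new reductions module earlier in this diff) compare empty or all-null results with np.isnat on both sides because NaT, like NaN, is not equal to itself. A minimal sketch assuming only numpy and pandas:

    import numpy as np
    import pandas as pd

    s = pd.Series([], dtype="datetime64[ns]")
    result = s.min()  # reducing over no values yields NaT
    assert result is pd.NaT
    assert np.datetime64("NaT") != np.datetime64("NaT")  # hence isnat, not ==
    assert np.isnat(np.datetime64("NaT"))
]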
- -import re - -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal - - -def test_single_q(): - q = 0.5 - - pdf = pd.DataFrame({"a": [4, 24, 13, 8, 7]}) - gdf = cudf.from_pandas(pdf) - - pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantile(q, interpolation="nearest", method="table") - - assert_eq(pdf_q, gdf_q, check_index_type=False) - - -def test_with_index(): - q = [0, 0.5, 1] - - pdf = pd.DataFrame({"a": [7, 4, 4, 9, 13]}, index=[0, 4, 3, 2, 7]) - gdf = cudf.from_pandas(pdf) - - pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantile(q, interpolation="nearest", method="table") - - assert_eq(pdf_q, gdf_q, check_index_type=False) - - -def test_with_multiindex(): - q = [0, 0.5, 1] - - pdf = pd.DataFrame( - { - "index_1": [3, 1, 9, 7, 5], - "index_2": [2, 4, 3, 5, 1], - "a": [8, 4, 2, 3, 8], - } - ) - pdf.set_index(["index_1", "index_2"], inplace=True) - - gdf = cudf.from_pandas(pdf) - - pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantile(q, interpolation="nearest", method="table") - - assert_eq(pdf_q, gdf_q, check_index_type=False) - - -@pytest.mark.parametrize("q", [2, [1, 2, 3]]) -def test_quantile_range_error(q): - ps = pd.Series([1, 2, 3]) - gs = cudf.from_pandas(ps) - assert_exceptions_equal( - lfunc=ps.quantile, - rfunc=gs.quantile, - lfunc_args_and_kwargs=([q],), - rfunc_args_and_kwargs=([q],), - ) - - -def test_quantile_q_type(): - gs = cudf.Series([1, 2, 3]) - with pytest.raises( - TypeError, - match=re.escape( - "q must be a scalar or array-like, got " - ), - ): - gs.quantile(cudf.DataFrame()) - - -@pytest.mark.parametrize( - "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] -) -def test_quantile_type_int_float(interpolation): - data = [1, 3, 4] - psr = pd.Series(data) - gsr = cudf.Series(data) - - expected = psr.quantile(0.5, interpolation=interpolation) - actual = gsr.quantile(0.5, interpolation=interpolation) - - assert expected == actual - assert type(expected) is type(actual) - - -@pytest.mark.parametrize("val", [0.9, float("nan")]) -def test_ignore_nans(val): - data = [float("nan"), float("nan"), val] - psr = pd.Series(data) - gsr = cudf.Series(data, nan_as_null=False) - - expected = gsr.quantile(0.9) - result = psr.quantile(0.9) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py deleted file mode 100644 index 144d79ca94a..00000000000 --- a/python/cudf/cudf/tests/test_reductions.py +++ /dev/null @@ -1,512 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
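[Editor's note: an illustrative aside, not part of the patch. Several of the quantile tests deleted here reappear in the new series reductions module earlier in this diff; the "nearest" interpolation they exercise returns an actual element of the data rather than interpolating between neighbors. A pandas-only sketch of that rule:

    import pandas as pd

    pdf = pd.DataFrame({"a": [4, 24, 13, 8, 7]})
    # the median of five values lands exactly on the middle element
    assert pdf.quantile(0.5, interpolation="nearest")["a"] == 8.0
]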
-from decimal import Decimal - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf import Series -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.core.column.column import as_column -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing import _utils as utils, assert_eq -from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if, gen_rand - - -@pytest.fixture(params=NUMERIC_TYPES) -def dtype(request): - return request.param - - -@pytest.fixture(params=[1, 20]) -def nelem(request): - return request.param - - -def test_sum(dtype, nelem): - dtype = cudf.dtype(dtype).type - data = gen_rand(dtype, nelem) - sr = Series(data) - - got = sr.sum() - expect = data.sum() - significant = 4 if dtype == np.float32 else 6 - np.testing.assert_approx_equal(expect, got, significant=significant) - - -def test_sum_string(): - s = Series(["Hello", "there", "World"]) - - got = s.sum() - expected = "HellothereWorld" - - assert got == expected - - s = Series(["Hello", None, "World"]) - - got = s.sum() - expected = "HelloWorld" - - assert got == expected - - -@pytest.mark.parametrize( - "dtype", - [ - pytest.param( - Decimal64Dtype(6, 3), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(10, 6), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(16, 7), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal32Dtype(6, 3), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal32 format string only supported in pyarrow >=19", - ), - ), - Decimal128Dtype(20, 7), - ], -) -def test_sum_decimal(dtype, nelem): - data = [str(x) for x in gen_rand("int64", nelem, seed=0) / 100] - - expected = pd.Series([Decimal(x) for x in data]).sum() - got = cudf.Series(data).astype(dtype).sum() - - assert_eq(expected, got) - - -def test_product(dtype, nelem): - rng = np.random.default_rng(seed=0) - dtype = cudf.dtype(dtype).type - if cudf.dtype(dtype).kind in {"u", "i"}: - data = np.ones(nelem, dtype=dtype) - # Set at most 30 items to [0..2) to keep the value within 2^32 - for _ in range(30): - data[rng.integers(low=0, high=nelem, size=1)] = rng.uniform() * 2 - else: - data = gen_rand(dtype, nelem) - - sr = Series(data) - - got = sr.product() - expect = pd.Series(data).product() - significant = 4 if dtype == np.float32 else 6 - np.testing.assert_approx_equal(expect, got, significant=significant) - - -@pytest.mark.parametrize( - "dtype", - [ - pytest.param( - Decimal64Dtype(6, 2), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(8, 4), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(10, 5), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal32Dtype(6, 2), - marks=pytest.mark.skipif( - 
pa._generated_version.version_tuple[0] < 19, - reason="decimal32 format string only supported in pyarrow >=19", - ), - ), - Decimal128Dtype(20, 5), - ], -) -def test_product_decimal(dtype): - data = [str(x) for x in gen_rand("int8", 3) / 10] - - expected = pd.Series([Decimal(x) for x in data]).product() - got = cudf.Series(data).astype(dtype).product() - - assert_eq(expected, got) - - -accuracy_for_dtype = {np.float64: 6, np.float32: 5} - - -def test_sum_of_squares(dtype, nelem): - dtype = cudf.dtype(dtype).type - data = gen_rand(dtype, nelem) - sr = Series(data) - df = cudf.DataFrame(sr) - - got = (sr**2).sum() - got_df = (df**2).sum() - expect = (data**2).sum() - - if cudf.dtype(dtype).kind in {"u", "i"}: - np.testing.assert_array_almost_equal(expect, got) - np.testing.assert_array_almost_equal(expect, got_df.iloc[0]) - else: - np.testing.assert_approx_equal( - expect, got, significant=accuracy_for_dtype[dtype] - ) - np.testing.assert_approx_equal( - expect, got_df.iloc[0], significant=accuracy_for_dtype[dtype] - ) - - -@pytest.mark.parametrize( - "dtype", - [ - pytest.param( - Decimal64Dtype(6, 2), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(8, 4), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(10, 5), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - Decimal128Dtype(20, 7), - pytest.param( - Decimal32Dtype(6, 2), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal32 format string only supported in pyarrow >=19", - ), - ), - ], -) -def test_sum_of_squares_decimal(dtype): - data = [str(x) for x in gen_rand("int8", 3) / 10] - - expected = pd.Series([Decimal(x) for x in data]).pow(2).sum() - got = (cudf.Series(data).astype(dtype) ** 2).sum() - - assert_eq(expected, got) - - -def test_min(dtype, nelem): - dtype = cudf.dtype(dtype).type - data = gen_rand(dtype, nelem) - sr = Series(data) - - got = sr.min() - expect = dtype(data.min()) - - assert expect == got - - -@pytest.mark.parametrize( - "dtype", - [ - pytest.param( - Decimal64Dtype(6, 3), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(10, 6), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(16, 7), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal32Dtype(6, 3), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal32 format string only supported in pyarrow >=19", - ), - ), - Decimal128Dtype(20, 7), - ], -) -def test_min_decimal(dtype, nelem): - data = [str(x) for x in gen_rand("int64", nelem) / 100] - - expected = pd.Series([Decimal(x) for x in data]).min() - got = cudf.Series(data).astype(dtype).min() - - assert_eq(expected, got) - - -def test_max(dtype, nelem): - dtype = cudf.dtype(dtype).type - data = gen_rand(dtype, nelem) - sr = Series(data) - - got = sr.max() - expect = 
dtype(data.max()) - - assert expect == got - - -@pytest.mark.parametrize( - "dtype", - [ - pytest.param( - Decimal64Dtype(6, 3), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(10, 6), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal64Dtype(16, 7), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal64 format string only supported in pyarrow >=19", - ), - ), - pytest.param( - Decimal32Dtype(6, 3), - marks=pytest.mark.skipif( - pa._generated_version.version_tuple[0] < 19, - reason="decimal32 format string only supported in pyarrow >=19", - ), - ), - Decimal128Dtype(20, 7), - ], -) -def test_max_decimal(dtype, nelem): - data = [str(x) for x in gen_rand("int64", nelem) / 100] - - expected = pd.Series([Decimal(x) for x in data]).max() - got = cudf.Series(data).astype(dtype).max() - - assert_eq(expected, got) - - -def test_sum_masked(nelem): - dtype = np.float64 - data = gen_rand(dtype, nelem) - - mask = utils.random_bitmask(nelem) - bitmask = utils.expand_bits_to_bytes(mask)[:nelem] - - sr = Series._from_column(as_column(data).set_mask(mask)) - - got = sr.sum() - res_mask = np.asarray(bitmask, dtype=np.bool_)[: data.size] - expect = data[res_mask].sum() - - significant = 4 if dtype == np.float32 else 6 - np.testing.assert_approx_equal(expect, got, significant=significant) - - -def test_sum_boolean(): - s = Series(np.arange(100000)) - got = (s > 1).sum() - expect = 99998 - - assert expect == got - - -def test_date_minmax(): - rng = np.random.default_rng(seed=0) - np_data = rng.normal(size=10**3) - gdf_data = Series(np_data) - - np_casted = np_data.astype("datetime64[ms]") - gdf_casted = gdf_data.astype("datetime64[ms]") - - np_min = np_casted.min() - gdf_min = gdf_casted.min() - assert np_min == gdf_min - - np_max = np_casted.max() - gdf_max = gdf_casted.max() - assert np_max == gdf_max - - -@pytest.mark.parametrize( - "op", - ["sum", "product", "var", "kurt", "kurtosis", "skew"], -) -def test_datetime_unsupported_reductions(op): - gsr = cudf.Series([1, 2, 3, None], dtype="datetime64[ns]") - psr = gsr.to_pandas() - - utils.assert_exceptions_equal( - lfunc=getattr(psr, op), - rfunc=getattr(gsr, op), - ) - - -@pytest.mark.parametrize("op", ["product", "var", "kurt", "kurtosis", "skew"]) -def test_timedelta_unsupported_reductions(op): - gsr = cudf.Series([1, 2, 3, None], dtype="timedelta64[ns]") - psr = gsr.to_pandas() - - utils.assert_exceptions_equal( - lfunc=getattr(psr, op), - rfunc=getattr(gsr, op), - ) - - -@pytest.mark.parametrize("op", ["sum", "product", "std", "var"]) -def test_categorical_reductions(op): - gsr = cudf.Series([1, 2, 3, None], dtype="category") - psr = gsr.to_pandas() - - utils.assert_exceptions_equal(getattr(psr, op), getattr(gsr, op)) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": [10, 11, 12]}, - {"a": [1, 0, 3], "b": [10, 11, 12]}, - {"a": [1, 2, 3], "b": [10, 11, None]}, - { - "a": [], - }, - {}, - ], -) -@pytest.mark.parametrize("op", ["all", "any"]) -def test_any_all_axis_none(data, op): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expected = getattr(pdf, op)(axis=None) - actual = getattr(gdf, op)(axis=None) - - assert expected == actual - - -@pytest.mark.parametrize( - "op", - [ - "sum", - "product", - "std", - "var", - "kurt", - 
"kurtosis", - "skew", - "min", - "max", - "mean", - "median", - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warning not given on older versions of pandas", -) -def test_reductions_axis_none_warning(op): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]}) - pdf = df.to_pandas() - with expect_warning_if( - op in {"sum", "product", "std", "var"}, - FutureWarning, - ): - actual = getattr(df, op)(axis=None) - with expect_warning_if( - op in {"sum", "product", "std", "var"}, - FutureWarning, - ): - expected = getattr(pdf, op)(axis=None) - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "op", - [ - "sum", - "product", - "std", - "var", - "kurt", - "kurtosis", - "skew", - "min", - "max", - "mean", - "median", - ], -) -def test_dataframe_reduction_no_args(op): - df = cudf.DataFrame({"a": range(10), "b": range(10)}) - pdf = df.to_pandas() - result = getattr(df, op)() - expected = getattr(pdf, op)() - assert_eq(result, expected) - - -def test_reduction_column_multiindex(): - idx = cudf.MultiIndex.from_tuples( - [("a", 1), ("a", 2)], names=["foo", "bar"] - ) - df = cudf.DataFrame(np.array([[1, 3], [2, 4]]), columns=idx) - result = df.mean() - expected = df.to_pandas().mean() - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")] -) -def test_dataframe_axis_0_preserve_column_type_in_index(columns): - pd_df = pd.DataFrame([[1, 2]], columns=columns) - cudf_df = cudf.DataFrame.from_pandas(pd_df) - result = cudf_df.sum(axis=0) - expected = pd_df.sum(axis=0) - assert_eq(result, expected, check_index_type=True) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py deleted file mode 100644 index cb8349e706a..00000000000 --- a/python/cudf/cudf/tests/test_stats.py +++ /dev/null @@ -1,664 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
- -from concurrent.futures import ThreadPoolExecutor - -import cupy as cp -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.api.extensions import no_default -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.datasets import randomdata -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -@pytest.fixture(params=[np.int32, np.uint32, np.float32, np.float64]) -def dtype(request): - return request.param - - -@pytest.fixture(params=["min", "max", "sum", "mean", "var", "std"]) -def method(request): - return request.param - - -@pytest.fixture(params=["linear", "lower", "higher", "midpoint", "nearest"]) -def int_method(request): - return request.param - - -@pytest.mark.parametrize("skipna", [True, False]) -def test_series_reductions(method, dtype, skipna): - rng = np.random.default_rng(seed=0) - arr = rng.random(100) - if np.dtype(dtype).kind in "iu": - arr *= 100 - mask = arr > 10 - else: - mask = arr > 0.5 - - arr = arr.astype(dtype) - if dtype in (np.float32, np.float64): - arr[[2, 5, 14, 19, 50, 70]] = np.nan - sr = cudf.Series(arr) - sr[~mask] = None - psr = sr.to_pandas() - psr[~mask] = np.nan - - def call_test(sr, skipna): - fn = getattr(sr, method) - if method in ["std", "var"]: - return fn(ddof=1, skipna=skipna) - else: - return fn(skipna=skipna) - - expect, got = call_test(psr, skipna=skipna), call_test(sr, skipna=skipna) - - np.testing.assert_approx_equal(expect, got) - - -def test_series_reductions_concurrency(method): - rng = np.random.default_rng(seed=0) - srs = [cudf.Series(rng.random(10000))] - - def call_test(sr): - fn = getattr(sr, method) - if method in ["std", "var"]: - return fn(ddof=1) - else: - return fn() - - def f(sr): - return call_test(sr + 1) - - with ThreadPoolExecutor(10) as e: - list(e.map(f, srs * 50)) - - -@pytest.mark.parametrize("ddof", range(3)) -def test_series_std(ddof): - rng = np.random.default_rng(seed=0) - arr = rng.random(100) - 0.5 - sr = cudf.Series(arr) - pd = sr.to_pandas() - got = sr.std(ddof=ddof) - expect = pd.std(ddof=ddof) - np.testing.assert_approx_equal(expect, got) - - -def test_series_unique(): - rng = np.random.default_rng(seed=0) - size = 100 - arr = rng.integers(low=-1, high=10, size=size) - mask = arr != -1 - sr = cudf.Series(arr) - sr[~mask] = None - assert set(arr[mask]) == set(sr.unique().dropna().to_numpy()) - assert len(set(arr[mask])) == sr.nunique() - - -@pytest.mark.parametrize("nan_as_null", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -def test_series_nunique(nan_as_null, dropna): - # We remove nulls as opposed to NaNs using the dropna parameter, - # so to test against pandas we replace NaN with another discrete value - cudf_series = cudf.Series([1, 2, 2, 3, 3], nan_as_null=nan_as_null) - pd_series = pd.Series([1, 2, 2, 3, 3]) - expect = pd_series.nunique(dropna=dropna) - got = cudf_series.nunique(dropna=dropna) - assert expect == got - - cudf_series = cudf.Series( - [1.0, 2.0, 3.0, np.nan, None], nan_as_null=nan_as_null - ) - if nan_as_null is True: - pd_series = pd.Series([1.0, 2.0, 3.0, np.nan, None]) - else: - pd_series = pd.Series([1.0, 2.0, 3.0, -1.0, None]) - - expect = pd_series.nunique(dropna=dropna) - got = cudf_series.nunique(dropna=dropna) - assert expect == got - - cudf_series = cudf.Series([1.0, np.nan, np.nan], nan_as_null=nan_as_null) - if nan_as_null is True: - pd_series = pd.Series([1.0, np.nan, np.nan]) - else: - pd_series = 
pd.Series([1.0, -1.0, -1.0]) - expect = pd_series.nunique(dropna=dropna) - got = cudf_series.nunique(dropna=dropna) - assert expect == got - - -def test_series_scale(): - rng = np.random.default_rng(seed=0) - arr = pd.Series(rng.integers(low=-10, high=10, size=100)) - sr = cudf.Series(arr) - - vmin = arr.min() - vmax = arr.max() - scaled = (arr - vmin) / (vmax - vmin) - assert scaled.min() == 0 - assert scaled.max() == 1 - assert_eq(sr.scale(), scaled) - - -def test_exact_quantiles(int_method): - arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) - quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] - - df = pd.DataFrame(arr) - gdf_series = cudf.Series(arr) - - q1 = gdf_series.quantile( - quant_values, interpolation=int_method, exact=True - ) - - q2 = df.quantile(quant_values, interpolation=int_method) - - np.testing.assert_allclose( - q1.to_pandas().values, np.array(q2.values).T.flatten(), rtol=1e-10 - ) - - -def test_exact_quantiles_int(int_method): - arr = np.asarray([7, 0, 3, 4, 2, 1, -1, 1, 6]) - quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] - - df = pd.DataFrame(arr) - gdf_series = cudf.Series(arr) - - q1 = gdf_series.quantile( - quant_values, interpolation=int_method, exact=True - ) - - q2 = df.quantile(quant_values, interpolation=int_method) - - np.testing.assert_allclose( - q1.to_pandas().values, np.array(q2.values).T.flatten(), rtol=1e-10 - ) - - -def test_approx_quantiles(): - arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) - quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] - - gdf_series = cudf.Series(arr) - pdf_series = pd.Series(arr) - - q1 = gdf_series.quantile(quant_values, exact=False) - q2 = pdf_series.quantile(quant_values) - - assert_eq(q1, q2) - - -def test_approx_quantiles_int(): - arr = np.asarray([1, 2, 3]) - quant_values = [0.5] - approx_results = [2] - - gdf_series = cudf.Series(arr) - - q1 = gdf_series.quantile(quant_values, exact=False) - - assert approx_results == q1.to_pandas().values - - -@pytest.mark.parametrize("data", [[], [1, 2, 3, 10, 326497]]) -@pytest.mark.parametrize( - "q", - [ - [], - 0.5, - 1, - 0.234, - [0.345], - [0.243, 0.5, 1], - np.array([0.5, 1]), - cp.array([0.5, 1]), - ], -) -def test_misc_quantiles(data, q): - pdf_series = pd.Series(data, dtype="float64" if len(data) == 0 else None) - gdf_series = cudf.from_pandas(pdf_series) - - expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) - actual = gdf_series.quantile(q) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"data": np.random.default_rng(seed=0).normal(-100, 100, 1000)}, - {"data": np.random.default_rng(seed=0).integers(-50, 50, 1000)}, - {"data": (np.zeros(100))}, - {"data": np.repeat(np.nan, 100)}, - {"data": np.array([1.123, 2.343, np.nan, 0.0])}, - { - "data": [5, 10, 53, None, np.nan, None, 12, 43, -423], - "nan_as_null": False, - }, - {"data": [1.1032, 2.32, 43.4, 13, -312.0], "index": [0, 4, 3, 19, 6]}, - {"data": [], "dtype": "float64"}, - {"data": [-3]}, - ], -) -@pytest.mark.parametrize("null_flag", [False, True]) -@pytest.mark.parametrize("numeric_only", [False, True]) -def test_kurtosis_series(data, null_flag, numeric_only): - gs = cudf.Series(**data) - ps = gs.to_pandas() - - if null_flag and len(gs) > 2: - gs.iloc[[0, 2]] = None - ps.iloc[[0, 2]] = None - - got = gs.kurtosis(numeric_only=numeric_only) - expected = ps.kurtosis(numeric_only=numeric_only) - - assert_eq(got, expected) - - got = gs.kurt(numeric_only=numeric_only) - expected = ps.kurt(numeric_only=numeric_only) - - assert_eq(got, 
expected) - - -@pytest.mark.parametrize("op", ["skew", "kurt"]) -def test_kurt_skew_error(op): - gs = cudf.Series(["ab", "cd"]) - ps = gs.to_pandas() - - assert_exceptions_equal( - getattr(gs, op), - getattr(ps, op), - lfunc_args_and_kwargs=([], {"numeric_only": True}), - rfunc_args_and_kwargs=([], {"numeric_only": True}), - ) - - -@pytest.mark.parametrize( - "data, index, dtype, nan_as_null", - [ - [ - np.random.default_rng(seed=0).normal(-100, 100, 1000), - None, - None, - None, - ], - [ - np.random.default_rng(seed=0).integers(-50, 50, 1000), - None, - None, - None, - ], - [np.zeros(100), None, None, None], - [np.repeat(np.nan, 100), None, None, None], - [np.array([1.123, 2.343, np.nan, 0.0]), None, None, None], - [[5, 10, 53, None, np.nan, None, 12, 43, -423], None, None, False], - [[1.1032, 2.32, 43.4, 13, -312.0], [0, 4, 3, 19, 6], None, None], - [[], None, "float64", None], - [[-3], None, None, None], - ], -) -@pytest.mark.parametrize("null_flag", [False, True]) -@pytest.mark.parametrize("numeric_only", [False, True]) -def test_skew_series(data, index, dtype, nan_as_null, null_flag, numeric_only): - data = cudf.Series(data, index=index, dtype=dtype, nan_as_null=nan_as_null) - pdata = data.to_pandas() - - if null_flag and len(data) > 2: - data.iloc[[0, 2]] = None - pdata.iloc[[0, 2]] = None - - got = data.skew(numeric_only=numeric_only) - expected = pdata.skew(numeric_only=numeric_only) - - assert_eq(got, expected) - - -@pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100]) -def test_series_median(dtype, num_na): - rng = np.random.default_rng(seed=0) - arr = rng.random(100) - dtype = np.dtype(dtype) - if dtype.kind in "iu": - arr *= 100 - mask = np.arange(100) >= num_na - - arr = arr.astype(dtype) - sr = cudf.Series(arr) - sr[~mask] = None - arr2 = arr[mask] - ps = pd.Series(arr2, dtype=dtype) - - actual = sr.median(skipna=True) - desired = ps.median(skipna=True) - - np.testing.assert_approx_equal(actual, desired) - - # only for float until integer null supported convert to pandas in cudf - # eg. 
pd.Int64Dtype - if dtype.kind == "f": - ps = sr.to_pandas() - actual = sr.median(skipna=False) - desired = ps.median(skipna=False) - np.testing.assert_approx_equal(actual, desired) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - np.random.default_rng(seed=0).normal(-100, 100, 1000), - np.random.default_rng(seed=0).integers(-50, 50, 1000), - np.zeros(100), - np.array([1.123, 2.343, np.nan, 0.0]), - np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), - np.array([], dtype="float64"), - np.array([-3]), - ], -) -@pytest.mark.parametrize("periods", range(-5, 5)) -@pytest.mark.parametrize( - "fill_method", ["ffill", "bfill", "pad", "backfill", no_default, None] -) -def test_series_pct_change(data, periods, fill_method): - cs = cudf.Series(data) - ps = cs.to_pandas() - - if np.abs(periods) <= len(cs): - with expect_warning_if(fill_method not in (no_default, None)): - got = cs.pct_change(periods=periods, fill_method=fill_method) - with expect_warning_if( - ( - fill_method not in (no_default, None) - or (fill_method is not None and ps.isna().any()) - ) - ): - expected = ps.pct_change(periods=periods, fill_method=fill_method) - np.testing.assert_array_almost_equal( - got.to_numpy(na_value=np.nan), expected - ) - - -@pytest.mark.parametrize( - "data1", - [ - np.random.default_rng(seed=0).normal(-100, 100, 1000), - np.random.default_rng(seed=0).integers(-50, 50, 1000), - np.zeros(100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - pa.array([5, 10, 53, None, np.nan, None]), - pd.Series([1.1, 2.32, 43.4], index=[0, 4, 3]), - np.array([], dtype="float64"), - np.array([-3]), - ], -) -@pytest.mark.parametrize( - "data2", - [ - np.random.default_rng(seed=0).normal(-100, 100, 1000), - np.random.default_rng(seed=0).integers(-50, 50, 1000), - np.zeros(100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - pd.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), - np.array([5]), - ], -) -def test_cov1d(data1, data2): - gs1 = cudf.Series(data1) - gs2 = cudf.Series(data2) - - ps1 = gs1.to_pandas() - ps2 = gs2.to_pandas() - - got = gs1.cov(gs2) - ps1_align, ps2_align = ps1.align(ps2, join="inner") - with expect_warning_if( - (len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0) - or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0), - RuntimeWarning, - ): - expected = ps1.cov(ps2) - np.testing.assert_approx_equal(got, expected, significant=8) - - -@pytest.mark.parametrize( - "data1", - [ - np.random.default_rng(seed=0).normal(-100, 100, 1000), - np.random.default_rng(seed=0).integers(-50, 50, 1000), - np.zeros(100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - pa.array([5, 10, 53, None, np.nan, None]), - pd.Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), - np.array([], dtype="float64"), - np.array([-3]), - ], -) -@pytest.mark.parametrize( - "data2", - [ - np.random.default_rng(seed=0).normal(-100, 100, 1000), - np.random.default_rng(seed=0).integers(-50, 50, 1000), - np.zeros(100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - pd.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), - np.array([5]), - ], -) -@pytest.mark.parametrize("method", ["spearman", "pearson"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warnings missing on older pandas (scipy version seems unrelated?)", -) -def test_corr1d(data1, data2, method): - if method == 
"spearman": - # Pandas uses scipy.stats.spearmanr code-path - pytest.importorskip("scipy") - - gs1 = cudf.Series(data1) - gs2 = cudf.Series(data2) - - ps1 = gs1.to_pandas() - ps2 = gs2.to_pandas() - - got = gs1.corr(gs2, method) - - ps1_align, ps2_align = ps1.align(ps2, join="inner") - - is_singular = ( - len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0 - ) or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0) - is_identical = ( - len(ps1_align.dropna().unique()) == 1 and len(ps2_align.dropna()) > 0 - ) or ( - len(ps2_align.dropna().unique()) == 1 and len(ps1_align.dropna()) > 0 - ) - - # Pearson correlation leads to division by 0 when either sample size is 1. - # Spearman allows for size 1 samples, but will error if all data in a - # sample is identical since the covariance is zero and so the correlation - # coefficient is not defined. - cond = ((is_singular or is_identical) and method == "pearson") or ( - is_identical and not is_singular and method == "spearman" - ) - if method == "spearman": - # SciPy has shuffled around the warning it throws a couple of times. - # It's not worth the effort of conditionally importing the appropriate - # warning based on the scipy version, just catching a base Warning is - # good enough validation. - expected_warning = Warning - elif method == "pearson": - expected_warning = RuntimeWarning - - with expect_warning_if(cond, expected_warning): - expected = ps1.corr(ps2, method) - np.testing.assert_approx_equal(got, expected, significant=8) - - -@pytest.mark.parametrize("method", ["spearman", "pearson"]) -def test_df_corr(method): - gdf = randomdata(100, {str(x): float for x in range(50)}) - pdf = gdf.to_pandas() - got = gdf.corr(method) - expected = pdf.corr(method) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], - [np.nan] * 3, - [1, 5, 3], - [], - ], -) -@pytest.mark.parametrize( - "ops", - [ - "mean", - "min", - "max", - "sum", - "product", - "var", - "std", - "prod", - "kurtosis", - "skew", - "any", - "all", - "cummin", - "cummax", - "cumsum", - "cumprod", - ], -) -@pytest.mark.parametrize("skipna", [True, False]) -def test_nans_stats(data, ops, skipna): - psr = pd.Series(data, dtype="float64" if len(data) == 0 else None) - gsr = cudf.from_pandas(psr) - - assert_eq( - getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) - ) - - gsr = cudf.Series( - data, dtype="float64" if len(data) == 0 else None, nan_as_null=False - ) - # Since there is no concept of `nan_as_null` in pandas, - # nulls will be returned in the operations. 
So only - # testing for `skipna=True` when `nan_as_null=False` - assert_eq(getattr(psr, ops)(skipna=True), getattr(gsr, ops)(skipna=True)) - - -@pytest.mark.parametrize( - "data", - [ - [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], - [np.nan] * 3, - [1, 5, 3], - ], -) -@pytest.mark.parametrize("ops", ["sum", "product", "prod"]) -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 5, 10]) -def test_min_count_ops(data, ops, skipna, min_count): - psr = pd.Series(data) - gsr = cudf.Series(data, nan_as_null=False) - - assert_eq( - getattr(psr, ops)(skipna=skipna, min_count=min_count), - getattr(gsr, ops)(skipna=skipna, min_count=min_count), - ) - - -@pytest.mark.parametrize( - "data1", - [ - [1, 2, 3, 4], - [10, 1, 3, 5], - ], -) -@pytest.mark.parametrize( - "data2", - [ - [1, 2, 3, 4], - [10, 1, 3, 5], - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_cov_corr_datetime_timedelta(data1, data2, dtype): - gsr1 = cudf.Series(data1, dtype=dtype) - gsr2 = cudf.Series(data2, dtype=dtype) - psr1 = gsr1.to_pandas() - psr2 = gsr2.to_pandas() - - assert_eq(psr1.corr(psr2), gsr1.corr(gsr2)) - assert_eq(psr1.cov(psr2), gsr1.cov(gsr2)) - - -@pytest.mark.parametrize("null_flag", [False, True]) -@pytest.mark.parametrize("numeric_only", [False, True]) -def test_kurtosis_df(null_flag, numeric_only): - data = randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ) - if not numeric_only: - data = data.select_dtypes(include="number") - pdata = data.to_pandas() - - if null_flag and len(data) > 2: - data.iloc[[0, 2]] = None - pdata.iloc[[0, 2]] = None - - got = data.kurtosis(numeric_only=numeric_only) - got = got if np.isscalar(got) else got.to_numpy() - - expected = pdata.kurtosis(numeric_only=numeric_only) - np.testing.assert_array_almost_equal(got, expected) - - got = data.kurt(numeric_only=numeric_only) - got = got if np.isscalar(got) else got.to_numpy() - - expected = pdata.kurt(numeric_only=numeric_only) - np.testing.assert_array_almost_equal(got, expected) - - -@pytest.mark.parametrize("null_flag", [False, True]) -@pytest.mark.parametrize("numeric_only", [False, True]) -def test_skew_df(null_flag, numeric_only): - data = randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ) - if not numeric_only: - data = data.select_dtypes(include="number") - pdata = data.to_pandas() - - if null_flag and len(data) > 2: - data.iloc[[0, 2]] = None - pdata.iloc[[0, 2]] = None - - got = data.skew(numeric_only=numeric_only) - expected = pdata.skew(numeric_only=numeric_only) - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 4b9e47ed275..f04c636757f 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -633,112 +633,6 @@ def test_string_int_to_ipv4_dtype_fail(dtype): gsr._column.int2ip() -@pytest.mark.parametrize( - "data", - [ - ["a", "b", "c", "d", "e"], - ["a", "z", ".", '"', "aa", "zz"], - ["aa", "zz"], - ["z", "a", "zz", "aa"], - ["1", "2", "3", "4", "5"], - [""], - ["a"], - ["hello"], - ["small text", "this is a larger text......"], - ["👋🏻", "🔥", "🥇"], - ["This is 💯", "here is a calendar", "📅"], - ["", ".", ";", "[", "]"], - ["\t", ".", "\n", "\n\t", 
"\t\n"], - ], -) -def test_str_min(data): - psr = pd.Series(data) - sr = cudf.Series(data) - - assert_eq(psr.min(), sr.min()) - - -@pytest.mark.parametrize( - "data", - [ - ["a", "b", "c", "d", "e"], - ["a", "z", ".", '"', "aa", "zz"], - ["aa", "zz"], - ["z", "a", "zz", "aa"], - ["1", "2", "3", "4", "5"], - [""], - ["a"], - ["hello"], - ["small text", "this is a larger text......"], - ["👋🏻", "🔥", "🥇"], - ["This is 💯", "here is a calendar", "📅"], - ["", ".", ";", "[", "]"], - ["\t", ".", "\n", "\n\t", "\t\n"], - ], -) -def test_str_max(data): - psr = pd.Series(data) - sr = cudf.Series(data) - - assert_eq(psr.max(), sr.max()) - - -@pytest.mark.parametrize( - "data", - [ - ["a", "b", "c", "d", "e"], - ["a", "z", ".", '"', "aa", "zz"], - ["aa", "zz"], - ["z", "a", "zz", "aa"], - ["1", "2", "3", "4", "5"], - [""], - ["a"], - ["hello"], - ["small text", "this is a larger text......"], - ["👋🏻", "🔥", "🥇"], - ["This is 💯", "here is a calendar", "📅"], - ["", ".", ";", "[", "]"], - ["\t", ".", "\n", "\n\t", "\t\n"], - ], -) -def test_str_sum(data): - psr = pd.Series(data) - sr = cudf.Series(data) - - assert_eq(psr.sum(), sr.sum()) - - -def test_str_mean(): - sr = cudf.Series(["a", "b", "c", "d", "e"]) - - with pytest.raises(TypeError): - sr.mean() - - -def test_string_product(): - psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = cudf.Series(["1", "2", "3", "4", "5"]) - - assert_exceptions_equal( - lfunc=psr.product, - rfunc=sr.product, - ) - - -def test_string_var(): - psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = cudf.Series(["1", "2", "3", "4", "5"]) - - assert_exceptions_equal(lfunc=psr.var, rfunc=sr.var) - - -def test_string_std(): - psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = cudf.Series(["1", "2", "3", "4", "5"]) - - assert_exceptions_equal(lfunc=psr.std, rfunc=sr.std) - - def test_string_slice_with_mask(): actual = cudf.Series(["hi", "hello", None]) expected = actual[0:3] @@ -748,21 +642,3 @@ def test_string_slice_with_mask(): assert_eq(actual._column.null_count, expected._column.null_count) assert_eq(actual, expected) - - -def test_string_reduction_error(): - s = cudf.Series([None, None], dtype="str") - ps = s.to_pandas(nullable=True) - assert_exceptions_equal( - s.any, - ps.any, - lfunc_args_and_kwargs=([], {"skipna": False}), - rfunc_args_and_kwargs=([], {"skipna": False}), - ) - - assert_exceptions_equal( - s.all, - ps.all, - lfunc_args_and_kwargs=([], {"skipna": False}), - rfunc_args_and_kwargs=([], {"skipna": False}), - ) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 28741b9f592..833035858be 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -9,9 +9,8 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_230 from cudf.testing import _utils as utils, assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if +from cudf.testing._utils import assert_exceptions_equal @pytest.fixture( @@ -335,38 +334,6 @@ def test_timedelta_series_mod_with_scalar_zero(reverse): assert_eq(expected, actual) -@pytest.mark.parametrize("reduction_op", ["sum", "mean", "median", "quantile"]) -def test_timedelta_reduction_ops( - data_non_overflow, timedelta_dtype, reduction_op -): - gsr = cudf.Series(data_non_overflow, dtype=timedelta_dtype) - psr = gsr.to_pandas() - - if len(psr) > 0 and psr.isnull().all() and reduction_op == "median": - with pytest.warns(RuntimeWarning, match="Mean of empty slice"): - expected = getattr(psr, reduction_op)() - else: - 
with expect_warning_if( - PANDAS_GE_230 - and reduction_op == "quantile" - and len(data_non_overflow) == 0 - and timedelta_dtype != "timedelta64[ns]" - ): - expected = getattr(psr, reduction_op)() - actual = getattr(gsr, reduction_op)() - if pd.isna(expected) and pd.isna(actual): - pass - elif isinstance(expected, pd.Timedelta) and isinstance( - actual, pd.Timedelta - ): - assert ( - expected.round(gsr._column.time_unit).value - == actual.round(gsr._column.time_unit).value - ) - else: - assert_eq(expected, actual) - - @pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) def test_timedelta_index_datetime_index_ops( data_non_overflow, datetime_dtype, timedelta_dtype @@ -618,50 +585,6 @@ def test_timedelta_invalid_ops(): ) -@pytest.mark.parametrize("data", [[1, 2, 3], [], [1, 20, 1000, None]]) -@pytest.mark.parametrize("ddof", [1, 2, 3]) -def test_timedelta_std(data, timedelta_dtype, ddof): - gsr = cudf.Series(data, dtype=timedelta_dtype) - psr = gsr.to_pandas() - - expected = psr.std(ddof=ddof) - actual = gsr.std(ddof=ddof) - - if np.isnat(expected.to_numpy()) and np.isnat(actual.to_numpy()): - assert True - else: - np.testing.assert_allclose( - expected.to_numpy().astype("float64"), - actual.to_numpy().astype("float64"), - rtol=1e-5, - atol=0, - ) - - -@pytest.mark.parametrize("op", ["max", "min"]) -@pytest.mark.parametrize( - "data", - [ - [], - [1, 2, 3, 100], - [10, None, 100, None, None], - [None, None, None], - [1231], - ], -) -def test_timedelta_reductions(data, op, timedelta_dtype): - sr = cudf.Series(data, dtype=timedelta_dtype) - psr = sr.to_pandas() - - actual = getattr(sr, op)() - expected = getattr(psr, op)() - - if np.isnat(expected.to_numpy()) and np.isnat(actual): - assert True - else: - assert_eq(expected.to_numpy(), actual) - - @pytest.mark.parametrize("op", [operator.add, operator.sub]) def test_timdelta_binop_tz_timestamp(op): s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") From 265f9143a165fa539b9eb2787f1e964d423d1cf1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:58:08 -0700 Subject: [PATCH 139/366] Move some of test_dataframe.py to new cudf classic directory structure (#19687) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19687 --- python/cudf/cudf/tests/conftest.py | 6 + .../tests/dataframe/methods/test_assign.py | 36 + .../tests/dataframe/methods/test_describe.py | 234 ++ .../cudf/tests/dataframe/methods/test_drop.py | 278 ++ .../dataframe/methods/test_hash_values.py | 157 + .../tests/dataframe/methods/test_head_tail.py | 80 + .../dataframe/methods/test_isna_notnull.py | 56 + .../tests/dataframe/methods/test_items.py | 13 + .../methods/test_partition_by_hash.py | 116 + .../cudf/tests/dataframe/methods/test_pop.py | 46 + .../tests/dataframe/methods/test_reindex.py | 191 + .../tests/dataframe/methods/test_rename.py | 70 + .../dataframe/methods/test_reset_index.py | 144 + .../tests/dataframe/methods/test_round.py | 70 + .../dataframe/methods/test_select_dtypes.py | 170 + .../tests/dataframe/methods/test_set_index.py | 105 + .../dataframe/methods/test_sort_index.py | 147 + .../tests/dataframe/methods/test_swaplevel.py | 69 + .../tests/dataframe/methods/test_to_arrow.py | 116 +- .../tests/dataframe/methods/test_to_cupy.py | 136 + 
.../tests/dataframe/methods/test_transpose.py | 85 +- .../tests/dataframe/methods/test_truncate.py | 61 + .../cudf/tests/dataframe/test_attributes.py | 56 + .../cudf/tests/dataframe/test_constructors.py | 396 ++ .../general_functions/test_from_pandas.py | 21 + .../cudf/tests/series/methods/test_shift.py | 37 + .../tests/series/methods/test_to_frame.py | 14 + .../tests/series/methods/test_to_string.py | 20 + .../cudf/cudf/tests/series/test_attributes.py | 6 + .../cudf/tests/series/test_constructors.py | 109 + python/cudf/cudf/tests/test_dataframe.py | 3356 ++--------------- 31 files changed, 3267 insertions(+), 3134 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_assign.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_describe.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_drop.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_hash_values.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_head_tail.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_isna_notnull.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_items.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_partition_by_hash.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_pop.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_reindex.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_rename.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_reset_index.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_round.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_select_dtypes.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_set_index.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_sort_index.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_swaplevel.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_to_cupy.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_truncate.py create mode 100644 python/cudf/cudf/tests/dataframe/test_attributes.py create mode 100644 python/cudf/cudf/tests/general_functions/test_from_pandas.py create mode 100644 python/cudf/cudf/tests/series/methods/test_shift.py create mode 100644 python/cudf/cudf/tests/series/methods/test_to_frame.py create mode 100644 python/cudf/cudf/tests/series/methods/test_to_string.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index f05ec3dd247..88940a3ec47 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -514,6 +514,12 @@ def inplace(request): return request.param +@pytest.fixture(params=[True, False]) +def drop(request): + """Param for `drop` argument""" + return request.param + + @pytest.fixture(params=[True, False]) def ignore_index(request): """Param for `ignore_index` argument""" diff --git a/python/cudf/cudf/tests/dataframe/methods/test_assign.py b/python/cudf/cudf/tests/dataframe/methods/test_assign.py new file mode 100644 index 00000000000..9cbd740a4ef --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_assign.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
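+# Tests for DataFrame.assign: new columns derived from existing ones and from
+# callables, compared against pandas.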
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_assign(): + gdf = cudf.DataFrame({"x": [1, 2, 3]}) + gdf2 = gdf.assign(y=gdf.x + 1) + assert list(gdf.columns) == ["x"] + assert list(gdf2.columns) == ["x", "y"] + + np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4]) + + +@pytest.mark.parametrize( + "mapping", + [ + {"y": 1, "z": lambda df: df["x"] + df["y"]}, + { + "x": lambda df: df["x"] * 2, + "y": lambda df: 2, + "z": lambda df: df["x"] / df["y"], + }, + ], +) +def test_assign_callable(mapping): + df = pd.DataFrame({"x": [1, 2, 3]}) + cdf = cudf.from_pandas(df) + expect = df.assign(**mapping) + actual = cdf.assign(**mapping) + assert_eq(expect, actual) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_describe.py b/python/cudf/cudf/tests/dataframe/methods/test_describe.py new file mode 100644 index 00000000000..51ef9a7750e --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_describe.py @@ -0,0 +1,234 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_dataframe_describe_exclude(): + rng = np.random.default_rng(seed=12) + data_length = 10 + + df = cudf.DataFrame( + { + "x": rng.normal(10, 1, data_length).astype("int64"), + "y": rng.normal(10, 1, data_length), + } + ) + pdf = df.to_pandas() + + gdf_results = df.describe(exclude=["float"]) + pdf_results = pdf.describe(exclude=["float"]) + + assert_eq(gdf_results, pdf_results) + + +def test_dataframe_describe_include(): + rng = np.random.default_rng(seed=12) + data_length = 10 + + df = cudf.DataFrame( + { + "x": rng.normal(10, 1, data_length).astype("int64"), + "y": rng.normal(10, 1, data_length), + } + ) + pdf = df.to_pandas() + gdf_results = df.describe(include=["int"]) + pdf_results = pdf.describe(include=["int"]) + + assert_eq(gdf_results, pdf_results) + + +def test_dataframe_describe_default(): + rng = np.random.default_rng(seed=12) + data_length = 10 + + df = cudf.DataFrame( + { + "x": rng.normal(10, 1, data_length), + "y": rng.normal(10, 1, data_length), + } + ) + pdf = df.to_pandas() + gdf_results = df.describe() + pdf_results = pdf.describe() + + assert_eq(pdf_results, gdf_results) + + +def test_dataframe_describe_percentiles(): + rng = np.random.default_rng(seed=12) + data_length = 100 + sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99] + + df = cudf.DataFrame( + { + "x": rng.normal(10, 1, data_length), + "y": rng.normal(10, 1, data_length), + } + ) + pdf = df.to_pandas() + gdf_results = df.describe(percentiles=sample_percentiles) + pdf_results = pdf.describe(percentiles=sample_percentiles) + + assert_eq(pdf_results, gdf_results) + + +def test_dataframe_describe_include_all(): + rng = np.random.default_rng(seed=12) + data_length = 10 + + df = cudf.DataFrame( + { + "x": rng.normal(10, 1, data_length).astype("int64"), + "y": rng.normal(10, 1, data_length), + "animal": rng.choice(["dog", "cat", "bird"], data_length), + } + ) + + pdf = df.to_pandas() + gdf_results = df.describe(include="all") + pdf_results = pdf.describe(include="all") + + assert_eq(gdf_results[["x", "y"]], pdf_results[["x", "y"]]) + assert_eq(gdf_results.index, pdf_results.index) + assert_eq(gdf_results.columns, pdf_results.columns) + assert_eq( + gdf_results[["animal"]].fillna(-1).astype("str"), + pdf_results[["animal"]].fillna(-1).astype("str"), + ) + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame( + { + "a": [1, 2, 3], + "b": [10, 22, 
33], + "c": [0.3234, 0.23432, 0.0], + "d": ["hello", "world", "hello"], + } + ), + pd.DataFrame( + { + "a": [1, 2, 3], + "b": ["hello", "world", "hello"], + "c": [0.3234, 0.23432, 0.0], + } + ), + pd.DataFrame( + { + "int_data": [1, 2, 3], + "str_data": ["hello", "world", "hello"], + "float_data": [0.3234, 0.23432, 0.0], + "timedelta_data": pd.Series( + [1, 2, 1], dtype="timedelta64[ns]" + ), + "datetime_data": pd.Series([1, 2, 1], dtype="datetime64[ns]"), + } + ), + pd.DataFrame( + { + "int_data": [1, 2, 3], + "str_data": ["hello", "world", "hello"], + "float_data": [0.3234, 0.23432, 0.0], + "timedelta_data": pd.Series( + [1, 2, 1], dtype="timedelta64[ns]" + ), + "datetime_data": pd.Series([1, 2, 1], dtype="datetime64[ns]"), + "category_data": pd.Series(["a", "a", "b"], dtype="category"), + } + ), + ], +) +@pytest.mark.parametrize( + "include", + [None, "all", ["object"], ["int"], ["object", "int", "category"]], +) +def test_describe_misc_include(pdf, include): + df = cudf.DataFrame.from_pandas(pdf) + + expected = pdf.describe(include=include) + actual = df.describe(include=include) + + for col in expected.columns: + if expected[col].dtype == np.dtype("object"): + expected[col] = expected[col].fillna(-1).astype("str") + actual[col] = actual[col].fillna(-1).astype("str") + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame( + { + "a": [1, 2, 3], + "b": [10, 22, 33], + "c": [0.3234, 0.23432, 0.0], + "d": ["hello", "world", "hello"], + } + ), + pd.DataFrame( + { + "a": [1, 2, 3], + "b": ["hello", "world", "hello"], + "c": [0.3234, 0.23432, 0.0], + } + ), + pd.DataFrame( + { + "int_data": [1, 2, 3], + "str_data": ["hello", "world", "hello"], + "float_data": [0.3234, 0.23432, 0.0], + "timedelta_data": pd.Series( + [1, 2, 1], dtype="timedelta64[ns]" + ), + "datetime_data": pd.Series([1, 2, 1], dtype="datetime64[ns]"), + } + ), + pd.DataFrame( + { + "int_data": [1, 2, 3], + "str_data": ["hello", "world", "hello"], + "float_data": [0.3234, 0.23432, 0.0], + "timedelta_data": pd.Series( + [1, 2, 1], dtype="timedelta64[ns]" + ), + "datetime_data": pd.Series([1, 2, 1], dtype="datetime64[ns]"), + "category_data": pd.Series(["a", "a", "b"], dtype="category"), + } + ), + ], +) +@pytest.mark.parametrize( + "exclude", [None, ["object"], ["int"], ["object", "int", "category"]] +) +def test_describe_misc_exclude(pdf, exclude): + df = cudf.DataFrame.from_pandas(pdf) + + expected = pdf.describe(exclude=exclude) + actual = df.describe(exclude=exclude) + + for col in expected.columns: + if expected[col].dtype == np.dtype("object"): + expected[col] = expected[col].fillna(-1).astype("str") + actual[col] = actual[col].fillna(-1).astype("str") + + assert_eq(expected, actual) + + +def test_empty_dataframe_describe(): + pdf = pd.DataFrame({"a": [], "b": []}) + gdf = cudf.from_pandas(pdf) + + expected = pdf.describe() + actual = gdf.describe() + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_drop.py b/python/cudf/cudf/tests/dataframe/methods/test_drop.py new file mode 100644 index 00000000000..a0e6c1f4049 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_drop.py @@ -0,0 +1,278 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
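+# Tests for DataFrame.drop: dropping by columns=, labels= with an axis,
+# index=, and MultiIndex levels, plus error behavior, all compared to pandas.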
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame( + {"a": range(10), "b": range(10, 20), "c": range(1, 11)}, + index=pd.Index( + ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + name="custom_name", + ), + ), + pd.DataFrame( + {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} + ), + ], +) +@pytest.mark.parametrize( + "columns", + [["a"], ["b"], "a", "b", ["a", "b"]], +) +def test_dataframe_drop_columns(pdf, columns, inplace): + if inplace: + pdf = pdf.copy() + gdf = cudf.from_pandas(pdf) + + expected = pdf.drop(columns=columns, inplace=inplace) + actual = gdf.drop(columns=columns, inplace=inplace) + + if inplace: + expected = pdf + actual = gdf + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("obj", ["Index", "Series"]) +def test_drop_cudf_obj_columns(obj): + pdf = pd.DataFrame({"A": [1], "B": [1]}) + gdf = cudf.from_pandas(pdf) + + columns = ["B"] + expected = pdf.drop(labels=getattr(pd, obj)(columns), axis=1) + actual = gdf.drop(columns=getattr(cudf, obj)(columns), axis=1) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame( + {"a": range(10), "b": range(10, 20), "c": range(1, 11)}, + index=pd.Index(list(range(10)), name="custom_name"), + ), + pd.DataFrame( + {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} + ), + ], +) +@pytest.mark.parametrize( + "labels", + [ + [1], + [0], + 1, + 5, + [5, 9], + pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + pd.Index([0, 1, 8, 9], name="new name"), + ], +) +def test_dataframe_drop_labels_axis_0(pdf, labels, inplace): + pdf = pdf.copy() + gdf = cudf.from_pandas(pdf) + + expected = pdf.drop(labels=labels, axis=0, inplace=inplace) + actual = gdf.drop(labels=labels, axis=0, inplace=inplace) + + if inplace: + expected = pdf + actual = gdf + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame({"a": range(10), "b": range(10, 20), "c": range(1, 11)}), + pd.DataFrame( + {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} + ), + pd.DataFrame( + { + "a": range(10), + "b": range(10, 20), + }, + index=pd.Index(list(range(10)), dtype="uint64"), + ), + ], +) +@pytest.mark.parametrize( + "index", + [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], +) +@pytest.mark.parametrize("inplace", [True, False]) +def test_dataframe_drop_index(pdf, index, inplace): + pdf = pdf.copy() + gdf = cudf.from_pandas(pdf) + + expected = pdf.drop(index=index, inplace=inplace) + actual = gdf.drop(index=index, inplace=inplace) + + if inplace: + expected = pdf + actual = gdf + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "index,level", + [ + ("cow", 0), + ("lama", 0), + ("falcon", 0), + ("speed", 1), + ("weight", 1), + ("length", 1), + ("cow", None), + ( + "lama", + None, + ), + ( + "falcon", + None, + ), + ], +) +def test_dataframe_drop_multiindex(index, level, inplace): + pdf = pd.DataFrame( + {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}, + index=pd.MultiIndex( + levels=[ + ["lama", "cow", "falcon"], + ["speed", "weight", "length"], + ], + codes=[ + [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], + [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], + ], + ), + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.drop(index=index, inplace=inplace, level=level) + actual = gdf.drop(index=index, inplace=inplace, level=level) + + if inplace: + expected = pdf + actual = gdf + + assert_eq(expected, actual) + + 
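+# labels= with axis=1 should behave identically to the columns=-based drops above.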
+@pytest.mark.parametrize("data", [{"c": range(1, 11)}, {"d": ["a", "v"] * 5}]) +@pytest.mark.parametrize("labels", [["a"], ["b"], "a", "b", ["a", "b"]]) +def test_dataframe_drop_labels_axis_1(data, labels, inplace): + pdf = pd.DataFrame({"a": range(10), "b": range(10, 20), **data}) + gdf = cudf.from_pandas(pdf) + + expected = pdf.drop(labels=labels, axis=1, inplace=inplace) + actual = gdf.drop(labels=labels, axis=1, inplace=inplace) + + if inplace: + expected = pdf + actual = gdf + + assert_eq(expected, actual) + + +def test_dataframe_drop_error(): + df = cudf.DataFrame({"a": [1], "b": [2], "c": [3]}) + pdf = df.to_pandas() + + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=([], {"columns": "d"}), + rfunc_args_and_kwargs=([], {"columns": "d"}), + ) + + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), + rfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), + ) + + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), + rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), + ) + + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=([], {"axis": 1}), + rfunc_args_and_kwargs=([], {"axis": 1}), + ) + + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=([[2, 0]],), + rfunc_args_and_kwargs=([[2, 0]],), + ) + + +def test_dataframe_drop_raises(): + df = cudf.DataFrame( + {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] + ) + pdf = df.to_pandas() + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=(["p"],), + rfunc_args_and_kwargs=(["p"],), + ) + + # label dtype mismatch + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=([3],), + rfunc_args_and_kwargs=([3],), + ) + + expect = pdf.drop("p", errors="ignore") + actual = df.drop("p", errors="ignore") + + assert_eq(actual, expect) + + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=([], {"columns": "p"}), + rfunc_args_and_kwargs=([], {"columns": "p"}), + ) + + expect = pdf.drop(columns="p", errors="ignore") + actual = df.drop(columns="p", errors="ignore") + + assert_eq(actual, expect) + + assert_exceptions_equal( + lfunc=pdf.drop, + rfunc=df.drop, + lfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), + rfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), + ) + + expect = pdf.drop(labels="p", axis=1, errors="ignore") + actual = df.drop(labels="p", axis=1, errors="ignore") + + assert_eq(actual, expect) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_hash_values.py b/python/cudf/cudf/tests/dataframe/methods/test_hash_values.py new file mode 100644 index 00000000000..ff884d0cb2e --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_hash_values.py @@ -0,0 +1,157 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
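+# Tests for DataFrame.hash_values across murmur3, md5, sha*, and xxhash
+# methods, including seed handling and regression values for the xxhash variants.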
+ +import contextlib + +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq, assert_neq + + +@pytest.mark.parametrize( + "method", + [ + "murmur3", + "md5", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "xxhash32", + "xxhash64", + ], +) +@pytest.mark.parametrize("seed", [None, 42]) +def test_dataframe_hash_values(method, seed): + nrows = 10 + warning_expected = seed is not None and method not in { + "murmur3", + "xxhash32", + "xxhash64", + } + potential_warning = ( + pytest.warns(UserWarning, match="Provided seed value has no effect*") + if warning_expected + else contextlib.nullcontext() + ) + + gdf = cudf.DataFrame() + data = np.arange(nrows) + data[0] = data[-1] # make first and last the same + gdf["a"] = data + gdf["b"] = gdf.a + 100 + with potential_warning: + out = gdf.hash_values(method=method, seed=seed) + assert isinstance(out, cudf.Series) + assert len(out) == nrows + expected_dtypes = { + "murmur3": np.uint32, + "md5": object, + "sha1": object, + "sha224": object, + "sha256": object, + "sha384": object, + "sha512": object, + "xxhash32": np.uint32, + "xxhash64": np.uint64, + } + assert out.dtype == expected_dtypes[method] + + # Check single column + with potential_warning: + out_one = gdf[["a"]].hash_values(method=method, seed=seed) + # First matches last + assert out_one.iloc[0] == out_one.iloc[-1] + # Equivalent to the cudf.Series.hash_values() + with potential_warning: + assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) + + +@pytest.mark.parametrize("method", ["murmur3", "xxhash32", "xxhash64"]) +def test_dataframe_hash_values_seed(method): + gdf = cudf.DataFrame() + data = np.arange(10) + data[0] = data[-1] # make first and last the same + gdf["a"] = data + gdf["b"] = gdf.a + 100 + out_one = gdf.hash_values(method=method, seed=0) + out_two = gdf.hash_values(method=method, seed=1) + assert out_one.iloc[0] == out_one.iloc[-1] + assert out_two.iloc[0] == out_two.iloc[-1] + assert_neq(out_one, out_two) + + +def test_dataframe_hash_values_xxhash32(): + # xxhash32 has no built-in implementation in Python and we don't want to + # add a testing dependency, so we use regression tests against known good + # values. + gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) + gdf["b"] = -gdf["a"] + out_a = gdf["a"].hash_values(method="xxhash32", seed=0) + expected_a = cudf.Series( + [3736311059, 2307980487, 2906647130, 746578903, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_a, expected_a) + + out_b = gdf["b"].hash_values(method="xxhash32", seed=42) + expected_b = cudf.Series( + [1076387279, 2261349915, 531498073, 650869264, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_b, expected_b) + + out_df = gdf.hash_values(method="xxhash32", seed=0) + expected_df = cudf.Series( + [1223721700, 2885793241, 1920811472, 1146715602, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_df, expected_df) + + +def test_dataframe_hash_values_xxhash64(): + # xxhash64 has no built-in implementation in Python and we don't want to + # add a testing dependency, so we use regression tests against known good + # values. 
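+    # The inputs below cover zeros, small floats, inf, and NaN; note that the
+    # NaN row hashes to 2**64 - 1 in every expected result below.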
+ gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) + gdf["b"] = -gdf["a"] + out_a = gdf["a"].hash_values(method="xxhash64", seed=0) + expected_a = cudf.Series( + [ + 3803688792395291579, + 10706502109028787093, + 9835943264235290955, + 18031741628920313605, + 18446744073709551615, + ], + dtype=np.uint64, + ) + assert_eq(out_a, expected_a) + + out_b = gdf["b"].hash_values(method="xxhash64", seed=42) + expected_b = cudf.Series( + [ + 9826995235083043316, + 10150515573749944095, + 5005707091092326006, + 5326262080505358431, + 18446744073709551615, + ], + dtype=np.uint64, + ) + assert_eq(out_b, expected_b) + + out_df = gdf.hash_values(method="xxhash64", seed=0) + expected_df = cudf.Series( + [ + 10208049663714815266, + 4949201786888768834, + 18122173653994477335, + 11133539368563441730, + 18446744073709551615, + ], + dtype=np.uint64, + ) + assert_eq(out_df, expected_df) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_head_tail.py b/python/cudf/cudf/tests/dataframe/methods/test_head_tail.py new file mode 100644 index 00000000000..3277b80ca1c --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_head_tail.py @@ -0,0 +1,80 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("nelem", [0, 10]) +def test_head_tail(nelem, numeric_types_as_str): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + "a": rng.integers(0, 1000, nelem).astype(numeric_types_as_str), + "b": rng.integers(0, 1000, nelem).astype(numeric_types_as_str), + } + ) + gdf = cudf.from_pandas(pdf) + + assert_eq(gdf.head(), pdf.head()) + assert_eq(gdf.head(3), pdf.head(3)) + assert_eq(gdf.head(-2), pdf.head(-2)) + assert_eq(gdf.head(0), pdf.head(0)) + + assert_eq(gdf["a"].head(), pdf["a"].head()) + assert_eq(gdf["a"].head(3), pdf["a"].head(3)) + assert_eq(gdf["a"].head(-2), pdf["a"].head(-2)) + + assert_eq(gdf.tail(), pdf.tail()) + assert_eq(gdf.tail(3), pdf.tail(3)) + assert_eq(gdf.tail(-2), pdf.tail(-2)) + assert_eq(gdf.tail(0), pdf.tail(0)) + + assert_eq(gdf["a"].tail(), pdf["a"].tail()) + assert_eq(gdf["a"].tail(3), pdf["a"].tail(3)) + assert_eq(gdf["a"].tail(-2), pdf["a"].tail(-2)) + + +def test_tail_for_string(): + gdf = cudf.DataFrame({"id": ["a", "b"], "v": [1, 2]}) + assert_eq(gdf.tail(3), gdf.to_pandas().tail(3)) + + +def test_dataframe_0_row_dtype(all_supported_types_as_str): + data = cudf.Series([1, 2, 3, 4, 5], dtype=all_supported_types_as_str) + + expect = cudf.DataFrame({"x": data, "y": data}) + got = expect.head(0) + + for col_name in got.columns: + assert expect[col_name].dtype == got[col_name].dtype + + expect = cudf.Series(data) + got = expect.head(0) + + assert expect.dtype == got.dtype + + +def test_one_row_head(): + gdf = cudf.DataFrame({"name": ["carl"], "score": [100]}, index=[123]) + pdf = gdf.to_pandas() + + head_gdf = gdf.head() + head_pdf = pdf.head() + + assert_eq(head_pdf, head_gdf) + + +@pytest.mark.parametrize("index", [None, [123], ["a", "b"]]) +def test_no_cols_head(index): + pdf = pd.DataFrame(index=index) + gdf = cudf.from_pandas(pdf) + + head_gdf = gdf.head() + head_pdf = pdf.head() + + assert_eq(head_pdf, head_gdf) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_isna_notnull.py b/python/cudf/cudf/tests/dataframe/methods/test_isna_notnull.py new file mode 100644 index 00000000000..8736533fe80 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_isna_notnull.py @@ -0,0 +1,56 @@ +# Copyright (c) 
2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "na_data", + [ + pd.DataFrame( + { + "a": [0, 1, 2, np.nan, 4, None, 6], + "b": [np.nan, None, "u", "h", "d", "a", "m"], + }, + index=["q", "w", "e", "r", "t", "y", "u"], + ), + pd.DataFrame({"a": [0, 1, 2, 3, 4], "b": ["a", "b", "u", "h", "d"]}), + pd.DataFrame( + { + "a": [None, None, np.nan, None], + "b": [np.nan, None, np.nan, None], + } + ), + pd.DataFrame({"a": []}), + pd.DataFrame({"a": [np.nan], "b": [None]}), + pd.DataFrame({"a": ["a", "b", "c", None, "e"]}), + pd.DataFrame({"a": ["a", "b", "c", "d", "e"]}), + ], +) +@pytest.mark.parametrize("api_call", ["isnull", "isna", "notna", "notnull"]) +def test_dataframe_isnull_isna_and_reverse(na_data, nan_as_null, api_call): + def detect_nan(x): + # Check if the input is a float and if it is nan + return x.apply(lambda v: isinstance(v, float) and np.isnan(v)) + + df = na_data + nan_contains = df.select_dtypes(object).apply(detect_nan) + if nan_as_null is False and ( + nan_contains.any().any() and not nan_contains.all().all() + ): + with pytest.raises(cudf.errors.MixedTypeError): + cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) + else: + gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) + + assert_eq(getattr(df, api_call)(), getattr(gdf, api_call)()) + + # Test individual columns + for col in df: + assert_eq( + getattr(df[col], api_call)(), getattr(gdf[col], api_call)() + ) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_items.py b/python/cudf/cudf/tests/dataframe/methods/test_items.py new file mode 100644 index 00000000000..8e65c50f617 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_items.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf +from cudf.testing import assert_eq + + +def test_iteritems(): + gdf = cudf.DataFrame({"x": range(10), "y": range(10)}) + for k, v in gdf.items(): + assert k in gdf.columns + assert isinstance(v, cudf.Series) + assert_eq(v, gdf[k]) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_partition_by_hash.py b/python/cudf/cudf/tests/dataframe/methods/test_partition_by_hash.py new file mode 100644 index 00000000000..432c250a288 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_partition_by_hash.py @@ -0,0 +1,116 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
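+# Tests for DataFrame.partition_by_hash: partition counts, disjoint keys
+# across partitions, masked values and keys, keep_index, and empty frames.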
+ +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import expand_bits_to_bytes, random_bitmask + + +@pytest.mark.parametrize("nparts", [1, 2]) +def test_dataframe_hash_partition(nparts): + nrows = 10 + nkeys = 2 + rng = np.random.default_rng(seed=0) + gdf = cudf.DataFrame( + {f"key{i}": rng.integers(0, 7 - i, nrows) for i in range(nkeys)} + ) + keycols = gdf.columns.to_list() + gdf["val1"] = rng.integers(0, nrows * 2, nrows) + + got = gdf.partition_by_hash(keycols, nparts=nparts) + # Must return a list + assert isinstance(got, list) + # Must have correct number of partitions + assert len(got) == nparts + # All partitions must be DataFrame type + assert all(isinstance(p, cudf.DataFrame) for p in got) + # Check that all partitions have unique keys + part_unique_keys = set() + for p in got: + if len(p): + # Take rows of the keycolumns and build a set of the key-values + unique_keys = set(map(tuple, p[keycols].values_host)) + # Ensure that none of the key-values have occurred in other groups + assert not (unique_keys & part_unique_keys) + part_unique_keys |= unique_keys + assert len(part_unique_keys) + + +def test_dataframe_hash_partition_masked_value(): + nrows = 10 + gdf = cudf.DataFrame( + { + "key": np.arange(nrows), + "val": np.arange(nrows) + 100, + } + ) + bitmask = random_bitmask(nrows) + bytemask = expand_bits_to_bytes(bitmask) + gdf["val"] = gdf["val"]._column.set_mask(bitmask) + parted = gdf.partition_by_hash(["key"], nparts=3) + # Verify that the valid mask is correct + for p in parted: + df = p.to_pandas() + for row in df.itertuples(): + valid = bool(bytemask[row.key]) + expected_value = row.key + 100 if valid else np.nan + got_value = row.val + assert (expected_value == got_value) or ( + np.isnan(expected_value) and np.isnan(got_value) + ) + + +def test_dataframe_hash_partition_masked_keys(): + nrows = 5 + gdf = cudf.DataFrame( + { + "key": np.arange(nrows), + "val": np.arange(nrows) + 100, + } + ) + bitmask = random_bitmask(nrows) + bytemask = expand_bits_to_bytes(bitmask) + gdf["key"] = gdf["key"]._column.set_mask(bitmask) + parted = gdf.partition_by_hash(["key"], nparts=3, keep_index=False) + # Verify that the valid mask is correct + for p in parted: + df = p.to_pandas() + for row in df.itertuples(): + valid = bool(bytemask[row.val - 100]) + # val is key + 100 + expected_value = row.val - 100 if valid else np.nan + got_value = row.key + assert (expected_value == got_value) or ( + np.isnan(expected_value) and np.isnan(got_value) + ) + + +@pytest.mark.parametrize("keep_index", [True, False]) +def test_dataframe_hash_partition_keep_index(keep_index): + gdf = cudf.DataFrame( + {"val": [1, 2, 3, 4, 5], "key": [3, 2, 1, 4, 5]}, index=[5, 4, 3, 2, 1] + ) + + expected_df1 = cudf.DataFrame( + {"val": [1, 5], "key": [3, 5]}, index=[5, 1] if keep_index else None + ) + expected_df2 = cudf.DataFrame( + {"val": [2, 3, 4], "key": [2, 1, 4]}, + index=[4, 3, 2] if keep_index else None, + ) + expected = [expected_df1, expected_df2] + + parts = gdf.partition_by_hash(["key"], nparts=2, keep_index=keep_index) + + for exp, got in zip(expected, parts, strict=True): + assert_eq(exp, got) + + +def test_dataframe_hash_partition_empty(): + gdf = cudf.DataFrame({"val": [1, 2], "key": [3, 2]}, index=["a", "b"]) + parts = gdf.iloc[:0].partition_by_hash(["key"], nparts=3) + assert len(parts) == 3 + for part in parts: + assert_eq(gdf.iloc[:0], part) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_pop.py 
b/python/cudf/cudf/tests/dataframe/methods/test_pop.py new file mode 100644 index 00000000000..bdfb1004db1 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_pop.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_dataframe_pop(): + pdf = pd.DataFrame( + {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [7.0, 8.0, 9.0]} + ) + gdf = cudf.DataFrame.from_pandas(pdf) + + # Test non-existing column error + with pytest.raises(KeyError) as raises: + gdf.pop("fake_colname") + raises.match("fake_colname") + + # check pop numeric column + pdf_pop = pdf.pop("a") + gdf_pop = gdf.pop("a") + assert_eq(pdf_pop, gdf_pop) + assert_eq(pdf, gdf) + + # check string column + pdf_pop = pdf.pop("b") + gdf_pop = gdf.pop("b") + assert_eq(pdf_pop, gdf_pop) + assert_eq(pdf, gdf) + + # check float column and empty dataframe + pdf_pop = pdf.pop("c") + gdf_pop = gdf.pop("c") + assert_eq(pdf_pop, gdf_pop) + assert_eq(pdf, gdf) + + # check empty dataframe edge case + empty_pdf = pd.DataFrame(columns=["a", "b"]) + empty_gdf = cudf.DataFrame(columns=["a", "b"]) + pb = empty_pdf.pop("b") + gb = empty_gdf.pop("b") + assert len(pb) == len(gb) + assert empty_pdf.empty and empty_gdf.empty diff --git a/python/cudf/cudf/tests/dataframe/methods/test_reindex.py b/python/cudf/cudf/tests/dataframe/methods/test_reindex.py new file mode 100644 index 00000000000..19efd968a95 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_reindex.py @@ -0,0 +1,191 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("copy", [True, False]) +@pytest.mark.parametrize( + "args,gd_kwargs", + [ + ([], {}), + ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), + ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), + ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {"axis": 0}), + ([["a", "b", "c", "d", "e"]], {"axis": 1}), + ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": 0}), + ([], {"labels": ["a", "b", "c", "d", "e"], "axis": 1}), + ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": "index"}), + ([], {"labels": ["a", "b", "c", "d", "e"], "axis": "columns"}), + ([], {"index": [-3, 0, 3, 0, -2, 1, 3, 4, 6]}), + ([], {"columns": ["a", "b", "c", "d", "e"]}), + ( + [], + { + "index": [-3, 0, 3, 0, -2, 1, 3, 4, 6], + "columns": ["a", "b", "c", "d", "e"], + }, + ), + ], +) +def test_dataframe_reindex(copy, args, gd_kwargs): + reindex_data = cudf.datasets.randomdata( + nrows=6, + dtypes={ + "a": "category", + "c": float, + "d": str, + }, + ) + pdf, gdf = reindex_data.to_pandas(), reindex_data + assert_eq( + pdf.reindex(*args, **gd_kwargs, copy=True), + gdf.reindex(*args, **gd_kwargs, copy=copy), + ) + + +@pytest.mark.parametrize("fill_value", [-1.0, 0.0, 1.5]) +@pytest.mark.parametrize( + "args,kwargs", + [ + ([], {}), + ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), + ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), + ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {"axis": 0}), + ([["a", "b", "c", "d", "e"]], {"axis": 1}), + ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": 0}), + ([], {"labels": ["a", "b", "c", "d", "e"], "axis": 1}), + ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": "index"}), + ([], {"labels": ["a", "b", "c", "d", "e"], "axis": "columns"}), + ([], {"index": [-3, 0, 3, 0, -2, 1, 3, 4, 6]}), + ([], {"columns": ["a", "b", "c", "d", "e"]}), + ( + [], + { + "index": [-3, 0, 3, 0, -2, 1, 3, 4, 6], + "columns": ["a", "b", "c", "d", "e"], + }, + ), 
+ ], +) +def test_dataframe_reindex_fill_value(args, kwargs, fill_value): + reindex_data_numeric = cudf.datasets.randomdata( + nrows=6, + dtypes={"a": float, "b": float, "c": float}, + ) + pdf, gdf = reindex_data_numeric.to_pandas(), reindex_data_numeric + assert_eq( + pdf.reindex(*args, **kwargs, fill_value=fill_value), + gdf.reindex(*args, **kwargs, fill_value=fill_value), + ) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_dataframe_reindex_change_dtype(copy): + index = pd.date_range("12/29/2009", periods=10, freq="D") + columns = ["a", "b", "c", "d", "e"] + gdf = cudf.datasets.randomdata( + nrows=6, dtypes={"a": "category", "c": float, "d": str} + ) + pdf = gdf.to_pandas() + # Validate reindexes both labels and column names when + # index=index_labels and columns=column_labels + assert_eq( + pdf.reindex(index=index, columns=columns, copy=True), + gdf.reindex(index=index, columns=columns, copy=copy), + check_freq=False, + ) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_series_categorical_reindex(copy): + index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] + gdf = cudf.datasets.randomdata(nrows=6, dtypes={"a": "category"}) + pdf = gdf.to_pandas() + assert_eq(pdf["a"].reindex(copy=True), gdf["a"].reindex(copy=copy)) + assert_eq( + pdf["a"].reindex(index, copy=True), gdf["a"].reindex(index, copy=copy) + ) + assert_eq( + pdf["a"].reindex(index=index, copy=True), + gdf["a"].reindex(index=index, copy=copy), + ) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_series_float_reindex(copy): + index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] + gdf = cudf.datasets.randomdata(nrows=6, dtypes={"c": float}) + pdf = gdf.to_pandas() + assert_eq(pdf["c"].reindex(copy=True), gdf["c"].reindex(copy=copy)) + assert_eq( + pdf["c"].reindex(index, copy=True), gdf["c"].reindex(index, copy=copy) + ) + assert_eq( + pdf["c"].reindex(index=index, copy=True), + gdf["c"].reindex(index=index, copy=copy), + ) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_series_string_reindex(copy): + index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] + gdf = cudf.datasets.randomdata(nrows=6, dtypes={"d": str}) + pdf = gdf.to_pandas() + assert_eq(pdf["d"].reindex(copy=True), gdf["d"].reindex(copy=copy)) + assert_eq( + pdf["d"].reindex(index, copy=True), gdf["d"].reindex(index, copy=copy) + ) + assert_eq( + pdf["d"].reindex(index=index, copy=True), + gdf["d"].reindex(index=index, copy=copy), + ) + + +@pytest.mark.parametrize("names", [None, ["a", "b"]]) +@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) +def test_reindex_multiindex_col_to_multiindex(names, klass): + idx = pd.Index( + [("A", "one"), ("A", "two")], + dtype="object", + ) + df = pd.DataFrame([[1, 2]], columns=idx) + gdf = cudf.from_pandas(df) + midx = klass.from_tuples([("A", "one"), ("A", "three")], names=names) + result = gdf.reindex(columns=midx) + expected = cudf.DataFrame([[1, None]], columns=midx) + # (pandas2.0): check_dtype=False won't be needed + # as None col will return object instead of float + assert_eq(result, expected, check_dtype=False) + + +@pytest.mark.parametrize("names", [None, ["a", "b"]]) +@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) +def test_reindex_tuple_col_to_multiindex(names, klass): + idx = pd.Index( + [("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False + ) + df = pd.DataFrame([[1, 2]], columns=idx) + gdf = cudf.from_pandas(df) + midx = klass.from_tuples([("A", "one"), ("A", "two")], names=names) + result = gdf.reindex(columns=midx) + expected = cudf.DataFrame([[1, 
2]], columns=midx) + assert_eq(result, expected) + + +@pytest.mark.parametrize("name", [None, "foo"]) +@pytest.mark.parametrize("klass", [range, cudf.RangeIndex, pd.RangeIndex]) +def test_reindex_columns_rangeindex_keeps_rangeindex(name, klass): + new_columns = klass(3) + exp_name = None + if klass is not range: + new_columns.name = name + exp_name = name + df = cudf.DataFrame([[1, 2]]) + result = df.reindex(columns=new_columns).columns + expected = pd.RangeIndex(3, name=exp_name) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_rename.py b/python/cudf/cudf/tests/dataframe/methods/test_rename.py new file mode 100644 index 00000000000..b7e9bf3c7ce --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_rename.py @@ -0,0 +1,70 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("axis", [0, "index"]) +def test_dataframe_index_rename(axis): + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + gdf = cudf.DataFrame.from_pandas(pdf) + + expect = pdf.rename(mapper={1: 5, 2: 6}, axis=axis) + got = gdf.rename(mapper={1: 5, 2: 6}, axis=axis) + + assert_eq(expect, got) + + expect = pdf.rename(index={1: 5, 2: 6}) + got = gdf.rename(index={1: 5, 2: 6}) + + assert_eq(expect, got) + + expect = pdf.rename({1: 5, 2: 6}) + got = gdf.rename({1: 5, 2: 6}) + + assert_eq(expect, got) + + # `pandas` can support indexes with mixed values. We throw a + # `NotImplementedError`. + with pytest.raises(NotImplementedError): + gdf.rename(mapper={1: "x", 2: "y"}, axis=axis) + + +def test_dataframe_MI_rename(): + gdf = cudf.DataFrame( + {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} + ) + gdg = gdf.groupby(["a", "b"]).count() + pdg = gdg.to_pandas() + + expect = pdg.rename(mapper={1: 5, 2: 6}, axis=0) + got = gdg.rename(mapper={1: 5, 2: 6}, axis=0) + + assert_eq(expect, got) + + +@pytest.mark.parametrize("axis", [1, "columns"]) +def test_dataframe_column_rename(axis): + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + gdf = cudf.DataFrame.from_pandas(pdf) + + expect = pdf.rename(mapper=lambda name: 2 * name, axis=axis) + got = gdf.rename(mapper=lambda name: 2 * name, axis=axis) + + assert_eq(expect, got) + + expect = pdf.rename(columns=lambda name: 2 * name) + got = gdf.rename(columns=lambda name: 2 * name) + + assert_eq(expect, got) + + rename_mapper = {"a": "z", "b": "y", "c": "x"} + expect = pdf.rename(columns=rename_mapper) + got = gdf.rename(columns=rename_mapper) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_reset_index.py b/python/cudf/cudf/tests/dataframe/methods/test_reset_index.py new file mode 100644 index 00000000000..fa0546d526f --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_reset_index.py @@ -0,0 +1,144 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
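+# Tests for DataFrame.reset_index: level selection, drop/inplace combinations,
+# col_level/col_fill placement, and duplicate or invalid level names.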
+ + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) +@pytest.mark.parametrize( + "column_names", + [ + ["v0", "v1"], + ["v0", "index"], + pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]), + pd.MultiIndex.from_tuples([(1, 2), (10, 11)], names=["ABC", "DEF"]), + ], +) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index(level, drop, column_names, inplace, col_level, col_fill): + midx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] + ) + pdf = pd.DataFrame( + [[1, 2], [3, 4], [5, 6], [7, 8]], index=midx, columns=column_names + ) + gdf = cudf.from_pandas(pdf) + + expect = pdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + got = gdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + if inplace: + expect = pdf + got = gdf + + assert_eq(expect, got) + + +@pytest.mark.parametrize("level", [None, 0, 1, [None]]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_dup_level_name(level, drop, inplace, col_level, col_fill): + # midx levels are named [None, None] + midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) + pdf = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=midx) + gdf = cudf.from_pandas(pdf) + if level == [None]: + assert_exceptions_equal( + lfunc=pdf.reset_index, + rfunc=gdf.reset_index, + lfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + rfunc_args_and_kwargs=( + [], + {"level": level, "drop": drop, "inplace": inplace}, + ), + ) + return + + expect = pdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + got = gdf.reset_index( + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + if inplace: + expect = pdf + got = gdf + + assert_eq(expect, got) + + +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_named(drop, inplace, col_level, col_fill): + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + pdf.index.name = "cudf" + gdf.index.name = "cudf" + + expect = pdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + got = gdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + if inplace: + expect = pdf + got = gdf + assert_eq(expect, got) + + +@pytest.mark.parametrize("column_names", [["x", "y"], ["index", "y"]]) +@pytest.mark.parametrize("col_level", [0, 1]) +@pytest.mark.parametrize("col_fill", ["", "some_lv"]) +def test_reset_index_unnamed(drop, inplace, column_names, col_level, col_fill): + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + pdf.columns = column_names + gdf.columns = column_names + + expect = pdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + got = gdf.reset_index( + drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill + ) + if inplace: + expect = pdf + got = gdf + assert_eq(expect, got) + + +def 
test_reset_index_invalid_level(): + with pytest.raises(IndexError): + cudf.DataFrame([1]).reset_index(level=2) + + with pytest.raises(IndexError): + pd.DataFrame([1]).reset_index(level=2) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_round.py b/python/cudf/cudf/tests/dataframe/methods/test_round.py new file mode 100644 index 00000000000..6c4781236c9 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_round.py @@ -0,0 +1,70 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "decimals", + [ + -3, + 0, + 5, + pd.Series( + [1, 4, 3, -6], + index=["floats", "ints", "floats_with_nan", "floats_same"], + ), + cudf.Series( + [-4, -2, 12], index=["ints", "floats_with_nan", "floats_same"] + ), + {"floats": -1, "ints": 15, "floats_will_nan": 2}, + ], +) +def test_dataframe_round(decimals): + rng = np.random.default_rng(seed=0) + gdf = cudf.DataFrame( + { + "floats": np.arange(0.5, 10.5, 1), + "ints": rng.normal(-100, 100, 10), + "floats_with_na": np.array( + [ + 14.123, + 2.343, + np.nan, + 0.0, + -8.302, + np.nan, + 94.313, + None, + -8.029, + np.nan, + ] + ), + "floats_same": np.repeat([-0.6459412758761901], 10), + "bools": rng.choice([True, None, False], 10), + "strings": rng.choice(["abc", "xyz", None], 10), + "struct": rng.choice([{"abc": 1}, {"xyz": 2}, None], 10), + "list": [[1], [2], None, [4], [3]] * 2, + } + ) + pdf = gdf.to_pandas() + + if isinstance(decimals, cudf.Series): + pdecimals = decimals.to_pandas() + else: + pdecimals = decimals + + result = gdf.round(decimals) + expected = pdf.round(pdecimals) + + assert_eq(result, expected) + + +def test_dataframe_round_dict_decimal_validation(): + df = cudf.DataFrame({"A": [0.12], "B": [0.13]}) + with pytest.raises(TypeError): + df.round({"A": 1, "B": 0.5}) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_select_dtypes.py b/python/cudf/cudf/tests/dataframe/methods/test_select_dtypes.py new file mode 100644 index 00000000000..70a8ce0ed49 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_select_dtypes.py @@ -0,0 +1,170 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
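+# Tests for DataFrame.select_dtypes include/exclude filtering, matched
+# against pandas, including datetime dtypes and invalid-argument errors.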
+ +import numpy as np + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +def test_select_dtype(): + gdf = cudf.datasets.randomdata( + nrows=20, dtypes={"a": "category", "b": int, "c": float, "d": str} + ) + pdf = gdf.to_pandas() + + assert_eq(pdf.select_dtypes("float64"), gdf.select_dtypes("float64")) + assert_eq(pdf.select_dtypes(np.float64), gdf.select_dtypes(np.float64)) + assert_eq( + pdf.select_dtypes(include=["float64"]), + gdf.select_dtypes(include=["float64"]), + ) + assert_eq( + pdf.select_dtypes(include=["object", "int", "category"]), + gdf.select_dtypes(include=["object", "int", "category"]), + ) + + assert_eq( + pdf.select_dtypes(include=["int64", "float64"]), + gdf.select_dtypes(include=["int64", "float64"]), + ) + assert_eq( + pdf.select_dtypes(include=np.number), + gdf.select_dtypes(include=np.number), + ) + assert_eq( + pdf.select_dtypes(include=[np.int64, np.float64]), + gdf.select_dtypes(include=[np.int64, np.float64]), + ) + + assert_eq( + pdf.select_dtypes(include=["category"]), + gdf.select_dtypes(include=["category"]), + ) + assert_eq( + pdf.select_dtypes(exclude=np.number), + gdf.select_dtypes(exclude=np.number), + ) + + assert_exceptions_equal( + lfunc=pdf.select_dtypes, + rfunc=gdf.select_dtypes, + lfunc_args_and_kwargs=([], {"includes": ["Foo"]}), + rfunc_args_and_kwargs=([], {"includes": ["Foo"]}), + ) + + assert_exceptions_equal( + lfunc=pdf.select_dtypes, + rfunc=gdf.select_dtypes, + lfunc_args_and_kwargs=( + [], + {"exclude": np.number, "include": np.number}, + ), + rfunc_args_and_kwargs=( + [], + {"exclude": np.number, "include": np.number}, + ), + ) + + gdf = cudf.DataFrame( + {"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]} + ) + pdf = gdf.to_pandas() + assert_eq( + pdf.select_dtypes(include=["object", "int", "category"]), + gdf.select_dtypes(include=["object", "int", "category"]), + ) + assert_eq( + pdf.select_dtypes(include=["object"], exclude=["category"]), + gdf.select_dtypes(include=["object"], exclude=["category"]), + ) + + gdf = cudf.DataFrame({"a": range(10), "b": range(10, 20)}) + pdf = gdf.to_pandas() + assert_eq( + pdf.select_dtypes(include=["category"]), + gdf.select_dtypes(include=["category"]), + ) + assert_eq( + pdf.select_dtypes(include=["float"]), + gdf.select_dtypes(include=["float"]), + ) + assert_eq( + pdf.select_dtypes(include=["object"]), + gdf.select_dtypes(include=["object"]), + ) + assert_eq( + pdf.select_dtypes(include=["int"]), gdf.select_dtypes(include=["int"]) + ) + assert_eq( + pdf.select_dtypes(exclude=["float"]), + gdf.select_dtypes(exclude=["float"]), + ) + assert_eq( + pdf.select_dtypes(exclude=["object"]), + gdf.select_dtypes(exclude=["object"]), + ) + assert_eq( + pdf.select_dtypes(include=["int"], exclude=["object"]), + gdf.select_dtypes(include=["int"], exclude=["object"]), + ) + + assert_exceptions_equal( + lfunc=pdf.select_dtypes, + rfunc=gdf.select_dtypes, + ) + + gdf = cudf.DataFrame( + {"a": cudf.Series([], dtype="int"), "b": cudf.Series([], dtype="str")} + ) + pdf = gdf.to_pandas() + assert_eq( + pdf.select_dtypes(exclude=["object"]), + gdf.select_dtypes(exclude=["object"]), + ) + assert_eq( + pdf.select_dtypes(include=["int"], exclude=["object"]), + gdf.select_dtypes(include=["int"], exclude=["object"]), + ) + + gdf = cudf.DataFrame( + {"int_col": [0, 1, 2], "list_col": [[1, 2], [3, 4], [5, 6]]} + ) + pdf = gdf.to_pandas() + assert_eq( + pdf.select_dtypes("int64"), + gdf.select_dtypes("int64"), + ) + + +def test_select_dtype_datetime(): + gdf = 
cudf.datasets.timeseries( + start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={"x": int} + ) + gdf = gdf.reset_index() + pdf = gdf.to_pandas() + + assert_eq(pdf.select_dtypes("datetime64"), gdf.select_dtypes("datetime64")) + assert_eq( + pdf.select_dtypes(np.dtype("datetime64")), + gdf.select_dtypes(np.dtype("datetime64")), + ) + assert_eq( + pdf.select_dtypes(include="datetime64"), + gdf.select_dtypes(include="datetime64"), + ) + + +def test_select_dtype_datetime_with_frequency(): + gdf = cudf.datasets.timeseries( + start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={"x": int} + ) + gdf = gdf.reset_index() + pdf = gdf.to_pandas() + + assert_exceptions_equal( + pdf.select_dtypes, + gdf.select_dtypes, + (["datetime64[ms]"],), + (["datetime64[ms]"],), + ) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_set_index.py b/python/cudf/cudf/tests/dataframe/methods/test_set_index.py new file mode 100644 index 00000000000..e1a99df4119 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_set_index.py @@ -0,0 +1,105 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "index", + [ + "a", + ["a", "b"], + pd.CategoricalIndex(["I", "II", "III", "IV", "V"]), + pd.Series(["h", "i", "k", "l", "m"]), + ["b", pd.Index(["I", "II", "III", "IV", "V"])], + ["c", [11, 12, 13, 14, 15]], + pd.MultiIndex( + levels=[ + ["I", "II", "III", "IV", "V"], + ["one", "two", "three", "four", "five"], + ], + codes=[[0, 1, 2, 3, 4], [4, 3, 2, 1, 0]], + names=["col1", "col2"], + ), + pd.RangeIndex(0, 5), # corner case + [pd.Series(["h", "i", "k", "l", "m"]), pd.RangeIndex(0, 5)], + [ + pd.MultiIndex( + levels=[ + ["I", "II", "III", "IV", "V"], + ["one", "two", "three", "four", "five"], + ], + codes=[[0, 1, 2, 3, 4], [4, 3, 2, 1, 0]], + names=["col1", "col2"], + ), + pd.RangeIndex(0, 5), + ], + ], +) +@pytest.mark.parametrize("append", [True, False]) +def test_set_index(index, drop, append, inplace): + gdf = cudf.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": ["a", "b", "c", "d", "e"], + "c": [1.0, 2.0, 3.0, 4.0, 5.0], + } + ) + pdf = gdf.to_pandas() + + expected = pdf.set_index(index, inplace=inplace, drop=drop, append=append) + actual = gdf.set_index(index, inplace=inplace, drop=drop, append=append) + + if inplace: + expected = pdf + actual = gdf + assert_eq(expected, actual) + + +@pytest.mark.parametrize("index", ["a", pd.Index([1, 1, 2, 2, 3])]) +def test_set_index_verify_integrity(index): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2, 2, 5], + "b": ["a", "b", "c", "d", "e"], + "c": [1.0, 2.0, 3.0, 4.0, 5.0], + } + ) + with pytest.raises(ValueError): + gdf.set_index(index, verify_integrity=True) + + +def test_set_index_multi(drop): + nelem = 10 + rng = np.random.default_rng(seed=0) + a = np.arange(nelem) + rng.shuffle(a) + df = pd.DataFrame( + { + "a": a, + "b": rng.integers(0, 4, size=nelem), + "c": rng.uniform(low=0, high=4, size=nelem), + "d": rng.choice(["green", "black", "white"], nelem), + } + ) + df["e"] = df["d"].astype("category") + gdf = cudf.DataFrame.from_pandas(df) + + assert_eq(gdf.set_index("a", drop=drop), gdf.set_index(["a"], drop=drop)) + assert_eq( + df.set_index(["b", "c"], drop=drop), + gdf.set_index(["b", "c"], drop=drop), + ) + assert_eq( + df.set_index(["d", "b"], drop=drop), + gdf.set_index(["d", "b"], drop=drop), + ) + assert_eq( + df.set_index(["b", "d", "e"], drop=drop), + gdf.set_index(["b", "d", "e"], drop=drop), + ) diff 
--git a/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py b/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py new file mode 100644 index 00000000000..905c4622ade --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py @@ -0,0 +1,147 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) +from cudf.testing import assert_eq + + +def test_dataframe_empty_sort_index(): + pdf = pd.DataFrame({"x": []}) + gdf = cudf.DataFrame.from_pandas(pdf) + + expect = pdf.sort_index() + got = gdf.sort_index() + + assert_eq(expect, got, check_index_type=True) + + +@pytest.mark.parametrize( + "index", + [ + pd.RangeIndex(0, 3, 1), + [3.0, 1.0, np.nan], + # Test for single column MultiIndex + pd.MultiIndex.from_arrays( + [ + [2, 0, 1], + ] + ), + pd.RangeIndex(2, -1, -1), + ], +) +@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_dataframe_sort_index( + request, index, axis, ascending, inplace, ignore_index, na_position +): + if not PANDAS_GE_220 and axis in (1, "columns") and ignore_index: + pytest.skip(reason="Bug fixed in pandas-2.2") + + pdf = pd.DataFrame( + {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, + index=index, + ) + gdf = cudf.DataFrame.from_pandas(pdf) + + expected = pdf.sort_index( + axis=axis, + ascending=ascending, + ignore_index=ignore_index, + inplace=inplace, + na_position=na_position, + ) + got = gdf.sort_index( + axis=axis, + ascending=ascending, + ignore_index=ignore_index, + inplace=inplace, + na_position=na_position, + ) + + if inplace is True: + assert_eq(pdf, gdf, check_index_type=True) + else: + assert_eq(expected, got, check_index_type=True) + + +@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) +@pytest.mark.parametrize( + "level", + [ + 0, + "b", + 1, + ["b"], + "a", + ["a", "b"], + ["b", "a"], + [0, 1], + [1, 0], + [0, 2], + None, + ], +) +@pytest.mark.parametrize("na_position", ["first", "last"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_dataframe_multiindex_sort_index( + request, axis, level, ascending, inplace, ignore_index, na_position +): + request.applymarker( + pytest.mark.xfail( + condition=axis in (1, "columns") + and level is None + and not ascending + and ignore_index, + reason="https://github.com/pandas-dev/pandas/issues/57293", + ) + ) + pdf = pd.DataFrame( + { + "b": [1.0, 3.0, np.nan], + "a": [1, 4, 3], + 1: ["a", "b", "c"], + "e": [3, 1, 4], + "d": [1, 2, 8], + } + ).set_index(["b", "a", 1]) + gdf = cudf.DataFrame.from_pandas(pdf) + + expected = pdf.sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + na_position=na_position, + ignore_index=ignore_index, + ) + got = gdf.sort_index( + axis=axis, + level=level, + ascending=ascending, + ignore_index=ignore_index, + inplace=inplace, + na_position=na_position, + ) + + if inplace is True: + assert_eq(pdf, gdf) + else: + assert_eq(expected, got) + + +def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names(): + gdf = cudf.DataFrame([[1, 2, 3]], columns=["b", "a", "c"]) + result = gdf.sort_index(axis=1, ignore_index=True) + assert result._data.names == tuple(result._data.keys()) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_swaplevel.py 
b/python/cudf/cudf/tests/dataframe/methods/test_swaplevel.py new file mode 100644 index 00000000000..5b1a4b6c092 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_swaplevel.py @@ -0,0 +1,69 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_dataframe_swaplevel_axis_0(): + midx = cudf.MultiIndex( + levels=[ + ["Work"], + ["Final exam", "Coursework"], + ["History", "Geography"], + ["January", "February", "March", "April"], + ], + codes=[[0, 0, 0, 0], [0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 2, 3]], + names=["a", "b", "c", "d"], + ) + cdf = cudf.DataFrame( + { + "Grade": ["A", "B", "A", "C"], + "Percentage": ["95", "85", "95", "75"], + }, + index=midx, + ) + pdf = cdf.to_pandas() + + assert_eq(pdf.swaplevel(), cdf.swaplevel()) + assert_eq(pdf.swaplevel(), cdf.swaplevel(-2, -1, 0)) + assert_eq(pdf.swaplevel(1, 2), cdf.swaplevel(1, 2)) + assert_eq(cdf.swaplevel(2, 1), cdf.swaplevel(1, 2)) + assert_eq(pdf.swaplevel(-1, -3), cdf.swaplevel(-1, -3)) + assert_eq(pdf.swaplevel("a", "b", 0), cdf.swaplevel("a", "b", 0)) + assert_eq(cdf.swaplevel("a", "b"), cdf.swaplevel("b", "a")) + + +def test_dataframe_swaplevel_TypeError(): + cdf = cudf.DataFrame( + {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] + ) + + with pytest.raises(TypeError): + cdf.swaplevel() + + +def test_dataframe_swaplevel_axis_1(): + midx = cudf.MultiIndex( + levels=[ + ["b", "a"], + ["bb", "aa"], + ["bbb", "aaa"], + ], + codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 0, 1]], + names=[None, "a", "b"], + ) + cdf = cudf.DataFrame( + data=[[45, 30, 100, 90], [200, 100, 50, 80]], + columns=midx, + ) + pdf = cdf.to_pandas() + + assert_eq(pdf.swaplevel(1, 2, 1), cdf.swaplevel(1, 2, 1)) + assert_eq(pdf.swaplevel("a", "b", 1), cdf.swaplevel("a", "b", 1)) + assert_eq(cdf.swaplevel(2, 1, 1), cdf.swaplevel(1, 2, 1)) + assert_eq(pdf.swaplevel(0, 2, 1), cdf.swaplevel(0, 2, 1)) + assert_eq(pdf.swaplevel(2, 0, 1), cdf.swaplevel(2, 0, 1)) + assert_eq(cdf.swaplevel("a", "a", 1), cdf.swaplevel("b", "b", 1)) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py index c033efed0b0..2d3174a8225 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py @@ -1,5 +1,6 @@ # Copyright (c) 2023-2025, NVIDIA CORPORATION. 
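+# Note on `preserve_index` (it mirrors pyarrow.Table.from_pandas): True always +# writes the index out as extra table column(s), False drops it, and None +# keeps it only when it carries more information than a default RangeIndex. +# A rough sketch of the expectation, not an exact repr: +# >>> cudf.DataFrame({"x": [1, 2]}).to_arrow(preserve_index=True) +# ... a pyarrow.Table with columns "__index_level_0__" and "x" ...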
+import numpy as np import pandas as pd import pyarrow as pa import pytest @@ -8,7 +9,11 @@ from cudf.testing import assert_eq -@pytest.mark.parametrize("preserve_index", [False, True, None]) +@pytest.fixture(params=[False, True, None]) +def preserve_index(request): + return request.param + + def test_dataframe_to_arrow_preserve_index(preserve_index): df = cudf.DataFrame({"x": ["cat", "dog"] * 5}) pf = df.to_pandas() @@ -23,3 +28,112 @@ def test_datetime_to_arrow(datetime_types_as_str): assert_eq( gdf, cudf.DataFrame.from_arrow(gdf.to_arrow(preserve_index=False)) ) + + +def test_arrow_pandas_compat(preserve_index): + data = {"a": range(10), "b": range(10)} + pdf = pd.DataFrame(data, index=pd.Index(np.arange(10), name="z")) + gdf = cudf.DataFrame(data, index=cudf.Index(np.arange(10), name="z")) + + pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index) + gdf_arrow_table = gdf.to_arrow(preserve_index=preserve_index) + + assert pa.Table.equals(pdf_arrow_table, gdf_arrow_table) + + gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) + pdf2 = pdf_arrow_table.to_pandas() + + assert_eq(pdf2, gdf2) + pdf.columns.name = "abc" + pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index) + + gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) + pdf2 = pdf_arrow_table.to_pandas() + assert_eq(pdf2, gdf2) + + +@pytest.mark.parametrize( + "index", + [ + None, + cudf.RangeIndex(3, name="a"), + "a", + "b", + ["a", "b"], + cudf.RangeIndex(0, 5, 2, name="a"), + ], +) +def test_arrow_round_trip(preserve_index, index): + data = {"a": [4, 5, 6], "b": ["cat", "dog", "bird"]} + if isinstance(index, (list, str)): + gdf = cudf.DataFrame(data).set_index(index) + else: + gdf = cudf.DataFrame(data, index=index) + + table = gdf.to_arrow(preserve_index=preserve_index) + table_pd = pa.Table.from_pandas( + gdf.to_pandas(), preserve_index=preserve_index + ) + + gdf_out = cudf.DataFrame.from_arrow(table) + pdf_out = table_pd.to_pandas() + + assert_eq(gdf_out, pdf_out) + + +@pytest.mark.parametrize("nelem", [0, 2]) +def test_to_arrow(nelem, all_supported_types_as_str): + if all_supported_types_as_str in {"category", "str"}: + pytest.skip(f"Test not applicable with {all_supported_types_as_str}") + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + { + "a": rng.integers(0, 1000, nelem).astype( + all_supported_types_as_str + ), + "b": rng.integers(0, 1000, nelem).astype( + all_supported_types_as_str + ), + } + ) + gdf = cudf.DataFrame.from_pandas(df) + + pa_df = pa.Table.from_pandas( + df, preserve_index=False + ).replace_schema_metadata(None) + + pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) + + assert isinstance(pa_gdf, pa.Table) + assert pa.Table.equals(pa_df, pa_gdf) + + pa_s = pa.Array.from_pandas(df.a) + pa_gs = gdf["a"].to_arrow() + + assert isinstance(pa_gs, pa.Array) + assert pa.Array.equals(pa_s, pa_gs) + + pa_i = pa.Array.from_pandas(df.index) + pa_gi = gdf.index.to_arrow() + + assert isinstance(pa_gi, pa.Array) + assert pa.Array.equals(pa_i, pa_gi) + + +def test_to_arrow_categorical(): + df = pd.DataFrame({"a": pd.Series(["a", "b", "c"], dtype="category")}) + gdf = cudf.DataFrame.from_pandas(df) + + pa_df = pa.Table.from_pandas( + df, preserve_index=False + ).replace_schema_metadata(None) + pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) + + assert isinstance(pa_gdf, pa.Table) + assert pa.Table.equals(pa_df, pa_gdf) + + pa_s = pa.Array.from_pandas(df.a) + pa_gs = gdf["a"].to_arrow() + + assert isinstance(pa_gs, pa.Array) + 
assert pa.Array.equals(pa_s, pa_gs) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_cupy.py b/python/cudf/cudf/tests/dataframe/methods/test_to_cupy.py new file mode 100644 index 00000000000..2d47bd17840 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_cupy.py @@ -0,0 +1,136 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import expand_bits_to_bytes, random_bitmask + + +def test_empty_dataframe_to_cupy(): + df = cudf.DataFrame() + + # Check fully empty dataframe. + mat = df.to_cupy() + assert mat.shape == (0, 0) + mat = df.to_numpy() + assert mat.shape == (0, 0) + + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "a": rng.random(10), + "b": rng.random(10), + "c": rng.random(10), + } + ) + + # Check all columns in empty dataframe. + mat = df.head(0).to_cupy() + assert mat.shape == (0, 3) + + +def test_dataframe_to_cupy(): + nelem = 123 + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "a": rng.random(nelem), + "b": rng.random(nelem), + "c": rng.random(nelem), + "d": rng.random(nelem), + } + ) + + # Check all columns + mat = df.to_cupy() + assert mat.shape == (nelem, 4) + assert mat.strides == (8, 984) + + mat = df.to_numpy() + assert mat.shape == (nelem, 4) + assert mat.strides == (8, 984) + for i, k in enumerate(df.columns): + np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) + + # Check column subset + mat = df[["a", "c"]].to_cupy().get() + assert mat.shape == (nelem, 2) + + for i, k in enumerate("ac"): + np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) + + +@pytest.mark.parametrize("has_nulls", [False, True]) +@pytest.mark.parametrize("use_na_value", [False, True]) +def test_dataframe_to_cupy_single_column(has_nulls, use_na_value): + nelem = 10 + data = np.arange(nelem, dtype=np.float64) + + if has_nulls: + data = data.astype("object") + data[::2] = None + + df = cudf.DataFrame({"a": data}) + + if has_nulls and not use_na_value: + with pytest.raises(ValueError, match="Column must have no nulls"): + df.to_cupy() + return + + na_value = 0.0 if use_na_value else None + expected = ( + cp.asarray(df["a"].fillna(na_value)) + if has_nulls + else cp.asarray(df["a"]) + ) + result = df.to_cupy(na_value=na_value) + assert result.shape == (nelem, 1) + assert_eq(result.ravel(), expected) + + +def test_dataframe_to_cupy_null_values(): + df = cudf.DataFrame() + + nelem = 123 + na = -10000 + + refvalues = {} + rng = np.random.default_rng(seed=0) + for k in "abcd": + df[k] = data = rng.random(nelem) + bitmask = random_bitmask(nelem) + df[k] = df[k]._column.set_mask(bitmask) + boolmask = np.asarray( + expand_bits_to_bytes(bitmask)[:nelem], dtype=np.bool_ + ) + data[~boolmask] = na + refvalues[k] = data + + # Check null value causes error + with pytest.raises(ValueError): + df.to_cupy() + with pytest.raises(ValueError): + df.to_numpy() + + for k in df.columns: + df[k] = df[k].fillna(na) + + mat = df.to_numpy() + for i, k in enumerate(df.columns): + np.testing.assert_array_equal(refvalues[k], mat[:, i]) + + +@pytest.mark.parametrize("method", ["to_cupy", "to_numpy"]) +@pytest.mark.parametrize("value", [1, True, 1.5]) +@pytest.mark.parametrize("constructor", ["DataFrame", "Series"]) +def test_to_array_categorical(method, value, constructor): + data = [value] + expected = getattr(pd, constructor)(data, dtype="category").to_numpy() + result = getattr( + getattr(cudf, 
constructor)(data, dtype="category"), method + )() + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_transpose.py b/python/cudf/cudf/tests/dataframe/methods/test_transpose.py index bb64e6d4896..96ef5deb49a 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_transpose.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_transpose.py @@ -1,7 +1,9 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import string - +import numpy as np import pandas as pd +import pytest import cudf from cudf.testing import assert_eq @@ -14,3 +16,84 @@ def test_multiindex_transpose(): ) gdf = cudf.from_pandas(pdf) assert_eq(pdf.transpose(), gdf.transpose()) + + +@pytest.mark.parametrize("num_cols", [1, 3]) +@pytest.mark.parametrize("num_rows", [1, 4]) +@pytest.mark.parametrize("nulls", ["none", "some", "all"]) +def test_dataframe_transpose( + nulls, num_cols, num_rows, all_supported_types_as_str +): + # In case of `bool` dtype: pandas <= 1.2.5 type-casts + # a boolean series to `float64` series if a `np.nan` is assigned to it: + # >>> s = pd.Series([True, False, True]) + # >>> s + # 0 True + # 1 False + # 2 True + # dtype: bool + # >>> s[[2]] = np.nan + # >>> s + # 0 1.0 + # 1 0.0 + # 2 NaN + # dtype: float64 + # In pandas >= 1.3.2 this behavior is fixed: + # >>> s = pd.Series([True, False, True]) + # >>> s + # 0 True + # 1 False + # 2 True + # dtype: bool + # >>> s[[2]] = np.nan + # >>> s + # 0 True + # 1 False + # 2 NaN + # dtype: object + # In cudf we change `object` dtype to `str` type - for which there + # is no transpose implemented yet. Hence we need to test transpose + # against pandas nullable types as they are the ones that closely + # resemble `cudf` dtypes behavior. + if all_supported_types_as_str in {"category", "str"}: + pytest.skip(f"Test not applicable with {all_supported_types_as_str}") + pdf = pd.DataFrame() + rng = np.random.default_rng(seed=0) + null_rep = ( + np.nan + if all_supported_types_as_str in ["float32", "float64"] + else None + ) + np_dtype = all_supported_types_as_str + dtype = np.dtype(all_supported_types_as_str) + dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.get(dtype, dtype) + for i in range(num_cols): + colname = string.ascii_lowercase[i] + data = pd.Series( + rng.integers(0, 26, num_rows).astype(np_dtype), + dtype=dtype, + ) + if nulls == "some": + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) + if len(idx): + data[idx] = null_rep + elif nulls == "all": + data[:] = null_rep + pdf[colname] = data + + gdf = cudf.DataFrame.from_pandas(pdf) + + got_function = gdf.transpose() + got_property = gdf.T + + expect = pdf.transpose() + nullable = dtype.kind not in "Mm" + + assert_eq(expect, got_function.to_pandas(nullable=nullable)) + assert_eq(expect, got_property.to_pandas(nullable=nullable)) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_truncate.py b/python/cudf/cudf/tests/dataframe/methods/test_truncate.py new file mode 100644 index 00000000000..640a26fe562 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_truncate.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
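+# `truncate` keeps the rows (axis=0) or columns (axis=1) whose labels lie in +# the closed interval [before, after]; the axis being truncated must already +# be sorted. A rough sketch of the expectation exercised below: +# >>> cudf.DataFrame({"A": list("abcde")}, index=[1, 2, 3, 4, 5]).truncate( +# ... before=2, after=4 +# ... ) # -> the rows labelled 2, 3 and 4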
+ + +import cudf +from cudf.testing import assert_eq + + +def test_dataframe_truncate_axis_0(): + df = cudf.DataFrame( + { + "A": ["a", "b", "c", "d", "e"], + "B": ["f", "g", "h", "i", "j"], + "C": ["k", "l", "m", "n", "o"], + }, + index=[1, 2, 3, 4, 5], + ) + pdf = df.to_pandas() + + expected = pdf.truncate(before=2, after=4, axis="index") + actual = df.truncate(before=2, after=4, axis="index") + assert_eq(actual, expected) + + expected = pdf.truncate(before=1, after=4, axis=0) + actual = df.truncate(before=1, after=4, axis=0) + assert_eq(expected, actual) + + +def test_dataframe_truncate_axis_1(): + df = cudf.DataFrame( + { + "A": ["a", "b", "c", "d", "e"], + "B": ["f", "g", "h", "i", "j"], + "C": ["k", "l", "m", "n", "o"], + }, + index=[1, 2, 3, 4, 5], + ) + pdf = df.to_pandas() + + expected = pdf.truncate(before="A", after="B", axis="columns") + actual = df.truncate(before="A", after="B", axis="columns") + assert_eq(actual, expected) + + expected = pdf.truncate(before="A", after="B", axis=1) + actual = df.truncate(before="A", after="B", axis=1) + assert_eq(actual, expected) + + +def test_dataframe_truncate_datetimeindex(): + dates = cudf.date_range( + "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s" + ) + df = cudf.DataFrame(data={"A": 1, "B": 2}, index=dates) + pdf = df.to_pandas() + expected = pdf.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ) + actual = df.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ) + + assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/dataframe/test_attributes.py b/python/cudf/cudf/tests/dataframe/test_attributes.py new file mode 100644 index 00000000000..afe2dbc10c4 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/test_attributes.py @@ -0,0 +1,56 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2]}, + {"a": [1, 2, 3], "b": [3, 4, 5]}, + {"a": [1, 2, 3, 4], "b": [3, 4, 5, 6], "c": [1, 3, 5, 7]}, + {"a": [np.nan, 2, 3, 4], "b": [3, 4, np.nan, 6], "c": [1, 3, 5, 7]}, + {1: [1, 2, 3], 2: [3, 4, 5]}, + {"a": [1, None, None], "b": [3, np.nan, np.nan]}, + {1: ["a", "b", "c"], 2: ["q", "w", "u"]}, + {1: ["a", np.nan, "c"], 2: ["q", None, "u"]}, + {}, + {1: [], 2: [], 3: []}, + [1, 2, 3], + ], +) +def test_axes(data): + csr = cudf.DataFrame(data) + psr = pd.DataFrame(data) + + expected = psr.axes + actual = csr.axes + + for e, a in zip(expected, actual, strict=True): + assert_eq(e, a, exact=False) + + +def test_iter(): + data = {"a": [1, 2, 3], "b": [4, 5, 6]} + pdf = pd.DataFrame(data) + gdf = cudf.DataFrame(data) + assert list(pdf) == list(gdf) + + +def test_column_assignment(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + new_cols = ["q", "r", "s"] + gdf.columns = new_cols + assert list(gdf.columns) == new_cols + + +def test_ndim(): + pdf = pd.DataFrame({"x": range(5), "y": range(5, 10)}) + gdf = cudf.DataFrame.from_pandas(pdf) + assert pdf.ndim == gdf.ndim diff --git a/python/cudf/cudf/tests/dataframe/test_constructors.py b/python/cudf/cudf/tests/dataframe/test_constructors.py index 6d45334717f..59463b812a0 100644 --- a/python/cudf/cudf/tests/dataframe/test_constructors.py +++ b/python/cudf/cudf/tests/dataframe/test_constructors.py @@ -1,12 +1,408 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
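+# The alignment tests below lean on pandas constructor semantics: building a +# DataFrame from a dict of Series with differing indexes takes the union of +# the indexes as the row labels and fills positions missing from a given +# Series with nulls, e.g. (a hypothetical sketch): +# >>> cudf.DataFrame({"a": cudf.Series([1.0], index=[0]), +# ... "b": cudf.Series([2.0], index=[1])}) +# a b +# 0 1.0 <NA> +# 1 <NA> 2.0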
+from contextlib import nullcontext as does_not_raise + +import cupy as cp +import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf from cudf.testing import assert_eq +def test_init_via_list_of_tuples(): + data = [ + (5, "cats", "jump", np.nan), + (2, "dogs", "dig", 7.5), + (3, "cows", "moo", -2.1, "occasionally"), + ] + + pdf = pd.DataFrame(data) + gdf = cudf.DataFrame(data) + + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize("columns", [["a", "b"], pd.Series(["a", "b"])]) +def test_init_via_list_of_series(columns): + data = [pd.Series([1, 2]), pd.Series([3, 4])] + + pdf = pd.DataFrame(data, columns=columns) + gdf = cudf.DataFrame(data, columns=columns) + + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize("index", [None, [0, 1, 2]]) +def test_init_with_missing_columns(index): + """Test initialization when columns and data keys are disjoint.""" + data = {"a": [1, 2, 3], "b": [2, 3, 4]} + columns = ["c", "d"] + + pdf = pd.DataFrame(data, columns=columns, index=index) + gdf = cudf.DataFrame(data, columns=columns, index=index) + + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize("rows", [0, 1, 2, 100]) +def test_init_via_list_of_empty_tuples(rows): + data = [()] * rows + + pdf = pd.DataFrame(data) + gdf = cudf.DataFrame(data) + + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "dict_of_series", + [ + {"a": pd.Series([1.0, 2.0, 3.0])}, + {"a": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6])}, + { + "a": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), + "b": pd.Series([1.0, 2.0, 4.0], index=[1, 2, 3]), + }, + {"a": [1, 2, 3], "b": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6])}, + { + "a": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]), + "b": pd.Series([1.0, 2.0, 4.0], index=["c", "d", "e"]), + }, + { + "a": pd.Series( + ["a", "b", "c"], + index=pd.MultiIndex.from_tuples([(1, 2), (1, 3), (2, 3)]), + ), + "b": pd.Series( + ["a", " b", "d"], + index=pd.MultiIndex.from_tuples([(1, 2), (1, 3), (2, 3)]), + ), + }, + ], +) +def test_init_from_series_align(dict_of_series): + pdf = pd.DataFrame(dict_of_series) + gdf = cudf.DataFrame(dict_of_series) + + assert_eq(pdf, gdf) + + for key in dict_of_series: + if isinstance(dict_of_series[key], pd.Series): + dict_of_series[key] = cudf.Series(dict_of_series[key]) + + gdf = cudf.DataFrame(dict_of_series) + + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + ("dict_of_series", "expectation"), + [ + ( + { + "a": pd.Series(["a", "b", "c"], index=[4, 4, 5]), + "b": pd.Series(["a", "b", "c"], index=[4, 5, 6]), + }, + pytest.raises( + ValueError, match="Cannot align indices with non-unique values" + ), + ), + ( + { + "a": pd.Series(["a", "b", "c"], index=[4, 4, 5]), + "b": pd.Series(["a", "b", "c"], index=[4, 4, 5]), + }, + does_not_raise(), + ), + ], +) +def test_init_from_series_align_nonunique(dict_of_series, expectation): + with expectation: + gdf = cudf.DataFrame(dict_of_series) + + if isinstance(expectation, does_not_raise): + pdf = pd.DataFrame(dict_of_series) + assert_eq(pdf, gdf) + + +def test_init_unaligned_with_index(): + pdf = pd.DataFrame( + { + "a": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), + "b": pd.Series([1.0, 2.0, 3.0], index=[1, 2, 3]), + }, + index=[7, 8, 9], + ) + gdf = cudf.DataFrame( + { + "a": cudf.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), + "b": cudf.Series([1.0, 2.0, 3.0], index=[1, 2, 3]), + }, + index=[7, 8, 9], + ) + + assert_eq(pdf, gdf, check_dtype=False) + + +def test_init_series_list_columns_unsort(): + pseries = [ + pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] + gseries = 
[ + cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) + ] + pdf = pd.DataFrame(pseries) + gdf = cudf.DataFrame(gseries) + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize("nelem", [0, 10]) +@pytest.mark.parametrize("nchunks", [1, 5]) +def test_from_arrow_chunked_arrays(nelem, nchunks, numeric_types_as_str): + rng = np.random.default_rng(seed=0) + np_list_data = [ + rng.integers(0, 100, nelem).astype(numeric_types_as_str) + for i in range(nchunks) + ] + pa_chunk_array = pa.chunked_array(np_list_data) + + expect = pa_chunk_array.to_pandas() + got = cudf.Series(pa_chunk_array) + + assert_eq(expect, got) + + np_list_data2 = [ + rng.integers(0, 100, nelem).astype(numeric_types_as_str) + for i in range(nchunks) + ] + pa_chunk_array2 = pa.chunked_array(np_list_data2) + pa_table = pa.Table.from_arrays( + [pa_chunk_array, pa_chunk_array2], names=["a", "b"] + ) + + expect = pa_table.to_pandas() + got = cudf.DataFrame.from_arrow(pa_table) + + assert_eq(expect, got) + + +def test_1row_arrow_table(): + data = [pa.array([0]), pa.array([1])] + batch = pa.RecordBatch.from_arrays(data, ["f0", "f1"]) + table = pa.Table.from_batches([batch]) + + expect = table.to_pandas() + got = cudf.DataFrame.from_arrow(table) + assert_eq(expect, got) + + +def test_arrow_handle_no_index_name(): + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + gdf_arrow = gdf.to_arrow() + pdf_arrow = pa.Table.from_pandas(pdf) + assert pa.Table.equals(pdf_arrow, gdf_arrow) + + got = cudf.DataFrame.from_arrow(gdf_arrow) + expect = pdf_arrow.to_pandas() + assert_eq(expect, got) + + +def test_pandas_non_contiguous(): + rng = np.random.default_rng(seed=0) + arr1 = rng.random(size=(5000, 10)) + assert arr1.flags["C_CONTIGUOUS"] is True + df = pd.DataFrame(arr1) + for col in df.columns: + assert df[col].values.flags["C_CONTIGUOUS"] is False + + gdf = cudf.DataFrame.from_pandas(df) + assert_eq(gdf.to_pandas(), df) + + +def test_from_records(numeric_types_as_str): + h_ary = np.ndarray(shape=(10, 4), dtype=numeric_types_as_str) + rec_ary = h_ary.view(np.recarray) + + gdf = cudf.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) + df = pd.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) + assert isinstance(gdf, cudf.DataFrame) + assert_eq(df, gdf) + + gdf = cudf.DataFrame.from_records(rec_ary) + df = pd.DataFrame.from_records(rec_ary) + assert isinstance(gdf, cudf.DataFrame) + assert_eq(df, gdf) + + +@pytest.mark.parametrize("columns", [None, ["first", "second", "third"]]) +@pytest.mark.parametrize( + "index", + [ + None, + ["first", "second"], + "name", + "age", + "weight", + [10, 11], + ["abc", "xyz"], + ], +) +def test_from_records_index(columns, index): + rec_ary = np.array( + [("Rex", 9, 81.0), ("Fido", 3, 27.0)], + dtype=[("name", "U10"), ("age", "i4"), ("weight", "f4")], + ) + gdf = cudf.DataFrame.from_records(rec_ary, columns=columns, index=index) + df = pd.DataFrame.from_records(rec_ary, columns=columns, index=index) + assert isinstance(gdf, cudf.DataFrame) + assert_eq(df, gdf) + + +def test_dataframe_construction_from_cupy_arrays(): + h_ary = np.array([[1, 2, 3], [4, 5, 6]], np.int32) + d_ary = cp.asarray(h_ary) + + gdf = cudf.DataFrame(d_ary, columns=["a", "b", "c"]) + df = pd.DataFrame(h_ary, columns=["a", "b", "c"]) + assert isinstance(gdf, cudf.DataFrame) + + assert_eq(df, gdf) + + gdf = cudf.DataFrame(d_ary) + df = pd.DataFrame(h_ary) + assert isinstance(gdf, cudf.DataFrame) + + assert_eq(df, gdf) + + gdf = 
cudf.DataFrame(d_ary, index=["a", "b"]) + df = pd.DataFrame(h_ary, index=["a", "b"]) + assert isinstance(gdf, cudf.DataFrame) + + assert_eq(df, gdf) + + gdf = cudf.DataFrame(d_ary) + gdf = gdf.set_index(keys=0, drop=False) + df = pd.DataFrame(h_ary) + df = df.set_index(keys=0, drop=False) + assert isinstance(gdf, cudf.DataFrame) + + assert_eq(df, gdf) + + gdf = cudf.DataFrame(d_ary) + gdf = gdf.set_index(keys=1, drop=False) + df = pd.DataFrame(h_ary) + df = df.set_index(keys=1, drop=False) + assert isinstance(gdf, cudf.DataFrame) + + assert_eq(df, gdf) + + +def test_dataframe_cupy_wrong_dimensions(): + d_ary = cp.empty((2, 3, 4), dtype=np.int32) + with pytest.raises( + ValueError, match="records dimension expected 1 or 2 but found: 3" + ): + cudf.DataFrame(d_ary) + + +def test_dataframe_cupy_array_wrong_index(): + d_ary = cp.empty((2, 3), dtype=np.int32) + + with pytest.raises(ValueError): + cudf.DataFrame(d_ary, index=["a"]) + + with pytest.raises(TypeError): + cudf.DataFrame(d_ary, index="a") + + +def test_index_in_dataframe_constructor(): + a = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) + b = cudf.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) + + assert_eq(a, b) + assert_eq(a.loc[4:], b.loc[4:]) + + +@pytest.mark.parametrize("nelem", [0, 2]) +def test_from_arrow(nelem, all_supported_types_as_str): + if all_supported_types_as_str in {"category", "str"}: + pytest.skip(f"Test not applicable with {all_supported_types_as_str}") + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + { + "a": rng.integers(0, 1000, nelem).astype( + all_supported_types_as_str + ), + "b": rng.integers(0, 1000, nelem).astype( + all_supported_types_as_str + ), + } + ) + padf = pa.Table.from_pandas( + df, preserve_index=False + ).replace_schema_metadata(None) + gdf = cudf.DataFrame.from_arrow(padf) + assert isinstance(gdf, cudf.DataFrame) + + assert_eq(df, gdf) + + s = pa.Array.from_pandas(df.a) + gs = cudf.Series.from_arrow(s) + assert isinstance(gs, cudf.Series) + + # Compare through NumPy: pyarrow's to_pandas() returns a NumPy-backed + # result, which sidesteps dtype differences between Arrow and cudf + np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy()) + + +def test_from_arrow_chunked_categories(): + # Verify that categories are properly deduplicated across chunked arrays. 
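+ # For instance, two chunks that both dictionary-encode against + # ["foo", "bar", "baz"] must yield a single categorical dtype with + # exactly those three categories, not a concatenated six-entry dictionary.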
+ indices = pa.array([0, 1, 0, 1, 2, 0, None, 2]) + dictionary = pa.array(["foo", "bar", "baz"]) + dict_array = pa.DictionaryArray.from_arrays(indices, dictionary) + chunked_array = pa.chunked_array([dict_array, dict_array]) + table = pa.table({"a": chunked_array}) + df = cudf.DataFrame.from_arrow(table) + final_dictionary = df["a"].dtype.categories.to_arrow().to_pylist() + assert sorted(final_dictionary) == sorted(dictionary.to_pylist()) + + +def test_from_scalar_typing(request, all_supported_types_as_str): + if all_supported_types_as_str in {"category", "str"}: + pytest.skip(f"Test not applicable with {all_supported_types_as_str}") + request.applymarker( + pytest.mark.xfail( + all_supported_types_as_str + in {"timedelta64[ms]", "timedelta64[us]", "timedelta64[ns]"}, + reason=f"{all_supported_types_as_str} incorrectly results in timedelta64[s]", + ) + ) + rng = np.random.default_rng(seed=0) + if all_supported_types_as_str == "datetime64[ms]": + scalar = ( + np.dtype("int64").type(rng.integers(0, 5)).astype("datetime64[ms]") + ) + elif all_supported_types_as_str.startswith("datetime64"): + scalar = np.datetime64("2020-01-01").astype("datetime64[ms]") + all_supported_types_as_str = "datetime64[ms]" + else: + scalar = np.dtype(all_supported_types_as_str).type(rng.integers(0, 5)) + + gdf = cudf.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": scalar, + } + ) + assert gdf["b"].dtype == np.dtype(all_supported_types_as_str) + assert len(gdf["b"]) == len(gdf["a"]) + + @pytest.mark.parametrize( "data1, data2", [(1, 2), (1.0, 2.0), (3, 4.0)], diff --git a/python/cudf/cudf/tests/general_functions/test_from_pandas.py b/python/cudf/cudf/tests/general_functions/test_from_pandas.py new file mode 100644 index 00000000000..a7e7172d8d8 --- /dev/null +++ b/python/cudf/cudf/tests/general_functions/test_from_pandas.py @@ -0,0 +1,21 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_from_pandas_function(): + pdf = pd.DataFrame({"x": [1, 2, 3]}) + gdf = cudf.from_pandas(pdf) + assert isinstance(gdf, cudf.DataFrame) + assert_eq(pdf, gdf) + + gdf = cudf.from_pandas(pdf.x) + assert isinstance(gdf, cudf.Series) + assert_eq(pdf.x, gdf) + + with pytest.raises(TypeError): + cudf.from_pandas(123) diff --git a/python/cudf/cudf/tests/series/methods/test_shift.py b/python/cudf/cudf/tests/series/methods/test_shift.py new file mode 100644 index 00000000000..ac99f879f15 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_shift.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
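+# `shift(n)` moves values by n positions along the index and nulls out the +# vacated slots; pandas upcasts integer results to float to hold NaN, while +# cudf keeps the original dtype and uses nulls, e.g. (a rough sketch): +# >>> cudf.Series([1, 2, 3]).shift(1) +# 0 <NA> +# 1 1 +# 2 2 +# dtype: int64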
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15]) +@pytest.mark.parametrize("data_empty", [False, True]) +def test_shift(numeric_types_as_str, period, data_empty): + # TODO: this function currently tests series.shift() + # but should instead test dataframe.shift() + if data_empty: + data = None + else: + data = np.arange(10, dtype=numeric_types_as_str) + + gs = cudf.Series(data) + ps = pd.Series(data) + + shifted_outcome = gs.shift(period) + expected_outcome = ps.shift(period) + + # pandas uses NaN to signal missing values and force-converts the + # result columns to float dtypes + if data_empty: + assert_eq( + shifted_outcome, + expected_outcome, + check_index_type=False, + check_dtype=False, + ) + else: + assert_eq(shifted_outcome, expected_outcome, check_dtype=False) diff --git a/python/cudf/cudf/tests/series/methods/test_to_frame.py b/python/cudf/cudf/tests/series/methods/test_to_frame.py new file mode 100644 index 00000000000..6217dec9a6a --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_to_frame.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("name", [None, False, "foo"]) +def test_to_frame(name): + gser = cudf.Series([1, 2, 3], name=name) + pser = pd.Series([1, 2, 3], name=name) + assert_eq(gser.to_frame(), pser.to_frame()) diff --git a/python/cudf/cudf/tests/series/methods/test_to_string.py b/python/cudf/cudf/tests/series/methods/test_to_string.py new file mode 100644 index 00000000000..7604d940844 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_to_string.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
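+# `Series.to_string()` is expected to match the repr of the equivalent pandas +# object; for an empty series pandas renders something like +# `Series([], dtype: object)`, so the assertions below compare against +# `repr(sr.to_pandas())`.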
+ +import cudf + + +def test_series_init_none(): + # test for creating empty series + # 1: without initializing + sr1 = cudf.Series() + got = sr1.to_string() + + expect = repr(sr1.to_pandas()) + assert got == expect + + # 2: Using `None` as an initializer + sr2 = cudf.Series(None) + got = sr2.to_string() + + expect = repr(sr2.to_pandas()) + assert got == expect diff --git a/python/cudf/cudf/tests/series/test_attributes.py b/python/cudf/cudf/tests/series/test_attributes.py index b79b1010cc3..80475505bc4 100644 --- a/python/cudf/cudf/tests/series/test_attributes.py +++ b/python/cudf/cudf/tests/series/test_attributes.py @@ -308,3 +308,9 @@ def test_error_values_datetime(): s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") with pytest.raises(NotImplementedError, match="cupy does not support"): s.values + + +def test_ndim(): + s = pd.Series(dtype="float64") + gs = cudf.Series() + assert s.ndim == gs.ndim diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index 342f60c1580..24bdcc6f894 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -640,6 +640,98 @@ def test_construct_all_pd_NA_with_dtype(nan_as_null): assert_eq(result, expected) +def test_to_from_arrow_nulls(all_supported_types_as_str): + if all_supported_types_as_str in {"category", "str"}: + pytest.skip(f"Test not applicable with {all_supported_types_as_str}") + data_type = all_supported_types_as_str + if data_type == "bool": + s1 = pa.array([True, None, False, None, True], type=data_type) + else: + dtype = np.dtype(data_type) + if dtype.type == np.datetime64: + time_unit, _ = np.datetime_data(dtype) + data_type = pa.timestamp(unit=time_unit) + elif dtype.type == np.timedelta64: + time_unit, _ = np.datetime_data(dtype) + data_type = pa.duration(unit=time_unit) + s1 = pa.array([1, None, 3, None, 5], type=data_type) + gs1 = cudf.Series.from_arrow(s1) + assert isinstance(gs1, cudf.Series) + # We have 64B padded buffers for nulls whereas Arrow returns a minimal + # number of bytes, so only check the first byte in this case + np.testing.assert_array_equal( + np.asarray(s1.buffers()[0]).view("u1")[0], + gs1._column.mask_array_view(mode="read").copy_to_host().view("u1")[0], + ) + assert pa.Array.equals(s1, gs1.to_arrow()) + + s2 = pa.array([None, None, None, None, None], type=data_type) + gs2 = cudf.Series.from_arrow(s2) + assert isinstance(gs2, cudf.Series) + # We have 64B padded buffers for nulls whereas Arrow returns a minimal + # number of bytes, so only check the first byte in this case + np.testing.assert_array_equal( + np.asarray(s2.buffers()[0]).view("u1")[0], + gs2._column.mask_array_view(mode="read").copy_to_host().view("u1")[0], + ) + assert pa.Array.equals(s2, gs2.to_arrow()) + + +def test_cuda_array_interface(numeric_and_bool_types_as_str): + np_data = np.arange(10).astype(numeric_and_bool_types_as_str) + cupy_data = cp.array(np_data) + pd_data = pd.Series(np_data) + + cudf_data = cudf.Series(cupy_data) + assert_eq(pd_data, cudf_data) + + gdf = cudf.DataFrame() + gdf["test"] = cupy_data + pd_data.name = "test" + assert_eq(pd_data, gdf["test"]) + + +@pytest.mark.parametrize("nan_as_null", [True, False]) +def test_series_list_nanasnull(nan_as_null): + data = [1.0, 2.0, 3.0, np.nan, None] + + expect = pa.array(data, from_pandas=nan_as_null) + got = cudf.Series(data, nan_as_null=nan_as_null).to_arrow() + + # Bug in Arrow 0.14.1 where NaNs aren't handled + expect = expect.cast("int64", safe=False) + 
got = got.cast("int64", safe=False) + + assert pa.Array.equals(expect, got) + + +@pytest.mark.parametrize("num_elements", [0, 10]) +@pytest.mark.parametrize("null_type", [np.nan, None, "mixed"]) +def test_series_all_null(num_elements, null_type): + if null_type == "mixed": + data = [] + data1 = [np.nan] * int(num_elements / 2) + data2 = [None] * int(num_elements / 2) + for idx in range(len(data1)): + data.append(data1[idx]) + data.append(data2[idx]) + else: + data = [null_type] * num_elements + + # Typecast Pandas because None will return `object` dtype + expect = pd.Series(data, dtype="float64") + got = cudf.Series(data, dtype="float64") + + assert_eq(expect, got) + + +@pytest.mark.parametrize("num_elements", [0, 10]) +def test_series_all_valid_nan(num_elements): + data = [np.nan] * num_elements + sr = cudf.Series(data, nan_as_null=False) + np.testing.assert_equal(sr.null_count, 0) + + def test_series_empty_dtype(): expected = pd.Series([]) actual = cudf.Series([]) @@ -1012,6 +1104,23 @@ def test_from_pandas_object_dtype_passed_dtype(klass): assert_eq(result, expected) +def test_series_basic(): + # Make series from buffer + a1 = np.arange(10, dtype=np.float64) + series = cudf.Series(a1) + assert len(series) == 10 + np.testing.assert_equal(series.to_numpy(), np.hstack([a1])) + + +def test_series_from_cupy_scalars(): + data = [0.1, 0.2, 0.3] + data_np = np.array(data) + data_cp = cp.array(data) + s_np = cudf.Series([data_np[0], data_np[2]]) + s_cp = cudf.Series([data_cp[0], data_cp[2]]) + assert_eq(s_np, s_cp) + + def test_to_dense_array(): rng = np.random.default_rng(seed=0) data = rng.random(8) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index ccf4c7de0d8..72601dd6a1a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,7 +1,6 @@ # Copyright (c) 2018-2025, NVIDIA CORPORATION. 
import array as arr -import contextlib import datetime import decimal import functools @@ -13,7 +12,7 @@ import textwrap import warnings from collections import OrderedDict, defaultdict, namedtuple -from contextlib import contextmanager, nullcontext as does_not_raise +from contextlib import contextmanager from copy import copy import cupy @@ -28,20 +27,17 @@ from cudf.api.extensions import no_default from cudf.core._compat import ( PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, PANDAS_VERSION, ) from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column.column import as_column -from cudf.errors import MixedTypeError -from cudf.testing import _utils as utils, assert_eq, assert_neq +from cudf.testing import _utils as utils, assert_eq from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES, assert_exceptions_equal, expect_warning_if, - gen_rand, ) from cudf.utils.dtypes import SIZE_TYPE_DTYPE @@ -103,41 +99,6 @@ def _hide_concat_empty_dtype_warning(): yield -def test_init_via_list_of_tuples(): - data = [ - (5, "cats", "jump", np.nan), - (2, "dogs", "dig", 7.5), - (3, "cows", "moo", -2.1, "occasionally"), - ] - - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("columns", [["a", "b"], pd.Series(["a", "b"])]) -def test_init_via_list_of_series(columns): - data = [pd.Series([1, 2]), pd.Series([3, 4])] - - pdf = cudf.DataFrame(data, columns=columns) - gdf = cudf.DataFrame(data, columns=columns) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("index", [None, [0, 1, 2]]) -def test_init_with_missing_columns(index): - """Test initialization when columns and data keys are disjoint.""" - data = {"a": [1, 2, 3], "b": [2, 3, 4]} - columns = ["c", "d"] - - pdf = cudf.DataFrame(data, columns=columns, index=index) - gdf = cudf.DataFrame(data, columns=columns, index=index) - - assert_eq(pdf, gdf) - - @pytest.fixture( params=[ pd.DataFrame( @@ -164,143 +125,6 @@ def na_data(request): return request.param -@pytest.mark.parametrize( - "rows", - [ - 0, - 1, - 2, - 100, - ], -) -def test_init_via_list_of_empty_tuples(rows): - data = [()] * rows - - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "dict_of_series", - [ - {"a": pd.Series([1.0, 2.0, 3.0])}, - {"a": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6])}, - { - "a": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), - "b": pd.Series([1.0, 2.0, 4.0], index=[1, 2, 3]), - }, - {"a": [1, 2, 3], "b": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6])}, - { - "a": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]), - "b": pd.Series([1.0, 2.0, 4.0], index=["c", "d", "e"]), - }, - { - "a": pd.Series( - ["a", "b", "c"], - index=pd.MultiIndex.from_tuples([(1, 2), (1, 3), (2, 3)]), - ), - "b": pd.Series( - ["a", " b", "d"], - index=pd.MultiIndex.from_tuples([(1, 2), (1, 3), (2, 3)]), - ), - }, - ], -) -def test_init_from_series_align(dict_of_series): - pdf = pd.DataFrame(dict_of_series) - gdf = cudf.DataFrame(dict_of_series) - - assert_eq(pdf, gdf) - - for key in dict_of_series: - if isinstance(dict_of_series[key], pd.Series): - dict_of_series[key] = cudf.Series(dict_of_series[key]) - - gdf = cudf.DataFrame(dict_of_series) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - ("dict_of_series", "expectation"), - [ - ( - { - "a": pd.Series(["a", "b", "c"], index=[4, 4, 5]), - "b": pd.Series(["a", "b", "c"], index=[4, 5, 6]), - }, - pytest.raises( - ValueError, match="Cannot align indices with non-unique values" - 
), - ), - ( - { - "a": pd.Series(["a", "b", "c"], index=[4, 4, 5]), - "b": pd.Series(["a", "b", "c"], index=[4, 4, 5]), - }, - does_not_raise(), - ), - ], -) -def test_init_from_series_align_nonunique(dict_of_series, expectation): - with expectation: - gdf = cudf.DataFrame(dict_of_series) - - if expectation == does_not_raise(): - pdf = pd.DataFrame(dict_of_series) - assert_eq(pdf, gdf) - - -def test_init_unaligned_with_index(): - pdf = pd.DataFrame( - { - "a": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), - "b": pd.Series([1.0, 2.0, 3.0], index=[1, 2, 3]), - }, - index=[7, 8, 9], - ) - gdf = cudf.DataFrame( - { - "a": cudf.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), - "b": cudf.Series([1.0, 2.0, 3.0], index=[1, 2, 3]), - }, - index=[7, 8, 9], - ) - - assert_eq(pdf, gdf, check_dtype=False) - - -def test_init_series_list_columns_unsort(): - pseries = [ - pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) - ] - gseries = [ - cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) - ] - pdf = pd.DataFrame(pseries) - gdf = cudf.DataFrame(gseries) - assert_eq(pdf, gdf) - - -def test_series_basic(): - # Make series from buffer - a1 = np.arange(10, dtype=np.float64) - series = cudf.Series(a1) - assert len(series) == 10 - np.testing.assert_equal(series.to_numpy(), np.hstack([a1])) - - -def test_series_from_cupy_scalars(): - data = [0.1, 0.2, 0.3] - data_np = np.array(data) - data_cp = cupy.array(data) - s_np = cudf.Series([data_np[0], data_np[2]]) - s_cp = cudf.Series([data_cp[0], data_cp[2]]) - assert_eq(s_np, s_cp) - - @pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]]) @pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]]) def test_concat_index(a, b): @@ -325,106 +149,6 @@ def test_concat_index(a, b): assert_eq(expected.index, actual.index) -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2]}, - {"a": [1, 2, 3], "b": [3, 4, 5]}, - {"a": [1, 2, 3, 4], "b": [3, 4, 5, 6], "c": [1, 3, 5, 7]}, - {"a": [np.nan, 2, 3, 4], "b": [3, 4, np.nan, 6], "c": [1, 3, 5, 7]}, - {1: [1, 2, 3], 2: [3, 4, 5]}, - {"a": [1, None, None], "b": [3, np.nan, np.nan]}, - {1: ["a", "b", "c"], 2: ["q", "w", "u"]}, - {1: ["a", np.nan, "c"], 2: ["q", None, "u"]}, - {}, - {1: [], 2: [], 3: []}, - [1, 2, 3], - ], -) -def test_axes(data): - csr = cudf.DataFrame(data) - psr = pd.DataFrame(data) - - expected = psr.axes - actual = csr.axes - - for e, a in zip(expected, actual, strict=True): - assert_eq(e, a, exact=False) - - -def test_dataframe_truncate_axis_0(): - df = cudf.DataFrame( - { - "A": ["a", "b", "c", "d", "e"], - "B": ["f", "g", "h", "i", "j"], - "C": ["k", "l", "m", "n", "o"], - }, - index=[1, 2, 3, 4, 5], - ) - pdf = df.to_pandas() - - expected = pdf.truncate(before=2, after=4, axis="index") - actual = df.truncate(before=2, after=4, axis="index") - assert_eq(actual, expected) - - expected = pdf.truncate(before=1, after=4, axis=0) - actual = df.truncate(before=1, after=4, axis=0) - assert_eq(expected, actual) - - -def test_dataframe_truncate_axis_1(): - df = cudf.DataFrame( - { - "A": ["a", "b", "c", "d", "e"], - "B": ["f", "g", "h", "i", "j"], - "C": ["k", "l", "m", "n", "o"], - }, - index=[1, 2, 3, 4, 5], - ) - pdf = df.to_pandas() - - expected = pdf.truncate(before="A", after="B", axis="columns") - actual = df.truncate(before="A", after="B", axis="columns") - assert_eq(actual, expected) - - expected = pdf.truncate(before="A", after="B", axis=1) - actual = df.truncate(before="A", after="B", axis=1) - assert_eq(actual, expected) - - -def test_dataframe_truncate_datetimeindex(): - 
dates = cudf.date_range( - "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s" - ) - df = cudf.DataFrame(data={"A": 1, "B": 2}, index=dates) - pdf = df.to_pandas() - expected = pdf.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ) - actual = df.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ) - - assert_eq(actual, expected) - - -def test_series_init_none(): - # test for creating empty series - # 1: without initializing - sr1 = cudf.Series() - got = sr1.to_string() - - expect = repr(sr1.to_pandas()) - assert got == expect - - # 2: Using `None` as an initializer - sr2 = cudf.Series(None) - got = sr2.to_string() - - expect = repr(sr2.to_pandas()) - assert got == expect - - def test_dataframe_basic(): rng = np.random.default_rng(seed=0) df = cudf.DataFrame() @@ -479,522 +203,82 @@ def test_dataframe_basic(): assert gdf["val"].isnull().all() -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "c": range(1, 11)}, - index=pd.Index( - ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], - name="custom_name", - ), - ), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), - ], -) -@pytest.mark.parametrize( - "columns", - [["a"], ["b"], "a", "b", ["a", "b"]], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_columns(pdf, columns, inplace): - pdf = pdf.copy() - gdf = cudf.from_pandas(pdf) - - expected = pdf.drop(columns=columns, inplace=inplace) - actual = gdf.drop(columns=columns, inplace=inplace) +def test_dataframe_column_add_drop_via_setitem(): + df = cudf.DataFrame() + data = np.asarray(range(10)) + df["a"] = data + df["b"] = data + assert tuple(df.columns) == ("a", "b") + del df["a"] + assert tuple(df.columns) == ("b",) + df["c"] = data + assert tuple(df.columns) == ("b", "c") + df["a"] = data + assert tuple(df.columns) == ("b", "c", "a") - if inplace: - expected = pdf - actual = gdf - assert_eq(expected, actual) +def test_dataframe_column_set_via_attr(): + data_0 = np.asarray([0, 2, 4, 5]) + data_1 = np.asarray([1, 4, 2, 3]) + data_2 = np.asarray([2, 0, 3, 0]) + df = cudf.DataFrame({"a": data_0, "b": data_1, "c": data_2}) + for i in range(10): + df.c = df.a + assert assert_eq(df.c, df.a, check_names=False) + assert tuple(df.columns) == ("a", "b", "c") -@pytest.mark.parametrize("obj", ["Index", "Series"]) -def test_drop_cudf_obj_columns(obj): - pdf = pd.DataFrame({"A": [1], "B": [1]}) - gdf = cudf.from_pandas(pdf) + df.c = df.b + assert assert_eq(df.c, df.b, check_names=False) + assert tuple(df.columns) == ("a", "b", "c") - columns = ["B"] - expected = pdf.drop(labels=getattr(pd, obj)(columns), axis=1) - actual = gdf.drop(columns=getattr(cudf, obj)(columns), axis=1) - assert_eq(expected, actual) +def test_dataframe_column_drop_via_attr(): + df = cudf.DataFrame({"a": []}) -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "c": range(1, 11)}, - index=pd.Index(list(range(10)), name="custom_name"), - ), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), - ], -) -@pytest.mark.parametrize( - "labels", - [ - [1], - [0], - 1, - 5, - [5, 9], - pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - pd.Index([0, 1, 8, 9], name="new name"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_labels_axis_0(pdf, labels, inplace): - pdf = pdf.copy() - gdf = cudf.from_pandas(pdf) + with pytest.raises(AttributeError): + del df.a - expected = pdf.drop(labels=labels, 
axis=0, inplace=inplace) - actual = gdf.drop(labels=labels, axis=0, inplace=inplace) + assert tuple(df.columns) == tuple("a") - if inplace: - expected = pdf - actual = gdf - assert_eq(expected, actual) +@pytest.mark.parametrize("nelem", [0, 10]) +def test_dataframe_astype(nelem): + df = cudf.DataFrame() + data = np.asarray(range(nelem), dtype=np.int32) + df["a"] = data + assert df["a"].dtype is np.dtype(np.int32) + df["b"] = df["a"].astype(np.float32) + assert df["b"].dtype is np.dtype(np.float32) + np.testing.assert_equal(df["a"].to_numpy(), df["b"].to_numpy()) -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame({"a": range(10), "b": range(10, 20), "c": range(1, 11)}), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), - pd.DataFrame( - { - "a": range(10), - "b": range(10, 20), - }, - index=pd.Index(list(range(10)), dtype="uint64"), - ), - ], -) -@pytest.mark.parametrize( - "index", - [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_index(pdf, index, inplace): - pdf = pdf.copy() - gdf = cudf.from_pandas(pdf) +def test_astype_dict(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["1", "2", "3"]}) + pdf = gdf.to_pandas() - expected = pdf.drop(index=index, inplace=inplace) - actual = gdf.drop(index=index, inplace=inplace) + assert_eq(pdf.astype({"a": "str"}), gdf.astype({"a": "str"})) + assert_eq( + pdf.astype({"a": "str", "b": np.int64}), + gdf.astype({"a": "str", "b": np.int64}), + ) - if inplace: - expected = pdf - actual = gdf - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "index,level", - [ - ("cow", 0), - ("lama", 0), - ("falcon", 0), - ("speed", 1), - ("weight", 1), - ("length", 1), - ("cow", None), - ( - "lama", - None, - ), - ( - "falcon", - None, - ), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_multiindex(pdf, index, level, inplace): - pdf = pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}, - index=pd.MultiIndex( - levels=[ - ["lama", "cow", "falcon"], - ["speed", "weight", "length"], - ], - codes=[ - [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], - [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], - ], - ), - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.drop(index=index, inplace=inplace, level=level) - actual = gdf.drop(index=index, inplace=inplace, level=level) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [{"c": range(1, 11)}, {"d": ["a", "v"] * 5}], -) -@pytest.mark.parametrize( - "labels", - [["a"], ["b"], "a", "b", ["a", "b"]], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_labels_axis_1(data, labels, inplace): - pdf = pd.DataFrame({"a": range(10), "b": range(10, 20), **data}) - gdf = cudf.from_pandas(pdf) - - expected = pdf.drop(labels=labels, axis=1, inplace=inplace) - actual = gdf.drop(labels=labels, axis=1, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -def test_dataframe_drop_error(): - df = cudf.DataFrame({"a": [1], "b": [2], "c": [3]}) - pdf = df.to_pandas() - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"columns": "d"}), - rfunc_args_and_kwargs=([], {"columns": "d"}), - ) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), - rfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), - ) - - 
assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - ) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"axis": 1}), - rfunc_args_and_kwargs=([], {"axis": 1}), - ) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([[2, 0]],), - rfunc_args_and_kwargs=([[2, 0]],), - ) - - -def test_dataframe_swaplevel_axis_0(): - midx = cudf.MultiIndex( - levels=[ - ["Work"], - ["Final exam", "Coursework"], - ["History", "Geography"], - ["January", "February", "March", "April"], - ], - codes=[[0, 0, 0, 0], [0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 2, 3]], - names=["a", "b", "c", "d"], - ) - cdf = cudf.DataFrame( - { - "Grade": ["A", "B", "A", "C"], - "Percentage": ["95", "85", "95", "75"], - }, - index=midx, - ) - pdf = cdf.to_pandas() - - assert_eq(pdf.swaplevel(), cdf.swaplevel()) - assert_eq(pdf.swaplevel(), cdf.swaplevel(-2, -1, 0)) - assert_eq(pdf.swaplevel(1, 2), cdf.swaplevel(1, 2)) - assert_eq(cdf.swaplevel(2, 1), cdf.swaplevel(1, 2)) - assert_eq(pdf.swaplevel(-1, -3), cdf.swaplevel(-1, -3)) - assert_eq(pdf.swaplevel("a", "b", 0), cdf.swaplevel("a", "b", 0)) - assert_eq(cdf.swaplevel("a", "b"), cdf.swaplevel("b", "a")) - - -def test_dataframe_swaplevel_TypeError(): - cdf = cudf.DataFrame( - {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] - ) - - with pytest.raises(TypeError): - cdf.swaplevel() - - -def test_dataframe_swaplevel_axis_1(): - midx = cudf.MultiIndex( - levels=[ - ["b", "a"], - ["bb", "aa"], - ["bbb", "aaa"], - ], - codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 0, 1]], - names=[None, "a", "b"], - ) - cdf = cudf.DataFrame( - data=[[45, 30, 100, 90], [200, 100, 50, 80]], - columns=midx, - ) - pdf = cdf.to_pandas() - - assert_eq(pdf.swaplevel(1, 2, 1), cdf.swaplevel(1, 2, 1)) - assert_eq(pdf.swaplevel("a", "b", 1), cdf.swaplevel("a", "b", 1)) - assert_eq(cdf.swaplevel(2, 1, 1), cdf.swaplevel(1, 2, 1)) - assert_eq(pdf.swaplevel(0, 2, 1), cdf.swaplevel(0, 2, 1)) - assert_eq(pdf.swaplevel(2, 0, 1), cdf.swaplevel(2, 0, 1)) - assert_eq(cdf.swaplevel("a", "a", 1), cdf.swaplevel("b", "b", 1)) - - -def test_dataframe_drop_raises(): - df = cudf.DataFrame( - {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] - ) - pdf = df.to_pandas() - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=(["p"],), - rfunc_args_and_kwargs=(["p"],), - ) - - # label dtype mismatch - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([3],), - rfunc_args_and_kwargs=([3],), - ) - - expect = pdf.drop("p", errors="ignore") - actual = df.drop("p", errors="ignore") - - assert_eq(actual, expect) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"columns": "p"}), - rfunc_args_and_kwargs=([], {"columns": "p"}), - ) - - expect = pdf.drop(columns="p", errors="ignore") - actual = df.drop(columns="p", errors="ignore") - - assert_eq(actual, expect) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), - rfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), - ) - - expect = pdf.drop(labels="p", axis=1, errors="ignore") - actual = df.drop(labels="p", axis=1, errors="ignore") - - assert_eq(actual, expect) - - -def test_dataframe_column_add_drop_via_setitem(): - df = cudf.DataFrame() - data = np.asarray(range(10)) - df["a"] = data - df["b"] = 
data - assert tuple(df.columns) == ("a", "b") - del df["a"] - assert tuple(df.columns) == ("b",) - df["c"] = data - assert tuple(df.columns) == ("b", "c") - df["a"] = data - assert tuple(df.columns) == ("b", "c", "a") - - -def test_dataframe_column_set_via_attr(): - data_0 = np.asarray([0, 2, 4, 5]) - data_1 = np.asarray([1, 4, 2, 3]) - data_2 = np.asarray([2, 0, 3, 0]) - df = cudf.DataFrame({"a": data_0, "b": data_1, "c": data_2}) - - for i in range(10): - df.c = df.a - assert assert_eq(df.c, df.a, check_names=False) - assert tuple(df.columns) == ("a", "b", "c") - - df.c = df.b - assert assert_eq(df.c, df.b, check_names=False) - assert tuple(df.columns) == ("a", "b", "c") - - -def test_dataframe_column_drop_via_attr(): - df = cudf.DataFrame({"a": []}) - - with pytest.raises(AttributeError): - del df.a - - assert tuple(df.columns) == tuple("a") - - -@pytest.mark.parametrize("axis", [0, "index"]) -def test_dataframe_index_rename(axis): - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf = cudf.DataFrame.from_pandas(pdf) - - expect = pdf.rename(mapper={1: 5, 2: 6}, axis=axis) - got = gdf.rename(mapper={1: 5, 2: 6}, axis=axis) - - assert_eq(expect, got) - - expect = pdf.rename(index={1: 5, 2: 6}) - got = gdf.rename(index={1: 5, 2: 6}) - - assert_eq(expect, got) - - expect = pdf.rename({1: 5, 2: 6}) - got = gdf.rename({1: 5, 2: 6}) - - assert_eq(expect, got) - - # `pandas` can support indexes with mixed values. We throw a - # `NotImplementedError`. - with pytest.raises(NotImplementedError): - gdf.rename(mapper={1: "x", 2: "y"}, axis=axis) - - -def test_dataframe_MI_rename(): - gdf = cudf.DataFrame( - {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} - ) - gdg = gdf.groupby(["a", "b"]).count() - pdg = gdg.to_pandas() - - expect = pdg.rename(mapper={1: 5, 2: 6}, axis=0) - got = gdg.rename(mapper={1: 5, 2: 6}, axis=0) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("axis", [1, "columns"]) -def test_dataframe_column_rename(axis): - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf = cudf.DataFrame.from_pandas(pdf) - - expect = pdf.rename(mapper=lambda name: 2 * name, axis=axis) - got = gdf.rename(mapper=lambda name: 2 * name, axis=axis) - - assert_eq(expect, got) - - expect = pdf.rename(columns=lambda name: 2 * name) - got = gdf.rename(columns=lambda name: 2 * name) - - assert_eq(expect, got) - - rename_mapper = {"a": "z", "b": "y", "c": "x"} - expect = pdf.rename(columns=rename_mapper) - got = gdf.rename(columns=rename_mapper) - - assert_eq(expect, got) - - -def test_dataframe_pop(): - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [7.0, 8.0, 9.0]} - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - # Test non-existing column error - with pytest.raises(KeyError) as raises: - gdf.pop("fake_colname") - raises.match("fake_colname") - - # check pop numeric column - pdf_pop = pdf.pop("a") - gdf_pop = gdf.pop("a") - assert_eq(pdf_pop, gdf_pop) - assert_eq(pdf, gdf) - - # check string column - pdf_pop = pdf.pop("b") - gdf_pop = gdf.pop("b") - assert_eq(pdf_pop, gdf_pop) - assert_eq(pdf, gdf) - - # check float column and empty dataframe - pdf_pop = pdf.pop("c") - gdf_pop = gdf.pop("c") - assert_eq(pdf_pop, gdf_pop) - assert_eq(pdf, gdf) - - # check empty dataframe edge case - empty_pdf = pd.DataFrame(columns=["a", "b"]) - empty_gdf = cudf.DataFrame(columns=["a", "b"]) - pb = empty_pdf.pop("b") - gb = empty_gdf.pop("b") - assert len(pb) == len(gb) - assert empty_pdf.empty and empty_gdf.empty - - 
-@pytest.mark.parametrize("nelem", [0, 10]) -def test_dataframe_astype(nelem): - df = cudf.DataFrame() - data = np.asarray(range(nelem), dtype=np.int32) - df["a"] = data - assert df["a"].dtype is np.dtype(np.int32) - df["b"] = df["a"].astype(np.float32) - assert df["b"].dtype is np.dtype(np.float32) - np.testing.assert_equal(df["a"].to_numpy(), df["b"].to_numpy()) - - -def test_astype_dict(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["1", "2", "3"]}) - pdf = gdf.to_pandas() - - assert_eq(pdf.astype({"a": "str"}), gdf.astype({"a": "str"})) - assert_eq( - pdf.astype({"a": "str", "b": np.int64}), - gdf.astype({"a": "str", "b": np.int64}), - ) - - -@pytest.mark.parametrize("nelem", [0, 100]) -def test_index_astype(nelem): - df = cudf.DataFrame() - data = np.asarray(range(nelem), dtype=np.int32) - df["a"] = data - assert df.index.dtype is np.dtype(np.int64) - df.index = df.index.astype(np.float32) - assert df.index.dtype is np.dtype(np.float32) - df["a"] = df["a"].astype(np.float32) - np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) - df["b"] = df["a"] - df = df.set_index("b") - df["a"] = df["a"].astype(np.int16) - df.index = df.index.astype(np.int16) - np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) +@pytest.mark.parametrize("nelem", [0, 100]) +def test_index_astype(nelem): + df = cudf.DataFrame() + data = np.asarray(range(nelem), dtype=np.int32) + df["a"] = data + assert df.index.dtype is np.dtype(np.int64) + df.index = df.index.astype(np.float32) + assert df.index.dtype is np.dtype(np.float32) + df["a"] = df["a"].astype(np.float32) + np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) + df["b"] = df["a"] + df = df.set_index("b") + df["a"] = df["a"].astype(np.int16) + df.index = df.index.astype(np.int16) + np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) def test_dataframe_to_string_with_skipped_rows(): @@ -1213,163 +497,44 @@ def test_dataframe_dir_and_getattr(): df.not_a_column -def test_empty_dataframe_to_cupy(): - df = cudf.DataFrame() - - # Check fully empty dataframe. - mat = df.to_cupy() - assert mat.shape == (0, 0) - mat = df.to_numpy() - assert mat.shape == (0, 0) - - df = cudf.DataFrame() - nelem = 123 - rng = np.random.default_rng(seed=0) - for k in "abc": - df[k] = rng.random(nelem) +def test_dataframe_append_empty(): + pdf = pd.DataFrame( + { + "key": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], + "value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + } + ) + gdf = cudf.DataFrame.from_pandas(pdf) - # Check all columns in empty dataframe. 
- mat = df.head(0).to_cupy() - assert mat.shape == (0, 3) + gdf["newcol"] = 100 + pdf["newcol"] = 100 + assert len(gdf["newcol"]) == len(pdf) + assert len(pdf["newcol"]) == len(pdf) + assert_eq(gdf, pdf) -def test_dataframe_to_cupy(): - df = cudf.DataFrame() - nelem = 123 +def test_dataframe_setitem_from_masked_object(): rng = np.random.default_rng(seed=0) - for k in "abcd": - df[k] = rng.random(nelem) + ary = rng.standard_normal(100) + mask = np.zeros(100, dtype=bool) + mask[:20] = True + rng.shuffle(mask) + ary[mask] = np.nan - # Check all columns - mat = df.to_cupy() - assert mat.shape == (nelem, 4) - assert mat.strides == (8, 984) + test1_null = cudf.Series(ary, nan_as_null=True) + assert test1_null.null_count == 20 + test1_nan = cudf.Series(ary, nan_as_null=False) + assert test1_nan.null_count == 0 - mat = df.to_numpy() - assert mat.shape == (nelem, 4) - assert mat.strides == (8, 984) - for i, k in enumerate(df.columns): - np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) - - # Check column subset - mat = df[["a", "c"]].to_cupy().get() - assert mat.shape == (nelem, 2) - - for i, k in enumerate("ac"): - np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) - - -@pytest.mark.parametrize("has_nulls", [False, True]) -@pytest.mark.parametrize("use_na_value", [False, True]) -def test_dataframe_to_cupy_single_column(has_nulls, use_na_value): - nelem = 10 - data = np.arange(nelem, dtype=np.float64) - - if has_nulls: - data = data.astype("object") - data[::2] = None - - df = cudf.DataFrame({"a": data}) - - if has_nulls and not use_na_value: - with pytest.raises(ValueError, match="Column must have no nulls"): - df.to_cupy() - return - - na_value = 0.0 if use_na_value else None - expected = ( - cupy.asarray(df["a"].fillna(na_value)) - if has_nulls - else cupy.asarray(df["a"]) - ) - result = df.to_cupy(na_value=na_value) - assert result.shape == (nelem, 1) - assert_eq(result.ravel(), expected) - - -def test_dataframe_to_cupy_null_values(): - df = cudf.DataFrame() - - nelem = 123 - na = -10000 - - refvalues = {} - rng = np.random.default_rng(seed=0) - for k in "abcd": - df[k] = data = rng.random(nelem) - bitmask = utils.random_bitmask(nelem) - df[k] = df[k]._column.set_mask(bitmask) - boolmask = np.asarray( - utils.expand_bits_to_bytes(bitmask)[:nelem], dtype=np.bool_ - ) - data[~boolmask] = na - refvalues[k] = data - - # Check null value causes error - with pytest.raises(ValueError): - df.to_cupy() - with pytest.raises(ValueError): - df.to_numpy() - - for k in df.columns: - df[k] = df[k].fillna(na) - - mat = df.to_numpy() - for i, k in enumerate(df.columns): - np.testing.assert_array_equal(refvalues[k], mat[:, i]) - - -@pytest.mark.parametrize("method", ["to_cupy", "to_numpy"]) -@pytest.mark.parametrize("value", [1, True, 1.5]) -@pytest.mark.parametrize("constructor", ["DataFrame", "Series"]) -def test_to_array_categorical(method, value, constructor): - data = [value] - expected = getattr(pd, constructor)(data, dtype="category").to_numpy() - result = getattr( - getattr(cudf, constructor)(data, dtype="category"), method - )() - assert_eq(result, expected) - - -def test_dataframe_append_empty(): - pdf = pd.DataFrame( - { - "key": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], - "value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - } - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - gdf["newcol"] = 100 - pdf["newcol"] = 100 - - assert len(gdf["newcol"]) == len(pdf) - assert len(pdf["newcol"]) == len(pdf) - assert_eq(gdf, pdf) - - -def test_dataframe_setitem_from_masked_object(): - rng = 
np.random.default_rng(seed=0) - ary = rng.standard_normal(100) - mask = np.zeros(100, dtype=bool) - mask[:20] = True - rng.shuffle(mask) - ary[mask] = np.nan - - test1_null = cudf.Series(ary, nan_as_null=True) - assert test1_null.null_count == 20 - test1_nan = cudf.Series(ary, nan_as_null=False) - assert test1_nan.null_count == 0 - - test2_null = cudf.DataFrame.from_pandas( - pd.DataFrame({"a": ary}), nan_as_null=True - ) - assert test2_null["a"].null_count == 20 - test2_nan = cudf.DataFrame.from_pandas( - pd.DataFrame({"a": ary}), nan_as_null=False - ) - assert test2_nan["a"].null_count == 0 + test2_null = cudf.DataFrame.from_pandas( + pd.DataFrame({"a": ary}), nan_as_null=True + ) + assert test2_null["a"].null_count == 20 + test2_nan = cudf.DataFrame.from_pandas( + pd.DataFrame({"a": ary}), nan_as_null=False + ) + assert test2_nan["a"].null_count == 0 gpu_ary = cupy.asarray(ary) test3_null = cudf.Series(gpu_ary, nan_as_null=True) @@ -1411,284 +576,6 @@ def test_empty_dataframe_setitem_df(): assert_eq(gdf1, gdf2) -def test_assign(): - gdf = cudf.DataFrame({"x": [1, 2, 3]}) - gdf2 = gdf.assign(y=gdf.x + 1) - assert list(gdf.columns) == ["x"] - assert list(gdf2.columns) == ["x", "y"] - - np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4]) - - -@pytest.mark.parametrize( - "mapping", - [ - {"y": 1, "z": lambda df: df["x"] + df["y"]}, - { - "x": lambda df: df["x"] * 2, - "y": lambda df: 2, - "z": lambda df: df["x"] / df["y"], - }, - ], -) -def test_assign_callable(mapping): - df = pd.DataFrame({"x": [1, 2, 3]}) - cdf = cudf.from_pandas(df) - expect = df.assign(**mapping) - actual = cdf.assign(**mapping) - assert_eq(expect, actual) - - -@pytest.mark.parametrize( - "method", - [ - "murmur3", - "md5", - "sha1", - "sha224", - "sha256", - "sha384", - "sha512", - "xxhash32", - "xxhash64", - ], -) -@pytest.mark.parametrize("seed", [None, 42]) -def test_dataframe_hash_values(method, seed): - nrows = 10 - warning_expected = seed is not None and method not in { - "murmur3", - "xxhash32", - "xxhash64", - } - potential_warning = ( - pytest.warns(UserWarning, match="Provided seed value has no effect*") - if warning_expected - else contextlib.nullcontext() - ) - - gdf = cudf.DataFrame() - data = np.arange(nrows) - data[0] = data[-1] # make first and last the same - gdf["a"] = data - gdf["b"] = gdf.a + 100 - with potential_warning: - out = gdf.hash_values(method=method, seed=seed) - assert isinstance(out, cudf.Series) - assert len(out) == nrows - expected_dtypes = { - "murmur3": np.uint32, - "md5": object, - "sha1": object, - "sha224": object, - "sha256": object, - "sha384": object, - "sha512": object, - "xxhash32": np.uint32, - "xxhash64": np.uint64, - } - assert out.dtype == expected_dtypes[method] - - # Check single column - with potential_warning: - out_one = gdf[["a"]].hash_values(method=method, seed=seed) - # First matches last - assert out_one.iloc[0] == out_one.iloc[-1] - # Equivalent to the cudf.Series.hash_values() - with potential_warning: - assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) - - -@pytest.mark.parametrize("method", ["murmur3", "xxhash32", "xxhash64"]) -def test_dataframe_hash_values_seed(method): - gdf = cudf.DataFrame() - data = np.arange(10) - data[0] = data[-1] # make first and last the same - gdf["a"] = data - gdf["b"] = gdf.a + 100 - out_one = gdf.hash_values(method=method, seed=0) - out_two = gdf.hash_values(method=method, seed=1) - assert out_one.iloc[0] == out_one.iloc[-1] - assert out_two.iloc[0] == out_two.iloc[-1] - assert_neq(out_one, out_two) - - -def 
test_dataframe_hash_values_xxhash32(): - # xxhash32 has no built-in implementation in Python and we don't want to - # add a testing dependency, so we use regression tests against known good - # values. - gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) - gdf["b"] = -gdf["a"] - out_a = gdf["a"].hash_values(method="xxhash32", seed=0) - expected_a = cudf.Series( - [3736311059, 2307980487, 2906647130, 746578903, 4294967295], - dtype=np.uint32, - ) - assert_eq(out_a, expected_a) - - out_b = gdf["b"].hash_values(method="xxhash32", seed=42) - expected_b = cudf.Series( - [1076387279, 2261349915, 531498073, 650869264, 4294967295], - dtype=np.uint32, - ) - assert_eq(out_b, expected_b) - - out_df = gdf.hash_values(method="xxhash32", seed=0) - expected_df = cudf.Series( - [1223721700, 2885793241, 1920811472, 1146715602, 4294967295], - dtype=np.uint32, - ) - assert_eq(out_df, expected_df) - - -def test_dataframe_hash_values_xxhash64(): - # xxhash64 has no built-in implementation in Python and we don't want to - # add a testing dependency, so we use regression tests against known good - # values. - gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) - gdf["b"] = -gdf["a"] - out_a = gdf["a"].hash_values(method="xxhash64", seed=0) - expected_a = cudf.Series( - [ - 3803688792395291579, - 10706502109028787093, - 9835943264235290955, - 18031741628920313605, - 18446744073709551615, - ], - dtype=np.uint64, - ) - assert_eq(out_a, expected_a) - - out_b = gdf["b"].hash_values(method="xxhash64", seed=42) - expected_b = cudf.Series( - [ - 9826995235083043316, - 10150515573749944095, - 5005707091092326006, - 5326262080505358431, - 18446744073709551615, - ], - dtype=np.uint64, - ) - assert_eq(out_b, expected_b) - - out_df = gdf.hash_values(method="xxhash64", seed=0) - expected_df = cudf.Series( - [ - 10208049663714815266, - 4949201786888768834, - 18122173653994477335, - 11133539368563441730, - 18446744073709551615, - ], - dtype=np.uint64, - ) - assert_eq(out_df, expected_df) - - -@pytest.mark.parametrize("nparts", [1, 2]) -def test_dataframe_hash_partition(nparts): - nrows = 10 - nkeys = 2 - rng = np.random.default_rng(seed=0) - gdf = cudf.DataFrame( - {f"key{i}": rng.integers(0, 7 - i, nrows) for i in range(nkeys)} - ) - keycols = gdf.columns.to_list() - gdf["val1"] = rng.integers(0, nrows * 2, nrows) - - got = gdf.partition_by_hash(keycols, nparts=nparts) - # Must return a list - assert isinstance(got, list) - # Must have correct number of partitions - assert len(got) == nparts - # All partitions must be DataFrame type - assert all(isinstance(p, cudf.DataFrame) for p in got) - # Check that all partitions have unique keys - part_unique_keys = set() - for p in got: - if len(p): - # Take rows of the keycolumns and build a set of the key-values - unique_keys = set(map(tuple, p[keycols].values_host)) - # Ensure that none of the key-values have occurred in other groups - assert not (unique_keys & part_unique_keys) - part_unique_keys |= unique_keys - assert len(part_unique_keys) - - -def test_dataframe_hash_partition_masked_value(): - nrows = 10 - gdf = cudf.DataFrame() - gdf["key"] = np.arange(nrows) - gdf["val"] = np.arange(nrows) + 100 - bitmask = utils.random_bitmask(nrows) - bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["val"] = gdf["val"]._column.set_mask(bitmask) - parted = gdf.partition_by_hash(["key"], nparts=3) - # Verify that the valid mask is correct - for p in parted: - df = p.to_pandas() - for row in df.itertuples(): - valid = bool(bytemask[row.key]) - expected_value = row.key + 
100 if valid else np.nan - got_value = row.val - assert (expected_value == got_value) or ( - np.isnan(expected_value) and np.isnan(got_value) - ) - - -def test_dataframe_hash_partition_masked_keys(): - nrows = 5 - gdf = cudf.DataFrame() - gdf["key"] = np.arange(nrows) - gdf["val"] = np.arange(nrows) + 100 - bitmask = utils.random_bitmask(nrows) - bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["key"] = gdf["key"]._column.set_mask(bitmask) - parted = gdf.partition_by_hash(["key"], nparts=3, keep_index=False) - # Verify that the valid mask is correct - for p in parted: - df = p.to_pandas() - for row in df.itertuples(): - valid = bool(bytemask[row.val - 100]) - # val is key + 100 - expected_value = row.val - 100 if valid else np.nan - got_value = row.key - assert (expected_value == got_value) or ( - np.isnan(expected_value) and np.isnan(got_value) - ) - - -@pytest.mark.parametrize("keep_index", [True, False]) -def test_dataframe_hash_partition_keep_index(keep_index): - gdf = cudf.DataFrame( - {"val": [1, 2, 3, 4, 5], "key": [3, 2, 1, 4, 5]}, index=[5, 4, 3, 2, 1] - ) - - expected_df1 = cudf.DataFrame( - {"val": [1, 5], "key": [3, 5]}, index=[5, 1] if keep_index else None - ) - expected_df2 = cudf.DataFrame( - {"val": [2, 3, 4], "key": [2, 1, 4]}, - index=[4, 3, 2] if keep_index else None, - ) - expected = [expected_df1, expected_df2] - - parts = gdf.partition_by_hash(["key"], nparts=2, keep_index=keep_index) - - for exp, got in zip(expected, parts, strict=True): - assert_eq(exp, got) - - -def test_dataframe_hash_partition_empty(): - gdf = cudf.DataFrame({"val": [1, 2], "key": [3, 2]}, index=["a", "b"]) - parts = gdf.iloc[:0].partition_by_hash(["key"], nparts=3) - assert len(parts) == 3 - for part in parts: - assert_eq(gdf.iloc[:0], part) - - @pytest.mark.parametrize("dtype1", utils.supported_numpy_dtypes) @pytest.mark.parametrize("dtype2", utils.supported_numpy_dtypes) def test_dataframe_concat_different_numerical_columns(dtype1, dtype2): @@ -1918,241 +805,6 @@ def test_from_pandas(dtype): assert_eq(s, gs, check_dtype="pyarrow" not in dtype) -@pytest.mark.parametrize("dtypes", [int, float]) -def test_from_records(dtypes): - h_ary = np.ndarray(shape=(10, 4), dtype=dtypes) - rec_ary = h_ary.view(np.recarray) - - gdf = cudf.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) - df = pd.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) - assert isinstance(gdf, cudf.DataFrame) - assert_eq(df, gdf) - - gdf = cudf.DataFrame.from_records(rec_ary) - df = pd.DataFrame.from_records(rec_ary) - assert isinstance(gdf, cudf.DataFrame) - assert_eq(df, gdf) - - -@pytest.mark.parametrize("columns", [None, ["first", "second", "third"]]) -@pytest.mark.parametrize( - "index", - [ - None, - ["first", "second"], - "name", - "age", - "weight", - [10, 11], - ["abc", "xyz"], - ], -) -def test_from_records_index(columns, index): - rec_ary = np.array( - [("Rex", 9, 81.0), ("Fido", 3, 27.0)], - dtype=[("name", "U10"), ("age", "i4"), ("weight", "f4")], - ) - gdf = cudf.DataFrame.from_records(rec_ary, columns=columns, index=index) - df = pd.DataFrame.from_records(rec_ary, columns=columns, index=index) - assert isinstance(gdf, cudf.DataFrame) - assert_eq(df, gdf) - - -def test_dataframe_construction_from_cupy_arrays(): - h_ary = np.array([[1, 2, 3], [4, 5, 6]], np.int32) - d_ary = cupy.asarray(h_ary) - - gdf = cudf.DataFrame(d_ary, columns=["a", "b", "c"]) - df = pd.DataFrame(h_ary, columns=["a", "b", "c"]) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - gdf = 
cudf.DataFrame(d_ary) - df = pd.DataFrame(h_ary) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - gdf = cudf.DataFrame(d_ary, index=["a", "b"]) - df = pd.DataFrame(h_ary, index=["a", "b"]) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - gdf = cudf.DataFrame(d_ary) - gdf = gdf.set_index(keys=0, drop=False) - df = pd.DataFrame(h_ary) - df = df.set_index(keys=0, drop=False) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - gdf = cudf.DataFrame(d_ary) - gdf = gdf.set_index(keys=1, drop=False) - df = pd.DataFrame(h_ary) - df = df.set_index(keys=1, drop=False) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - -def test_dataframe_cupy_wrong_dimensions(): - d_ary = cupy.empty((2, 3, 4), dtype=np.int32) - with pytest.raises( - ValueError, match="records dimension expected 1 or 2 but found: 3" - ): - cudf.DataFrame(d_ary) - - -def test_dataframe_cupy_array_wrong_index(): - d_ary = cupy.empty((2, 3), dtype=np.int32) - - with pytest.raises(ValueError): - cudf.DataFrame(d_ary, index=["a"]) - - with pytest.raises(TypeError): - cudf.DataFrame(d_ary, index="a") - - -def test_index_in_dataframe_constructor(): - a = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) - b = cudf.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) - - assert_eq(a, b) - assert_eq(a.loc[4:], b.loc[4:]) - - -dtypes = NUMERIC_TYPES + DATETIME_TYPES + ["bool"] - - -@pytest.mark.parametrize("nelem", [0, 2]) -@pytest.mark.parametrize("data_type", dtypes) -def test_from_arrow(nelem, data_type): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "a": rng.integers(0, 1000, nelem).astype(data_type), - "b": rng.integers(0, 1000, nelem).astype(data_type), - } - ) - padf = pa.Table.from_pandas( - df, preserve_index=False - ).replace_schema_metadata(None) - gdf = cudf.DataFrame.from_arrow(padf) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - s = pa.Array.from_pandas(df.a) - gs = cudf.Series.from_arrow(s) - assert isinstance(gs, cudf.Series) - - # For some reason PyArrow to_pandas() converts to numpy array and has - # better type compatibility - np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy()) - - -def test_from_arrow_chunked_categories(): - # Verify that categories are properly deduplicated across chunked arrays. 
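-    # Each DictionaryArray chunk carries its own dictionary, so concatenating
-    # the chunks must union those dictionaries and remap the indices. Only the
-    # category *set*, not its order, is guaranteed to survive that union,
-    # hence the sorted() comparison below.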
- indices = pa.array([0, 1, 0, 1, 2, 0, None, 2]) - dictionary = pa.array(["foo", "bar", "baz"]) - dict_array = pa.DictionaryArray.from_arrays(indices, dictionary) - chunked_array = pa.chunked_array([dict_array, dict_array]) - table = pa.table({"a": chunked_array}) - df = cudf.DataFrame.from_arrow(table) - final_dictionary = df["a"].dtype.categories.to_arrow().to_pylist() - assert sorted(final_dictionary) == sorted(dictionary.to_pylist()) - - -@pytest.mark.parametrize("nelem", [0, 2]) -@pytest.mark.parametrize("data_type", dtypes) -def test_to_arrow(nelem, data_type): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "a": rng.integers(0, 1000, nelem).astype(data_type), - "b": rng.integers(0, 1000, nelem).astype(data_type), - } - ) - gdf = cudf.DataFrame.from_pandas(df) - - pa_df = pa.Table.from_pandas( - df, preserve_index=False - ).replace_schema_metadata(None) - - pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) - - assert isinstance(pa_gdf, pa.Table) - assert pa.Table.equals(pa_df, pa_gdf) - - pa_s = pa.Array.from_pandas(df.a) - pa_gs = gdf["a"].to_arrow() - - assert isinstance(pa_gs, pa.Array) - assert pa.Array.equals(pa_s, pa_gs) - - pa_i = pa.Array.from_pandas(df.index) - pa_gi = gdf.index.to_arrow() - - assert isinstance(pa_gi, pa.Array) - assert pa.Array.equals(pa_i, pa_gi) - - -@pytest.mark.parametrize("data_type", dtypes) -def test_to_from_arrow_nulls(data_type): - if data_type == "longlong": - data_type = "int64" - if data_type == "bool": - s1 = pa.array([True, None, False, None, True], type=data_type) - else: - dtype = np.dtype(data_type) - if dtype.type == np.datetime64: - time_unit, _ = np.datetime_data(dtype) - data_type = pa.timestamp(unit=time_unit) - s1 = pa.array([1, None, 3, None, 5], type=data_type) - gs1 = cudf.Series.from_arrow(s1) - assert isinstance(gs1, cudf.Series) - # We have 64B padded buffers for nulls whereas Arrow returns a minimal - # number of bytes, so only check the first byte in this case - np.testing.assert_array_equal( - np.asarray(s1.buffers()[0]).view("u1")[0], - gs1._column.mask_array_view(mode="read").copy_to_host().view("u1")[0], - ) - assert pa.Array.equals(s1, gs1.to_arrow()) - - s2 = pa.array([None, None, None, None, None], type=data_type) - gs2 = cudf.Series.from_arrow(s2) - assert isinstance(gs2, cudf.Series) - # We have 64B padded buffers for nulls whereas Arrow returns a minimal - # number of bytes, so only check the first byte in this case - np.testing.assert_array_equal( - np.asarray(s2.buffers()[0]).view("u1")[0], - gs2._column.mask_array_view(mode="read").copy_to_host().view("u1")[0], - ) - assert pa.Array.equals(s2, gs2.to_arrow()) - - -def test_to_arrow_categorical(): - df = pd.DataFrame() - df["a"] = pd.Series(["a", "b", "c"], dtype="category") - gdf = cudf.DataFrame.from_pandas(df) - - pa_df = pa.Table.from_pandas( - df, preserve_index=False - ).replace_schema_metadata(None) - pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) - - assert isinstance(pa_gdf, pa.Table) - assert pa.Table.equals(pa_df, pa_gdf) - - pa_s = pa.Array.from_pandas(df.a) - pa_gs = gdf["a"].to_arrow() - - assert isinstance(pa_gs, pa.Array) - assert pa.Array.equals(pa_s, pa_gs) - - def test_from_arrow_missing_categorical(): pd_cat = pd.Categorical(["a", "b", "c"], categories=["a", "b"]) pa_cat = pa.array(pd_cat, from_pandas=True) @@ -2174,26 +826,6 @@ def test_to_arrow_missing_categorical(): assert pa.Array.equals(pa_cat, gd_cat.to_arrow()) -@pytest.mark.parametrize("data_type", dtypes) -def 
test_from_scalar_typing(data_type): - rng = np.random.default_rng(seed=0) - if data_type == "datetime64[ms]": - scalar = ( - np.dtype("int64").type(rng.integers(0, 5)).astype("datetime64[ms]") - ) - elif data_type.startswith("datetime64"): - scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]") - data_type = "datetime64[ms]" - else: - scalar = np.dtype(data_type).type(rng.integers(0, 5)) - - gdf = cudf.DataFrame() - gdf["a"] = [1, 2, 3, 4, 5] - gdf["b"] = scalar - assert gdf["b"].dtype == np.dtype(data_type) - assert len(gdf["b"]) == len(gdf["a"]) - - @pytest.mark.parametrize("data_type", NUMERIC_TYPES) def test_from_python_array(data_type): rng = np.random.default_rng(seed=0) @@ -2234,80 +866,6 @@ def test_dataframe_shape_empty(): assert pdf.shape == gdf.shape -@pytest.mark.parametrize("num_cols", [1, 3]) -@pytest.mark.parametrize("num_rows", [1, 4]) -@pytest.mark.parametrize("dtype", [*dtypes, "object"]) -@pytest.mark.parametrize("nulls", ["none", "some", "all"]) -def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): - # In case of `bool` dtype: pandas <= 1.2.5 type-casts - # a boolean series to `float64` series if a `np.nan` is assigned to it: - # >>> s = pd.Series([True, False, True]) - # >>> s - # 0 True - # 1 False - # 2 True - # dtype: bool - # >>> s[[2]] = np.nan - # >>> s - # 0 1.0 - # 1 0.0 - # 2 NaN - # dtype: float64 - # In pandas >= 1.3.2 this behavior is fixed: - # >>> s = pd.Series([True, False, True]) - # >>> s - # 0 - # True - # 1 - # False - # 2 - # True - # dtype: bool - # >>> s[[2]] = np.nan - # >>> s - # 0 - # True - # 1 - # False - # 2 - # NaN - # dtype: object - # In cudf we change `object` dtype to `str` type - for which there - # is no transpose implemented yet. Hence we need to test transpose - # against pandas nullable types as they are the ones that closely - # resemble `cudf` dtypes behavior. 
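-    # For contrast, pandas nullable dtypes keep the dtype stable under
-    # missing values (a minimal illustration, assuming pandas >= 1.0
-    # nullable semantics):
-    # >>> s = pd.Series([True, False, True], dtype="boolean")
-    # >>> s[[2]] = pd.NA
-    # >>> s
-    # 0     True
-    # 1    False
-    # 2     <NA>
-    # dtype: boolean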
- pdf = pd.DataFrame() - rng = np.random.default_rng(seed=0) - null_rep = np.nan if dtype in ["float32", "float64"] else None - np_dtype = dtype - dtype = np.dtype(dtype) - dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.get(dtype, dtype) - for i in range(num_cols): - colname = string.ascii_lowercase[i] - data = pd.Series( - rng.integers(0, 26, num_rows).astype(np_dtype), - dtype=dtype, - ) - if nulls == "some": - idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) - if len(idx): - data[idx] = null_rep - elif nulls == "all": - data[:] = null_rep - pdf[colname] = data - - gdf = cudf.DataFrame.from_pandas(pdf) - - got_function = gdf.transpose() - got_property = gdf.T - - expect = pdf.transpose() - nullable = dtype not in DATETIME_TYPES - - assert_eq(expect, got_function.to_pandas(nullable=nullable)) - assert_eq(expect, got_property.to_pandas(nullable=nullable)) - - @pytest.mark.parametrize("num_cols", [1, 3]) @pytest.mark.parametrize("num_rows", [1, 5]) def test_dataframe_transpose_category(num_cols, num_rows): @@ -2701,1488 +1259,176 @@ def test_unaryops_df(pdf, unaryop, col_name, assign_col_name): if assign_col_name: pd_df.columns.name = col_name gdf = cudf.from_pandas(pd_df) - d = unaryop(pd_df - 5) - g = unaryop(gdf - 5) - assert_eq(d, g) - - -def test_df_abs(pdf): - rng = np.random.default_rng(seed=0) - disturbance = pd.Series(rng.random(10)) - pdf = pdf - 5 + disturbance - d = pdf.apply(np.abs) - g = cudf.from_pandas(pdf).abs() - assert_eq(d, g) - - -def test_scale_df(gdf): - got = (gdf - 5).scale() - expect = cudf.DataFrame( - {"x": np.linspace(0.0, 1.0, 10), "y": np.linspace(0.0, 1.0, 10)} - ) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "func", - [ - lambda df: df.empty, - lambda df: df.x.empty, - lambda df: df.x.fillna(123, limit=None, method=None, axis=None), - lambda df: df.drop("x", axis=1, errors="raise"), - ], -) -def test_unary_operators(func, pdf, gdf): - p = func(pdf) - g = func(gdf) - assert_eq(p, g) - - -def test_is_monotonic(gdf): - pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[3, 1, 2]) - gdf = cudf.DataFrame.from_pandas(pdf) - assert not gdf.index.is_monotonic_increasing - assert not gdf.index.is_monotonic_decreasing - - -def test_iter(pdf, gdf): - assert list(pdf) == list(gdf) - - -def test_iteritems(gdf): - for k, v in gdf.items(): - assert k in gdf.columns - assert isinstance(v, cudf.Series) - assert_eq(v, gdf[k]) - - -@pytest.mark.parametrize("q", [0.5, 1, 0.001, [0.5], [], [0.005, 0.5, 1]]) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_quantile(q, numeric_only): - ts = pd.date_range("2018-08-24", periods=5, freq="D") - td = pd.to_timedelta(np.arange(5), unit="h") - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame( - {"date": ts, "delta": td, "val": rng.standard_normal(len(ts))} - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - assert_eq(pdf["date"].quantile(q), gdf["date"].quantile(q)) - assert_eq(pdf["delta"].quantile(q), gdf["delta"].quantile(q)) - assert_eq(pdf["val"].quantile(q), gdf["val"].quantile(q)) - - q = q if isinstance(q, list) else [q] - assert_eq( - pdf.quantile(q, numeric_only=numeric_only), - gdf.quantile(q, numeric_only=numeric_only), - ) - - -@pytest.mark.parametrize("q", [0.2, 1, 0.001, [0.5], [], [0.005, 0.8, 0.03]]) -@pytest.mark.parametrize("interpolation", ["higher", "lower", "nearest"]) -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_decimal_quantile(q, interpolation, decimal_type): - rng = 
np.random.default_rng(seed=0) - data = ["244.8", "32.24", "2.22", "98.14", "453.23", "5.45"] - gdf = cudf.DataFrame( - {"id": rng.integers(0, 10, size=len(data)), "val": data} - ) - gdf["id"] = gdf["id"].astype("float64") - gdf["val"] = gdf["val"].astype(decimal_type(7, 2)) - pdf = gdf.to_pandas() - - got = gdf.quantile(q, numeric_only=False, interpolation=interpolation) - expected = pdf.quantile( - q if isinstance(q, list) else [q], - numeric_only=False, - interpolation=interpolation, - ) - - assert_eq(got, expected) - - -def test_empty_quantile(): - pdf = pd.DataFrame({"x": []}, dtype="float64") - df = cudf.DataFrame({"x": []}, dtype="float64") - - actual = df.quantile() - expected = pdf.quantile() - - assert_eq(actual, expected) - - -def test_from_pandas_function(pdf): - gdf = cudf.from_pandas(pdf) - assert isinstance(gdf, cudf.DataFrame) - assert_eq(pdf, gdf) - - gdf = cudf.from_pandas(pdf.x) - assert isinstance(gdf, cudf.Series) - assert_eq(pdf.x, gdf) - - with pytest.raises(TypeError): - cudf.from_pandas(123) - - -@pytest.mark.parametrize("preserve_index", [True, False]) -def test_arrow_pandas_compat(pdf, gdf, preserve_index): - pdf["z"] = range(10) - pdf = pdf.set_index("z") - gdf["z"] = range(10) - gdf = gdf.set_index("z") - - pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index) - gdf_arrow_table = gdf.to_arrow(preserve_index=preserve_index) - - assert pa.Table.equals(pdf_arrow_table, gdf_arrow_table) - - gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) - pdf2 = pdf_arrow_table.to_pandas() - - assert_eq(pdf2, gdf2) - pdf.columns.name = "abc" - pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index) - - gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) - pdf2 = pdf_arrow_table.to_pandas() - assert_eq(pdf2, gdf2) - - -@pytest.mark.parametrize( - "index", - [ - None, - cudf.RangeIndex(3, name="a"), - "a", - "b", - ["a", "b"], - cudf.RangeIndex(0, 5, 2, name="a"), - ], -) -@pytest.mark.parametrize("preserve_index", [True, False, None]) -def test_arrow_round_trip(preserve_index, index): - data = {"a": [4, 5, 6], "b": ["cat", "dog", "bird"]} - if isinstance(index, (list, str)): - gdf = cudf.DataFrame(data).set_index(index) - else: - gdf = cudf.DataFrame(data, index=index) - - table = gdf.to_arrow(preserve_index=preserve_index) - table_pd = pa.Table.from_pandas( - gdf.to_pandas(), preserve_index=preserve_index - ) - - gdf_out = cudf.DataFrame.from_arrow(table) - pdf_out = table_pd.to_pandas() - - assert_eq(gdf_out, pdf_out) - - -@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "bool"]) -def test_cuda_array_interface(dtype): - np_data = np.arange(10).astype(dtype) - cupy_data = cupy.array(np_data) - pd_data = pd.Series(np_data) - - cudf_data = cudf.Series(cupy_data) - assert_eq(pd_data, cudf_data) - - gdf = cudf.DataFrame() - gdf["test"] = cupy_data - pd_data.name = "test" - assert_eq(pd_data, gdf["test"]) - - -@pytest.mark.parametrize("nelem", [0, 10]) -@pytest.mark.parametrize("nchunks", [1, 5]) -@pytest.mark.parametrize("data_type", dtypes) -def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): - rng = np.random.default_rng(seed=0) - np_list_data = [ - rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks) - ] - pa_chunk_array = pa.chunked_array(np_list_data) - - expect = pa_chunk_array.to_pandas() - got = cudf.Series(pa_chunk_array) - - assert_eq(expect, got) - - np_list_data2 = [ - rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks) - ] - pa_chunk_array2 = pa.chunked_array(np_list_data2) - pa_table = 
pa.Table.from_arrays( - [pa_chunk_array, pa_chunk_array2], names=["a", "b"] - ) - - expect = pa_table.to_pandas() - got = cudf.DataFrame.from_arrow(pa_table) - - assert_eq(expect, got) - - -@pytest.mark.skip(reason="Test was designed to be run in isolation") -def test_gpu_memory_usage_with_boolmask(): - ctx = cuda.current_context() - - def query_GPU_memory(note=""): - memInfo = ctx.get_memory_info() - usedMemoryGB = (memInfo.total - memInfo.free) / 1e9 - return usedMemoryGB - - cuda.current_context().deallocations.clear() - nRows = int(1e8) - nCols = 2 - rng = np.random.default_rng(seed=0) - dataNumpy = np.asfortranarray(rng.random(nRows, nCols)) - colNames = ["col" + str(iCol) for iCol in range(nCols)] - pandasDF = pd.DataFrame(data=dataNumpy, columns=colNames, dtype=np.float32) - cudaDF = cudf.core.DataFrame.from_pandas(pandasDF) - rng = np.random.default_rng(seed=0) - boolmask = cudf.Series(rng.integers(1, 2, len(cudaDF)).astype("bool")) - - memory_used = query_GPU_memory() - cudaDF = cudaDF[boolmask] - - assert ( - cudaDF.index._column.data_array_view(mode="read").device_ctypes_pointer - == cudaDF["col0"].index._column.data_array_view.device_ctypes_pointer - ) - assert ( - cudaDF.index._column.data_array_view(mode="read").device_ctypes_pointer - == cudaDF["col1"].index._column.data_array_view.device_ctypes_pointer - ) - - assert memory_used == query_GPU_memory() - - -def test_boolmask(pdf, gdf): - rng = np.random.default_rng(seed=0) - boolmask = rng.integers(0, 2, len(pdf)) > 0 - gdf = gdf[boolmask] - pdf = pdf[boolmask] - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "mask_shape", - [ - (2, "ab"), - (2, "abc"), - (3, "ab"), - (3, "abc"), - (3, "abcd"), - (4, "abc"), - (4, "abcd"), - ], -) -def test_dataframe_boolmask(mask_shape): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame({col: rng.integers(0, 10, 3) for col in "abc"}) - pdf_mask = pd.DataFrame( - {col: rng.integers(0, 2, mask_shape[0]) > 0 for col in mask_shape[1]} - ) - gdf = cudf.DataFrame.from_pandas(pdf) - gdf_mask = cudf.DataFrame.from_pandas(pdf_mask) - gdf = gdf[gdf_mask] - pdf = pdf[pdf_mask] - - assert np.array_equal(gdf.columns, pdf.columns) - for col in gdf.columns: - assert np.array_equal( - gdf[col].fillna(-1).to_pandas().values, pdf[col].fillna(-1).values - ) - - -@pytest.mark.parametrize( - "box", - [ - list, - pytest.param( - cudf.Series, - marks=pytest_xfail( - reason="Pandas can't index a multiindex with a Series" - ), - ), - ], -) -def test_dataframe_multiindex_boolmask(box): - mask = box([True, False, True]) - gdf = cudf.DataFrame( - {"w": [3, 2, 1], "x": [1, 2, 3], "y": [0, 1, 0], "z": [1, 1, 1]} - ) - gdg = gdf.groupby(["w", "x"]).count() - pdg = gdg.to_pandas() - assert_eq(gdg[mask], pdg[mask]) - - -def test_dataframe_assignment(): - pdf = pd.DataFrame() - for col in "abc": - pdf[col] = np.array([0, 1, 1, -2, 10]) - gdf = cudf.DataFrame.from_pandas(pdf) - gdf[gdf < 0] = 999 - pdf[pdf < 0] = 999 - assert_eq(gdf, pdf) - - -def test_1row_arrow_table(): - data = [pa.array([0]), pa.array([1])] - batch = pa.RecordBatch.from_arrays(data, ["f0", "f1"]) - table = pa.Table.from_batches([batch]) - - expect = table.to_pandas() - got = cudf.DataFrame.from_arrow(table) - assert_eq(expect, got) - - -def test_arrow_handle_no_index_name(pdf, gdf): - gdf_arrow = gdf.to_arrow() - pdf_arrow = pa.Table.from_pandas(pdf) - assert pa.Table.equals(pdf_arrow, gdf_arrow) - - got = cudf.DataFrame.from_arrow(gdf_arrow) - expect = pdf_arrow.to_pandas() - assert_eq(expect, got) - - -def test_pandas_non_contiguious(): - 
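-    # A pandas DataFrame built from a single 2D ndarray stores one 2D block,
-    # so each column's .values is a strided, non C-contiguous view; the
-    # assertions below check that from_pandas still copies such columns to
-    # the GPU intact.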
rng = np.random.default_rng(seed=0) - arr1 = rng.random(size=(5000, 10)) - assert arr1.flags["C_CONTIGUOUS"] is True - df = pd.DataFrame(arr1) - for col in df.columns: - assert df[col].values.flags["C_CONTIGUOUS"] is False - - gdf = cudf.DataFrame.from_pandas(df) - assert_eq(gdf.to_pandas(), df) - - -@pytest.mark.parametrize("num_elements", [0, 10]) -@pytest.mark.parametrize("null_type", [np.nan, None, "mixed"]) -def test_series_all_null(num_elements, null_type): - if null_type == "mixed": - data = [] - data1 = [np.nan] * int(num_elements / 2) - data2 = [None] * int(num_elements / 2) - for idx in range(len(data1)): - data.append(data1[idx]) - data.append(data2[idx]) - else: - data = [null_type] * num_elements - - # Typecast Pandas because None will return `object` dtype - expect = pd.Series(data, dtype="float64") - got = cudf.Series(data, dtype="float64") - - assert_eq(expect, got) - - -@pytest.mark.parametrize("num_elements", [0, 10]) -def test_series_all_valid_nan(num_elements): - data = [np.nan] * num_elements - sr = cudf.Series(data, nan_as_null=False) - np.testing.assert_equal(sr.null_count, 0) - - -def test_series_rename(): - pds = pd.Series([1, 2, 3], name="asdf") - gds = cudf.Series([1, 2, 3], name="asdf") - - expect = pds.rename("new_name") - got = gds.rename("new_name") - - assert_eq(expect, got) - - pds = pd.Series(expect) - gds = cudf.Series(got) - - assert_eq(pds, gds) - - pds = pd.Series(expect, name="name name") - gds = cudf.Series(got, name="name name") - - assert_eq(pds, gds) - - -@pytest.mark.parametrize("data_type", dtypes) -@pytest.mark.parametrize("nelem", [0, 100]) -def test_head_tail(nelem, data_type): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame( - { - "a": rng.integers(0, 1000, nelem).astype(data_type), - "b": rng.integers(0, 1000, nelem).astype(data_type), - } - ) - gdf = cudf.from_pandas(pdf) - - assert_eq(gdf.head(), pdf.head()) - assert_eq(gdf.head(3), pdf.head(3)) - assert_eq(gdf.head(-2), pdf.head(-2)) - assert_eq(gdf.head(0), pdf.head(0)) - - assert_eq(gdf["a"].head(), pdf["a"].head()) - assert_eq(gdf["a"].head(3), pdf["a"].head(3)) - assert_eq(gdf["a"].head(-2), pdf["a"].head(-2)) - - assert_eq(gdf.tail(), pdf.tail()) - assert_eq(gdf.tail(3), pdf.tail(3)) - assert_eq(gdf.tail(-2), pdf.tail(-2)) - assert_eq(gdf.tail(0), pdf.tail(0)) - - assert_eq(gdf["a"].tail(), pdf["a"].tail()) - assert_eq(gdf["a"].tail(3), pdf["a"].tail(3)) - assert_eq(gdf["a"].tail(-2), pdf["a"].tail(-2)) - - -def test_tail_for_string(): - gdf = cudf.DataFrame() - gdf["id"] = cudf.Series(["a", "b"], dtype=np.object_) - gdf["v"] = cudf.Series([1, 2]) - assert_eq(gdf.tail(3), gdf.to_pandas().tail(3)) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize( - "column_names", - [ - ["v0", "v1"], - ["v0", "index"], - pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]), - pd.MultiIndex.from_tuples([(1, 2), (10, 11)], names=["ABC", "DEF"]), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("col_level", [0, 1]) -@pytest.mark.parametrize("col_fill", ["", "some_lv"]) -def test_reset_index(level, drop, column_names, inplace, col_level, col_fill): - midx = pd.MultiIndex.from_tuples( - [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] - ) - pdf = pd.DataFrame( - [[1, 2], [3, 4], [5, 6], [7, 8]], index=midx, columns=column_names - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.reset_index( - level=level, - drop=drop, - 
inplace=inplace, - col_level=col_level, - col_fill=col_fill, - ) - got = gdf.reset_index( - level=level, - drop=drop, - inplace=inplace, - col_level=col_level, - col_fill=col_fill, - ) - if inplace: - expect = pdf - got = gdf - - assert_eq(expect, got) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("level", [None, 0, 1, [None]]) -@pytest.mark.parametrize("drop", [False, True]) -@pytest.mark.parametrize("inplace", [False, True]) -@pytest.mark.parametrize("col_level", [0, 1]) -@pytest.mark.parametrize("col_fill", ["", "some_lv"]) -def test_reset_index_dup_level_name(level, drop, inplace, col_level, col_fill): - # midx levels are named [None, None] - midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) - pdf = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=midx) - gdf = cudf.from_pandas(pdf) - if level == [None]: - assert_exceptions_equal( - lfunc=pdf.reset_index, - rfunc=gdf.reset_index, - lfunc_args_and_kwargs=( - [], - {"level": level, "drop": drop, "inplace": inplace}, - ), - rfunc_args_and_kwargs=( - [], - {"level": level, "drop": drop, "inplace": inplace}, - ), - ) - return - - expect = pdf.reset_index( - level=level, - drop=drop, - inplace=inplace, - col_level=col_level, - col_fill=col_fill, - ) - got = gdf.reset_index( - level=level, - drop=drop, - inplace=inplace, - col_level=col_level, - col_fill=col_fill, - ) - if inplace: - expect = pdf - got = gdf - - assert_eq(expect, got) - - -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("inplace", [False, True]) -@pytest.mark.parametrize("col_level", [0, 1]) -@pytest.mark.parametrize("col_fill", ["", "some_lv"]) -def test_reset_index_named(pdf, gdf, drop, inplace, col_level, col_fill): - pdf.index.name = "cudf" - gdf.index.name = "cudf" - - expect = pdf.reset_index( - drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill - ) - got = gdf.reset_index( - drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill - ) - if inplace: - expect = pdf - got = gdf - assert_eq(expect, got) - - -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("inplace", [False, True]) -@pytest.mark.parametrize("column_names", [["x", "y"], ["index", "y"]]) -@pytest.mark.parametrize("col_level", [0, 1]) -@pytest.mark.parametrize("col_fill", ["", "some_lv"]) -def test_reset_index_unnamed( - pdf, gdf, drop, inplace, column_names, col_level, col_fill -): - pdf.columns = column_names - gdf.columns = column_names - - expect = pdf.reset_index( - drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill - ) - got = gdf.reset_index( - drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill - ) - if inplace: - expect = pdf - got = gdf - assert_eq(expect, got) - - -def test_reset_index_invalid_level(): - with pytest.raises(IndexError): - cudf.DataFrame([1]).reset_index(level=2) - - with pytest.raises(IndexError): - pd.DataFrame([1]).reset_index(level=2) - - -@pytest.mark.parametrize( - "index", - [ - "a", - ["a", "b"], - pd.CategoricalIndex(["I", "II", "III", "IV", "V"]), - pd.Series(["h", "i", "k", "l", "m"]), - ["b", pd.Index(["I", "II", "III", "IV", "V"])], - ["c", [11, 12, 13, 14, 15]], - pd.MultiIndex( - levels=[ - ["I", "II", "III", "IV", "V"], - ["one", "two", "three", "four", "five"], - ], - codes=[[0, 1, 2, 3, 4], [4, 3, 2, 1, 0]], - names=["col1", "col2"], - ), - pd.RangeIndex(0, 5), # corner case - [pd.Series(["h", "i", "k", "l", "m"]), pd.RangeIndex(0, 5)], - [ - pd.MultiIndex( - levels=[ - ["I", "II", "III", "IV", "V"], - ["one", "two", 
"three", "four", "five"], - ], - codes=[[0, 1, 2, 3, 4], [4, 3, 2, 1, 0]], - names=["col1", "col2"], - ), - pd.RangeIndex(0, 5), - ], - ], -) -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("append", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_set_index(index, drop, append, inplace): - gdf = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": ["a", "b", "c", "d", "e"], - "c": [1.0, 2.0, 3.0, 4.0, 5.0], - } - ) - pdf = gdf.to_pandas() - - expected = pdf.set_index(index, inplace=inplace, drop=drop, append=append) - actual = gdf.set_index(index, inplace=inplace, drop=drop, append=append) - - if inplace: - expected = pdf - actual = gdf - assert_eq(expected, actual) - - -@pytest.mark.parametrize("index", ["a", pd.Index([1, 1, 2, 2, 3])]) -def test_set_index_verify_integrity(index): - gdf = cudf.DataFrame( - { - "a": [1, 1, 2, 2, 5], - "b": ["a", "b", "c", "d", "e"], - "c": [1.0, 2.0, 3.0, 4.0, 5.0], - } - ) - with pytest.raises(ValueError): - gdf.set_index(index, verify_integrity=True) - - -@pytest.mark.parametrize("drop", [True, False]) -def test_set_index_multi(drop): - nelem = 10 - rng = np.random.default_rng(seed=0) - a = np.arange(nelem) - rng.shuffle(a) - df = pd.DataFrame( - { - "a": a, - "b": rng.integers(0, 4, size=nelem), - "c": rng.uniform(low=0, high=4, size=nelem), - "d": rng.choice(["green", "black", "white"], nelem), - } - ) - df["e"] = df["d"].astype("category") - gdf = cudf.DataFrame.from_pandas(df) - - assert_eq(gdf.set_index("a", drop=drop), gdf.set_index(["a"], drop=drop)) - assert_eq( - df.set_index(["b", "c"], drop=drop), - gdf.set_index(["b", "c"], drop=drop), - ) - assert_eq( - df.set_index(["d", "b"], drop=drop), - gdf.set_index(["d", "b"], drop=drop), - ) - assert_eq( - df.set_index(["b", "d", "e"], drop=drop), - gdf.set_index(["b", "d", "e"], drop=drop), - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("copy", [True, False]) -@pytest.mark.parametrize( - "args,gd_kwargs", - [ - ([], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {"axis": 0}), - ([["a", "b", "c", "d", "e"]], {"axis": 1}), - ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": 0}), - ([], {"labels": ["a", "b", "c", "d", "e"], "axis": 1}), - ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": "index"}), - ([], {"labels": ["a", "b", "c", "d", "e"], "axis": "columns"}), - ([], {"index": [-3, 0, 3, 0, -2, 1, 3, 4, 6]}), - ([], {"columns": ["a", "b", "c", "d", "e"]}), - ( - [], - { - "index": [-3, 0, 3, 0, -2, 1, 3, 4, 6], - "columns": ["a", "b", "c", "d", "e"], - }, - ), - ], -) -def test_dataframe_reindex(copy, args, gd_kwargs): - reindex_data = cudf.datasets.randomdata( - nrows=6, - dtypes={ - "a": "category", - "c": float, - "d": str, - }, - ) - pdf, gdf = reindex_data.to_pandas(), reindex_data - - gd_kwargs["copy"] = copy - pd_kwargs = gd_kwargs.copy() - pd_kwargs["copy"] = True - assert_eq(pdf.reindex(*args, **pd_kwargs), gdf.reindex(*args, **gd_kwargs)) - - -@pytest.mark.parametrize("fill_value", [-1.0, 0.0, 1.5]) -@pytest.mark.parametrize( - "args,kwargs", - [ - ([], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {"axis": 0}), - ([["a", "b", "c", "d", "e"]], {"axis": 1}), - ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": 0}), - ([], {"labels": ["a", "b", "c", "d", "e"], "axis": 1}), - ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": "index"}), - ([], {"labels": ["a", 
"b", "c", "d", "e"], "axis": "columns"}), - ([], {"index": [-3, 0, 3, 0, -2, 1, 3, 4, 6]}), - ([], {"columns": ["a", "b", "c", "d", "e"]}), - ( - [], - { - "index": [-3, 0, 3, 0, -2, 1, 3, 4, 6], - "columns": ["a", "b", "c", "d", "e"], - }, - ), - ], -) -def test_dataframe_reindex_fill_value(args, kwargs, fill_value): - reindex_data_numeric = cudf.datasets.randomdata( - nrows=6, - dtypes={"a": float, "b": float, "c": float}, - ) - pdf, gdf = reindex_data_numeric.to_pandas(), reindex_data_numeric - kwargs["fill_value"] = fill_value - assert_eq(pdf.reindex(*args, **kwargs), gdf.reindex(*args, **kwargs)) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_dataframe_reindex_change_dtype(copy): - index = pd.date_range("12/29/2009", periods=10, freq="D") - columns = ["a", "b", "c", "d", "e"] - gdf = cudf.datasets.randomdata( - nrows=6, dtypes={"a": "category", "c": float, "d": str} - ) - pdf = gdf.to_pandas() - # Validate reindexes both labels and column names when - # index=index_labels and columns=column_labels - assert_eq( - pdf.reindex(index=index, columns=columns, copy=True), - gdf.reindex(index=index, columns=columns, copy=copy), - check_freq=False, - ) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_series_categorical_reindex(copy): - index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = cudf.datasets.randomdata(nrows=6, dtypes={"a": "category"}) - pdf = gdf.to_pandas() - assert_eq(pdf["a"].reindex(copy=True), gdf["a"].reindex(copy=copy)) - assert_eq( - pdf["a"].reindex(index, copy=True), gdf["a"].reindex(index, copy=copy) - ) - assert_eq( - pdf["a"].reindex(index=index, copy=True), - gdf["a"].reindex(index=index, copy=copy), - ) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_series_float_reindex(copy): - index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = cudf.datasets.randomdata(nrows=6, dtypes={"c": float}) - pdf = gdf.to_pandas() - assert_eq(pdf["c"].reindex(copy=True), gdf["c"].reindex(copy=copy)) - assert_eq( - pdf["c"].reindex(index, copy=True), gdf["c"].reindex(index, copy=copy) - ) - assert_eq( - pdf["c"].reindex(index=index, copy=True), - gdf["c"].reindex(index=index, copy=copy), - ) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_series_string_reindex(copy): - index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = cudf.datasets.randomdata(nrows=6, dtypes={"d": str}) - pdf = gdf.to_pandas() - assert_eq(pdf["d"].reindex(copy=True), gdf["d"].reindex(copy=copy)) - assert_eq( - pdf["d"].reindex(index, copy=True), gdf["d"].reindex(index, copy=copy) - ) - assert_eq( - pdf["d"].reindex(index=index, copy=True), - gdf["d"].reindex(index=index, copy=copy), - ) - - -@pytest.mark.parametrize("names", [None, ["a", "b"]]) -@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) -def test_reindex_multiindex_col_to_multiindex(names, klass): - idx = pd.Index( - [("A", "one"), ("A", "two")], - dtype="object", - ) - df = pd.DataFrame([[1, 2]], columns=idx) - gdf = cudf.from_pandas(df) - midx = klass.from_tuples([("A", "one"), ("A", "three")], names=names) - result = gdf.reindex(columns=midx) - expected = cudf.DataFrame([[1, None]], columns=midx) - # (pandas2.0): check_dtype=False won't be needed - # as None col will return object instead of float - assert_eq(result, expected, check_dtype=False) - - -@pytest.mark.parametrize("names", [None, ["a", "b"]]) -@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) -def test_reindex_tuple_col_to_multiindex(names, klass): - idx = pd.Index( - [("A", "one"), ("A", "two")], dtype="object", 
tupleize_cols=False - ) - df = pd.DataFrame([[1, 2]], columns=idx) - gdf = cudf.from_pandas(df) - midx = klass.from_tuples([("A", "one"), ("A", "two")], names=names) - result = gdf.reindex(columns=midx) - expected = cudf.DataFrame([[1, 2]], columns=midx) - assert_eq(result, expected) - - -@pytest.mark.parametrize("name", [None, "foo"]) -@pytest.mark.parametrize("klass", [range, cudf.RangeIndex, pd.RangeIndex]) -def test_reindex_columns_rangeindex_keeps_rangeindex(name, klass): - new_columns = klass(3) - exp_name = None - if klass is not range: - new_columns.name = name - exp_name = name - df = cudf.DataFrame([[1, 2]]) - result = df.reindex(columns=new_columns).columns - expected = pd.RangeIndex(3, name=exp_name) - assert_eq(result, expected) - - -def test_to_frame(pdf, gdf): - assert_eq(pdf.x.to_frame(), gdf.x.to_frame()) - - name = "foo" - gdf_new_name = gdf.x.to_frame(name=name) - pdf_new_name = pdf.x.to_frame(name=name) - assert_eq(pdf.x.to_frame(), gdf.x.to_frame()) - - name = False - gdf_new_name = gdf.x.to_frame(name=name) - pdf_new_name = pdf.x.to_frame(name=name) - assert_eq(gdf_new_name, pdf_new_name) - assert gdf_new_name.columns[0] == name - - -def test_dataframe_empty_sort_index(): - pdf = pd.DataFrame({"x": []}) - gdf = cudf.DataFrame.from_pandas(pdf) - - expect = pdf.sort_index() - got = gdf.sort_index() - - assert_eq(expect, got, check_index_type=True) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "index", - [ - pd.RangeIndex(0, 3, 1), - [3.0, 1.0, np.nan], - # Test for single column MultiIndex - pd.MultiIndex.from_arrays( - [ - [2, 0, 1], - ] - ), - pd.RangeIndex(2, -1, -1), - ], -) -@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_sort_index( - request, index, axis, ascending, inplace, ignore_index, na_position -): - if not PANDAS_GE_220 and axis in (1, "columns") and ignore_index: - pytest.skip(reason="Bug fixed in pandas-2.2") - - pdf = pd.DataFrame( - {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, - index=index, - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - expected = pdf.sort_index( - axis=axis, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - got = gdf.sort_index( - axis=axis, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - - if inplace is True: - assert_eq(pdf, gdf, check_index_type=True) - else: - assert_eq(expected, got, check_index_type=True) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) -@pytest.mark.parametrize( - "level", - [ - 0, - "b", - 1, - ["b"], - "a", - ["a", "b"], - ["b", "a"], - [0, 1], - [1, 0], - [0, 2], - None, - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_dataframe_mulitindex_sort_index( - request, axis, level, ascending, inplace, ignore_index, na_position -): - request.applymarker( - pytest.mark.xfail( - condition=axis in (1, "columns") - and level is None - and not ascending - and ignore_index, - 
reason="https://github.com/pandas-dev/pandas/issues/57293", - ) - ) - pdf = pd.DataFrame( - { - "b": [1.0, 3.0, np.nan], - "a": [1, 4, 3], - 1: ["a", "b", "c"], - "e": [3, 1, 4], - "d": [1, 2, 8], - } - ).set_index(["b", "a", 1]) - gdf = cudf.DataFrame.from_pandas(pdf) - - expected = pdf.sort_index( - axis=axis, - level=level, - ascending=ascending, - inplace=inplace, - na_position=na_position, - ignore_index=ignore_index, - ) - got = gdf.sort_index( - axis=axis, - level=level, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - - if inplace is True: - assert_eq(pdf, gdf) - else: - assert_eq(expected, got) - - -def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names(): - gdf = cudf.DataFrame([[1, 2, 3]], columns=["b", "a", "c"]) - result = gdf.sort_index(axis=1, ignore_index=True) - assert result._data.names == tuple(result._data.keys()) - - -@pytest.mark.parametrize("dtype", [*dtypes, "category"]) -def test_dataframe_0_row_dtype(dtype): - if dtype == "category": - data = pd.Series(["a", "b", "c", "d", "e"], dtype="category") - else: - data = np.array([1, 2, 3, 4, 5], dtype=dtype) - - expect = cudf.DataFrame() - expect["x"] = data - expect["y"] = data - got = expect.head(0) - - for col_name in got.columns: - assert expect[col_name].dtype == got[col_name].dtype - - expect = cudf.Series(data) - got = expect.head(0) - - assert expect.dtype == got.dtype - - -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_series_list_nanasnull(nan_as_null): - data = [1.0, 2.0, 3.0, np.nan, None] - - expect = pa.array(data, from_pandas=nan_as_null) - got = cudf.Series(data, nan_as_null=nan_as_null).to_arrow() - - # Bug in Arrow 0.14.1 where NaNs aren't handled - expect = expect.cast("int64", safe=False) - got = got.cast("int64", safe=False) - - assert pa.Array.equals(expect, got) - - -def test_column_assignment(): - gdf = cudf.datasets.randomdata( - nrows=20, dtypes={"a": "category", "b": int, "c": float} - ) - new_cols = ["q", "r", "s"] - gdf.columns = new_cols - assert list(gdf.columns) == new_cols - - -def test_select_dtype(): - gdf = cudf.datasets.randomdata( - nrows=20, dtypes={"a": "category", "b": int, "c": float, "d": str} - ) - pdf = gdf.to_pandas() - - assert_eq(pdf.select_dtypes("float64"), gdf.select_dtypes("float64")) - assert_eq(pdf.select_dtypes(np.float64), gdf.select_dtypes(np.float64)) - assert_eq( - pdf.select_dtypes(include=["float64"]), - gdf.select_dtypes(include=["float64"]), - ) - assert_eq( - pdf.select_dtypes(include=["object", "int", "category"]), - gdf.select_dtypes(include=["object", "int", "category"]), - ) - - assert_eq( - pdf.select_dtypes(include=["int64", "float64"]), - gdf.select_dtypes(include=["int64", "float64"]), - ) - assert_eq( - pdf.select_dtypes(include=np.number), - gdf.select_dtypes(include=np.number), - ) - assert_eq( - pdf.select_dtypes(include=[np.int64, np.float64]), - gdf.select_dtypes(include=[np.int64, np.float64]), - ) - - assert_eq( - pdf.select_dtypes(include=["category"]), - gdf.select_dtypes(include=["category"]), - ) - assert_eq( - pdf.select_dtypes(exclude=np.number), - gdf.select_dtypes(exclude=np.number), - ) - - assert_exceptions_equal( - lfunc=pdf.select_dtypes, - rfunc=gdf.select_dtypes, - lfunc_args_and_kwargs=([], {"includes": ["Foo"]}), - rfunc_args_and_kwargs=([], {"includes": ["Foo"]}), - ) - - assert_exceptions_equal( - lfunc=pdf.select_dtypes, - rfunc=gdf.select_dtypes, - lfunc_args_and_kwargs=( - [], - {"exclude": np.number, "include": np.number}, - ), - 
rfunc_args_and_kwargs=( - [], - {"exclude": np.number, "include": np.number}, - ), - ) - - gdf = cudf.DataFrame( - {"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]} - ) - pdf = gdf.to_pandas() - assert_eq( - pdf.select_dtypes(include=["object", "int", "category"]), - gdf.select_dtypes(include=["object", "int", "category"]), - ) - assert_eq( - pdf.select_dtypes(include=["object"], exclude=["category"]), - gdf.select_dtypes(include=["object"], exclude=["category"]), - ) - - gdf = cudf.DataFrame({"a": range(10), "b": range(10, 20)}) - pdf = gdf.to_pandas() - assert_eq( - pdf.select_dtypes(include=["category"]), - gdf.select_dtypes(include=["category"]), - ) - assert_eq( - pdf.select_dtypes(include=["float"]), - gdf.select_dtypes(include=["float"]), - ) - assert_eq( - pdf.select_dtypes(include=["object"]), - gdf.select_dtypes(include=["object"]), - ) - assert_eq( - pdf.select_dtypes(include=["int"]), gdf.select_dtypes(include=["int"]) - ) - assert_eq( - pdf.select_dtypes(exclude=["float"]), - gdf.select_dtypes(exclude=["float"]), - ) - assert_eq( - pdf.select_dtypes(exclude=["object"]), - gdf.select_dtypes(exclude=["object"]), - ) - assert_eq( - pdf.select_dtypes(include=["int"], exclude=["object"]), - gdf.select_dtypes(include=["int"], exclude=["object"]), - ) - - assert_exceptions_equal( - lfunc=pdf.select_dtypes, - rfunc=gdf.select_dtypes, - ) - - gdf = cudf.DataFrame( - {"a": cudf.Series([], dtype="int"), "b": cudf.Series([], dtype="str")} - ) - pdf = gdf.to_pandas() - assert_eq( - pdf.select_dtypes(exclude=["object"]), - gdf.select_dtypes(exclude=["object"]), - ) - assert_eq( - pdf.select_dtypes(include=["int"], exclude=["object"]), - gdf.select_dtypes(include=["int"], exclude=["object"]), - ) - - gdf = cudf.DataFrame( - {"int_col": [0, 1, 2], "list_col": [[1, 2], [3, 4], [5, 6]]} - ) - pdf = gdf.to_pandas() - assert_eq( - pdf.select_dtypes("int64"), - gdf.select_dtypes("int64"), - ) - - -def test_select_dtype_datetime(): - gdf = cudf.datasets.timeseries( - start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={"x": int} - ) - gdf = gdf.reset_index() - pdf = gdf.to_pandas() - - assert_eq(pdf.select_dtypes("datetime64"), gdf.select_dtypes("datetime64")) - assert_eq( - pdf.select_dtypes(np.dtype("datetime64")), - gdf.select_dtypes(np.dtype("datetime64")), - ) - assert_eq( - pdf.select_dtypes(include="datetime64"), - gdf.select_dtypes(include="datetime64"), - ) - - -def test_select_dtype_datetime_with_frequency(): - gdf = cudf.datasets.timeseries( - start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={"x": int} - ) - gdf = gdf.reset_index() - pdf = gdf.to_pandas() - - assert_exceptions_equal( - pdf.select_dtypes, - gdf.select_dtypes, - (["datetime64[ms]"],), - (["datetime64[ms]"],), - ) - - -def test_dataframe_describe_exclude(): - rng = np.random.default_rng(seed=12) - data_length = 10 - - df = cudf.DataFrame() - df["x"] = rng.normal(10, 1, data_length) - df["x"] = df.x.astype("int64") - df["y"] = rng.normal(10, 1, data_length) - pdf = df.to_pandas() - - gdf_results = df.describe(exclude=["float"]) - pdf_results = pdf.describe(exclude=["float"]) - - assert_eq(gdf_results, pdf_results) - - -def test_dataframe_describe_include(): - rng = np.random.default_rng(seed=12) - data_length = 10 - - df = cudf.DataFrame() - df["x"] = rng.normal(10, 1, data_length) - df["x"] = df.x.astype("int64") - df["y"] = rng.normal(10, 1, data_length) - pdf = df.to_pandas() - gdf_results = df.describe(include=["int"]) - pdf_results = pdf.describe(include=["int"]) - - assert_eq(gdf_results, 
pdf_results) - - -def test_dataframe_describe_default(): - rng = np.random.default_rng(seed=12) - data_length = 10 - - df = cudf.DataFrame() - df["x"] = rng.normal(10, 1, data_length) - df["y"] = rng.normal(10, 1, data_length) - pdf = df.to_pandas() - gdf_results = df.describe() - pdf_results = pdf.describe() - - assert_eq(pdf_results, gdf_results) - + d = unaryop(pd_df - 5) + g = unaryop(gdf - 5) + assert_eq(d, g) -def test_series_describe_include_all(): - rng = np.random.default_rng(seed=12) - data_length = 10 - df = cudf.DataFrame() - df["x"] = rng.normal(10, 1, data_length) - df["x"] = df.x.astype("int64") - df["y"] = rng.normal(10, 1, data_length) - df["animal"] = rng.choice(["dog", "cat", "bird"], data_length) +def test_df_abs(pdf): + rng = np.random.default_rng(seed=0) + disturbance = pd.Series(rng.random(10)) + pdf = pdf - 5 + disturbance + d = pdf.apply(np.abs) + g = cudf.from_pandas(pdf).abs() + assert_eq(d, g) - pdf = df.to_pandas() - gdf_results = df.describe(include="all") - pdf_results = pdf.describe(include="all") - assert_eq(gdf_results[["x", "y"]], pdf_results[["x", "y"]]) - assert_eq(gdf_results.index, pdf_results.index) - assert_eq(gdf_results.columns, pdf_results.columns) - assert_eq( - gdf_results[["animal"]].fillna(-1).astype("str"), - pdf_results[["animal"]].fillna(-1).astype("str"), +def test_scale_df(gdf): + got = (gdf - 5).scale() + expect = cudf.DataFrame( + {"x": np.linspace(0.0, 1.0, 10), "y": np.linspace(0.0, 1.0, 10)} ) + assert_eq(expect, got) -def test_dataframe_describe_percentiles(): - rng = np.random.default_rng(seed=12) - data_length = 100 - sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99] +@pytest.mark.parametrize( + "func", + [ + lambda df: df.empty, + lambda df: df.x.empty, + lambda df: df.x.fillna(123, limit=None, method=None, axis=None), + lambda df: df.drop("x", axis=1, errors="raise"), + ], +) +def test_unary_operators(func, pdf, gdf): + p = func(pdf) + g = func(gdf) + assert_eq(p, g) - df = cudf.DataFrame() - df["x"] = rng.normal(10, 1, data_length) - df["y"] = rng.normal(10, 1, data_length) - pdf = df.to_pandas() - gdf_results = df.describe(percentiles=sample_percentiles) - pdf_results = pdf.describe(percentiles=sample_percentiles) - assert_eq(pdf_results, gdf_results) +def test_is_monotonic(gdf): + pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[3, 1, 2]) + gdf = cudf.DataFrame.from_pandas(pdf) + assert not gdf.index.is_monotonic_increasing + assert not gdf.index.is_monotonic_decreasing -def test_get_numeric_data(): +@pytest.mark.parametrize("q", [0.5, 1, 0.001, [0.5], [], [0.005, 0.5, 1]]) +@pytest.mark.parametrize("numeric_only", [True, False]) +def test_quantile(q, numeric_only): + ts = pd.date_range("2018-08-24", periods=5, freq="D") + td = pd.to_timedelta(np.arange(5), unit="h") + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - {"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]} + {"date": ts, "delta": td, "val": rng.standard_normal(len(ts))} ) - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf._get_numeric_data(), gdf._get_numeric_data()) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15]) -@pytest.mark.parametrize("data_empty", [False, True]) -def test_shift(dtype, period, data_empty): - # TODO : this function currently tests for series.shift() - # but should instead test for dataframe.shift() - if data_empty: - data = None - else: - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, 10, low=-2, high=2) - else: - data = gen_rand(dtype, 10) 
- - gs = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) - ps = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) + gdf = cudf.DataFrame.from_pandas(pdf) - shifted_outcome = gs.a.shift(period) - expected_outcome = ps.a.shift(period) + assert_eq(pdf["date"].quantile(q), gdf["date"].quantile(q)) + assert_eq(pdf["delta"].quantile(q), gdf["delta"].quantile(q)) + assert_eq(pdf["val"].quantile(q), gdf["val"].quantile(q)) - # pandas uses NaNs to signal missing value and force converts the - # results columns to float types - if data_empty: - assert_eq( - shifted_outcome, - expected_outcome, - check_index_type=False, - check_dtype=False, - ) - else: - assert_eq(shifted_outcome, expected_outcome, check_dtype=False) + q = q if isinstance(q, list) else [q] + assert_eq( + pdf.quantile(q, numeric_only=numeric_only), + gdf.quantile(q, numeric_only=numeric_only), + ) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15]) -@pytest.mark.parametrize("data_empty", [False, True]) -def test_diff(dtype, period, data_empty): - if data_empty: - data = None - else: - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, 100000, low=-2, high=2) - else: - data = gen_rand(dtype, 100000) +@pytest.mark.parametrize("q", [0.2, 1, 0.001, [0.5], [], [0.005, 0.8, 0.03]]) +@pytest.mark.parametrize("interpolation", ["higher", "lower", "nearest"]) +@pytest.mark.parametrize( + "decimal_type", + [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], +) +def test_decimal_quantile(q, interpolation, decimal_type): + rng = np.random.default_rng(seed=0) + data = ["244.8", "32.24", "2.22", "98.14", "453.23", "5.45"] + gdf = cudf.DataFrame( + {"id": rng.integers(0, 10, size=len(data)), "val": data} + ) + gdf["id"] = gdf["id"].astype("float64") + gdf["val"] = gdf["val"].astype(decimal_type(7, 2)) + pdf = gdf.to_pandas() - gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) - pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) + got = gdf.quantile(q, numeric_only=False, interpolation=interpolation) + expected = pdf.quantile( + q if isinstance(q, list) else [q], + numeric_only=False, + interpolation=interpolation, + ) - expected_outcome = pdf.a.diff(period) - diffed_outcome = gdf.a.diff(period).astype(expected_outcome.dtype) + assert_eq(got, expected) - if data_empty: - assert_eq(diffed_outcome, expected_outcome, check_index_type=False) - else: - assert_eq(diffed_outcome, expected_outcome) +def test_empty_quantile(): + pdf = pd.DataFrame({"x": []}, dtype="float64") + df = cudf.DataFrame({"x": []}, dtype="float64") -@pytest.mark.parametrize("nan_as_null", [True, False, None]) -@pytest.mark.parametrize("api_call", ["isnull", "isna", "notna", "notnull"]) -def test_dataframe_isnull_isna_and_reverse(na_data, nan_as_null, api_call): - def detect_nan(x): - # Check if the input is a float and if it is nan - return x.apply(lambda v: isinstance(v, float) and np.isnan(v)) + actual = df.quantile() + expected = pdf.quantile() - df = na_data - nan_contains = df.select_dtypes(object).apply(detect_nan) - if nan_as_null is False and ( - nan_contains.any().any() and not nan_contains.all().all() - ): - with pytest.raises(MixedTypeError): - cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) - else: - gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) + assert_eq(actual, expected) - assert_eq(getattr(df, api_call)(), getattr(gdf, api_call)()) - # Test individual columns - for col in df: - assert_eq( - getattr(df[col], api_call)(), 
getattr(gdf[col], api_call)() - ) +def test_boolmask(pdf, gdf): + rng = np.random.default_rng(seed=0) + boolmask = rng.integers(0, 2, len(pdf)) > 0 + gdf = gdf[boolmask] + pdf = pdf[boolmask] + assert_eq(pdf, gdf) -def test_ndim(): - pdf = pd.DataFrame({"x": range(5), "y": range(5, 10)}) +@pytest.mark.parametrize( + "mask_shape", + [ + (2, "ab"), + (2, "abc"), + (3, "ab"), + (3, "abc"), + (3, "abcd"), + (4, "abc"), + (4, "abcd"), + ], +) +def test_dataframe_boolmask(mask_shape): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame({col: rng.integers(0, 10, 3) for col in "abc"}) + pdf_mask = pd.DataFrame( + {col: rng.integers(0, 2, mask_shape[0]) > 0 for col in mask_shape[1]} + ) gdf = cudf.DataFrame.from_pandas(pdf) - assert pdf.ndim == gdf.ndim - assert pdf.x.ndim == gdf.x.ndim + gdf_mask = cudf.DataFrame.from_pandas(pdf_mask) + gdf = gdf[gdf_mask] + pdf = pdf[pdf_mask] - s = pd.Series(dtype="float64") - gs = cudf.Series() - assert s.ndim == gs.ndim + assert np.array_equal(gdf.columns, pdf.columns) + for col in gdf.columns: + assert np.array_equal( + gdf[col].fillna(-1).to_pandas().values, pdf[col].fillna(-1).values + ) @pytest.mark.parametrize( - "decimals", + "box", [ - -3, - 0, - 5, - pd.Series( - [1, 4, 3, -6], - index=["floats", "ints", "floats_with_nan", "floats_same"], - ), - cudf.Series( - [-4, -2, 12], index=["ints", "floats_with_nan", "floats_same"] + list, + pytest.param( + cudf.Series, + marks=pytest_xfail( + reason="Pandas can't index a multiindex with a Series" + ), ), - {"floats": -1, "ints": 15, "floats_will_nan": 2}, ], ) -def test_dataframe_round(decimals): - rng = np.random.default_rng(seed=0) +def test_dataframe_multiindex_boolmask(box): + mask = box([True, False, True]) gdf = cudf.DataFrame( - { - "floats": np.arange(0.5, 10.5, 1), - "ints": rng.normal(-100, 100, 10), - "floats_with_na": np.array( - [ - 14.123, - 2.343, - np.nan, - 0.0, - -8.302, - np.nan, - 94.313, - None, - -8.029, - np.nan, - ] - ), - "floats_same": np.repeat([-0.6459412758761901], 10), - "bools": rng.choice([True, None, False], 10), - "strings": rng.choice(["abc", "xyz", None], 10), - "struct": rng.choice([{"abc": 1}, {"xyz": 2}, None], 10), - "list": [[1], [2], None, [4], [3]] * 2, - } + {"w": [3, 2, 1], "x": [1, 2, 3], "y": [0, 1, 0], "z": [1, 1, 1]} ) - pdf = gdf.to_pandas() - - if isinstance(decimals, cudf.Series): - pdecimals = decimals.to_pandas() - else: - pdecimals = decimals - - result = gdf.round(decimals) - expected = pdf.round(pdecimals) - - assert_eq(result, expected) + gdg = gdf.groupby(["w", "x"]).count() + pdg = gdg.to_pandas() + assert_eq(gdg[mask], pdg[mask]) -def test_dataframe_round_dict_decimal_validation(): - df = cudf.DataFrame({"A": [0.12], "B": [0.13]}) - with pytest.raises(TypeError): - df.round({"A": 1, "B": 0.5}) +def test_dataframe_assignment(): + pdf = pd.DataFrame() + for col in "abc": + pdf[col] = np.array([0, 1, 1, -2, 10]) + gdf = cudf.DataFrame.from_pandas(pdf) + gdf[gdf < 0] = 999 + pdf[pdf < 0] = 999 + assert_eq(gdf, pdf) @pytest.mark.parametrize( @@ -4316,16 +1562,6 @@ def test_create_dataframe_cols_empty_data(a, b, misc_data, non_list_data): assert_eq(actual, expected) -def test_empty_dataframe_describe(): - pdf = pd.DataFrame({"a": [], "b": []}) - gdf = cudf.from_pandas(pdf) - - expected = pdf.describe() - actual = gdf.describe() - - assert_eq(expected, actual) - - def test_as_column_types(): col = as_column(cudf.Series([], dtype="float64")) assert_eq(col.dtype, np.dtype("float64")) @@ -4400,27 +1636,6 @@ def test_as_column_types(): assert_eq(pds, gds) 
-def test_one_row_head(): - gdf = cudf.DataFrame({"name": ["carl"], "score": [100]}, index=[123]) - pdf = gdf.to_pandas() - - head_gdf = gdf.head() - head_pdf = pdf.head() - - assert_eq(head_pdf, head_gdf) - - -@pytest.mark.parametrize("index", [None, [123], ["a", "b"]]) -def test_no_cols_head(index): - pdf = pd.DataFrame(index=index) - gdf = cudf.from_pandas(pdf) - - head_gdf = gdf.head() - head_pdf = pdf.head() - - assert_eq(head_pdf, head_gdf) - - @pytest.mark.parametrize("dtype", ALL_TYPES) @pytest.mark.parametrize( "np_dtype,pd_dtype", @@ -8391,129 +5606,6 @@ def test_dataframe_iterrows_itertuples(): df.iterrows() -@pytest_unmark_spilling -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - { - "a": [1, 2, 3], - "b": [10, 22, 33], - "c": [0.3234, 0.23432, 0.0], - "d": ["hello", "world", "hello"], - } - ), - pd.DataFrame( - { - "a": [1, 2, 3], - "b": ["hello", "world", "hello"], - "c": [0.3234, 0.23432, 0.0], - } - ), - pd.DataFrame( - { - "int_data": [1, 2, 3], - "str_data": ["hello", "world", "hello"], - "float_data": [0.3234, 0.23432, 0.0], - "timedelta_data": pd.Series( - [1, 2, 1], dtype="timedelta64[ns]" - ), - "datetime_data": pd.Series([1, 2, 1], dtype="datetime64[ns]"), - } - ), - pd.DataFrame( - { - "int_data": [1, 2, 3], - "str_data": ["hello", "world", "hello"], - "float_data": [0.3234, 0.23432, 0.0], - "timedelta_data": pd.Series( - [1, 2, 1], dtype="timedelta64[ns]" - ), - "datetime_data": pd.Series([1, 2, 1], dtype="datetime64[ns]"), - "category_data": pd.Series(["a", "a", "b"], dtype="category"), - } - ), - ], -) -@pytest.mark.parametrize( - "include", - [None, "all", ["object"], ["int"], ["object", "int", "category"]], -) -def test_describe_misc_include(pdf, include): - df = cudf.DataFrame.from_pandas(pdf) - - expected = pdf.describe(include=include) - actual = df.describe(include=include) - - for col in expected.columns: - if expected[col].dtype == np.dtype("object"): - expected[col] = expected[col].fillna(-1).astype("str") - actual[col] = actual[col].fillna(-1).astype("str") - - assert_eq(expected, actual) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - { - "a": [1, 2, 3], - "b": [10, 22, 33], - "c": [0.3234, 0.23432, 0.0], - "d": ["hello", "world", "hello"], - } - ), - pd.DataFrame( - { - "a": [1, 2, 3], - "b": ["hello", "world", "hello"], - "c": [0.3234, 0.23432, 0.0], - } - ), - pd.DataFrame( - { - "int_data": [1, 2, 3], - "str_data": ["hello", "world", "hello"], - "float_data": [0.3234, 0.23432, 0.0], - "timedelta_data": pd.Series( - [1, 2, 1], dtype="timedelta64[ns]" - ), - "datetime_data": pd.Series([1, 2, 1], dtype="datetime64[ns]"), - } - ), - pd.DataFrame( - { - "int_data": [1, 2, 3], - "str_data": ["hello", "world", "hello"], - "float_data": [0.3234, 0.23432, 0.0], - "timedelta_data": pd.Series( - [1, 2, 1], dtype="timedelta64[ns]" - ), - "datetime_data": pd.Series([1, 2, 1], dtype="datetime64[ns]"), - "category_data": pd.Series(["a", "a", "b"], dtype="category"), - } - ), - ], -) -@pytest.mark.parametrize( - "exclude", [None, ["object"], ["int"], ["object", "int", "category"]] -) -def test_describe_misc_exclude(pdf, exclude): - df = cudf.DataFrame.from_pandas(pdf) - - expected = pdf.describe(exclude=exclude) - actual = df.describe(exclude=exclude) - - for col in expected.columns: - if expected[col].dtype == np.dtype("object"): - expected[col] = expected[col].fillna(-1).astype("str") - actual[col] = actual[col].fillna(-1).astype("str") - - assert_eq(expected, actual) - - @pytest.mark.parametrize( "df", [ From 
ef3fe33ea0fe25d5f47b1c82f7d48abf34fb3faa Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 15 Aug 2025 17:08:38 -0500 Subject: [PATCH 140/366] Fix "--executor" pytest parameter for cudf-polars (#19703) The default value for the `--executor` pytest parameter is still `"in-memory"`. This means we are currently running the in-memory tests twice in CI. The good news is that the `--blocksize-mode small` tests have still been hitting the `"streaming"` engine, so I'm pretty sure there is no need to worry about 25.08. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19703 --- ci/run_cudf_polars_pytests.sh | 2 +- python/cudf_polars/tests/conftest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh index 4fdf7080c03..304573d6a85 100755 --- a/ci/run_cudf_polars_pytests.sh +++ b/ci/run_cudf_polars_pytests.sh @@ -12,7 +12,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ python -m pytest --cache-clear "$@" tests --executor in-memory # Test the default "streaming" executor -python -m pytest --cache-clear "$@" tests +python -m pytest --cache-clear "$@" tests --executor streaming # Test the "streaming" executor with small blocksize python -m pytest --cache-clear "$@" tests --executor streaming --blocksize-mode small diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py index 334f0f38820..3537a66d938 100644 --- a/python/cudf_polars/tests/conftest.py +++ b/python/cudf_polars/tests/conftest.py @@ -18,7 +18,7 @@ def pytest_addoption(parser): parser.addoption( "--executor", action="store", - default="in-memory", + default="streaming", choices=("in-memory", "streaming"), help="Executor to use for GPUEngine.", ) From ebfdba5ca3fa04575ce09a956693490446d1a637 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 15 Aug 2025 18:35:13 -0400 Subject: [PATCH 141/366] Move quantiles libcudf benchmark to nvbench (#19692) Moves the libcudf quantiles benchmark from googlebench to nvbench. 
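For readers unfamiliar with nvbench, a minimal sketch of the pattern the port
follows is below; the names `bench_example` and `do_work` and the axis values
are illustrative stand-ins, not part of this change. Parameter sweeps become
named axes on the registered function instead of googlebench's
`RangeMultiplier`/`Ranges`, and timing moves from a manual event timer into
`state.exec`:

```cpp
#include <nvbench/nvbench.cuh>

// Hypothetical stand-in for the work being measured; the real benchmark
// calls cudf::quantiles on a generated table.
static void do_work(int n)
{
  volatile long long acc = 0;
  for (int i = 0; i < n; ++i) { acc = acc + i; }
}

static void bench_example(nvbench::state& state)
{
  // Axis values are read back from the state by name.
  auto const num_rows = static_cast<int>(state.get_int64("num_rows"));

  // nvbench times the lambda; exec_tag::sync marks the body as blocking.
  state.exec(nvbench::exec_tag::sync,
             [&](nvbench::launch&) { do_work(num_rows); });
}

// Registration replaces the googlebench fixture/macro boilerplate; link
// against nvbench's main target to get an executable entry point.
NVBENCH_BENCH(bench_example)
  .set_name("example")
  .add_int64_power_of_two_axis("num_rows", {16, 20, 24});
```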
Authors: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19692 --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/quantiles/quantiles.cpp | 60 ++++++++++++-------------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e111b6395e9..ec1fb42cdab 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -217,7 +217,7 @@ ConfigureNVBench(STRUCT_CREATION_NVBENCH structs/create_structs.cpp) # ################################################################################################## # * quantiles benchmark # -------------------------------------------------------------------------------- -ConfigureBench(QUANTILES_BENCH quantiles/quantiles.cpp) +ConfigureNVBench(QUANTILES_NVBENCH quantiles/quantiles.cpp) # ################################################################################################## # * tdigest benchmark diff --git a/cpp/benchmarks/quantiles/quantiles.cpp b/cpp/benchmarks/quantiles/quantiles.cpp index 24f9cc9c68e..25eae2d68e4 100644 --- a/cpp/benchmarks/quantiles/quantiles.cpp +++ b/cpp/benchmarks/quantiles/quantiles.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,55 +15,51 @@ */ #include -#include -#include #include +#include #include #include #include -class Quantiles : public cudf::benchmark {}; +#include -static void BM_quantiles(benchmark::State& state, bool nulls) +static void bench_quantiles(nvbench::state& state) { - using Type = int; + cudf::size_type const num_rows{static_cast(state.get_int64("num_rows"))}; + cudf::size_type const num_cols{static_cast(state.get_int64("num_cols"))}; + cudf::size_type const num_quantiles{ + static_cast(state.get_int64("num_quantiles"))}; + bool const nulls{static_cast(state.get_int64("nulls"))}; - cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; - cudf::size_type const n_cols{(cudf::size_type)state.range(1)}; - cudf::size_type const n_quantiles{(cudf::size_type)state.range(2)}; + auto const data_type = cudf::type_to_id(); // Create columns with values in the range [0,100) - data_profile profile = data_profile_builder().cardinality(0).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + data_profile profile = + data_profile_builder().cardinality(0).distribution(data_type, distribution_id::UNIFORM, 0, 100); profile.set_null_probability(nulls ? 
std::optional{0.01} : std::nullopt);  // 1% nulls or no null mask (<0)
-  auto input_table = create_random_table(
-    cycle_dtypes({cudf::type_to_id<Type>()}, n_cols), row_count{n_rows}, profile);
+  auto input_table =
+    create_random_table(cycle_dtypes({data_type}, num_cols), row_count{num_rows}, profile);
   auto input = cudf::table_view(*input_table);
 
-  std::vector<double> q(n_quantiles);
-  thrust::tabulate(
-    thrust::seq, q.begin(), q.end(), [n_quantiles](auto i) { return i * (1.0f / n_quantiles); });
+  std::vector<double> q(num_quantiles);
+  thrust::tabulate(thrust::seq, q.begin(), q.end(), [num_quantiles](auto i) {
+    return i * (1.0f / num_quantiles);
+  });
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
 
-    auto result = cudf::quantiles(input, q);
-    // auto result = (stable) ? cudf::stable_sorted_order(input) : cudf::sorted_order(input);
-  }
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { auto result = cudf::quantiles(input, q); });
 }
 
-#define QUANTILES_BENCHMARK_DEFINE(name, nulls) \
-  BENCHMARK_DEFINE_F(Quantiles, name) \
-  (::benchmark::State & st) { BM_quantiles(st, nulls); } \
-  BENCHMARK_REGISTER_F(Quantiles, name) \
-    ->RangeMultiplier(4) \
-    ->Ranges({{1 << 16, 1 << 26}, {1, 8}, {1, 12}}) \
-    ->UseManualTime() \
-    ->Unit(benchmark::kMillisecond);
-
-QUANTILES_BENCHMARK_DEFINE(no_nulls, false)
-QUANTILES_BENCHMARK_DEFINE(nulls, true)
+NVBENCH_BENCH(bench_quantiles)
+  .set_name("quantiles")
+  .add_int64_power_of_two_axis("num_rows", {16, 18, 20, 22, 24, 26})
+  .add_int64_axis("num_cols", {1, 2, 4, 8})
+  .add_int64_axis("num_quantiles", {1, 4, 8, 12})
+  .add_int64_axis("nulls", {0, 1});

From 6dc891ab9dbb3adfb553a86b7eb91f20b4f5e50b Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 15 Aug 2025 16:38:07 -0700
Subject: [PATCH 142/366] Move pyarrow and numpy to run_constrained (#19706)

This is a follow-up to #19657 to remove hard pyarrow and numpy
requirements from the pylibcudf conda recipe.
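For context on the semantics this relies on: in a rattler-build `recipe.yaml`,
packages under `requirements.run` are always installed alongside the package,
while packages under `run_constraints` are not installed by default; the pins
only take effect if the environment pulls those packages in for some other
reason. A hedged sketch with illustrative entries (not the actual pylibcudf
recipe):

```yaml
requirements:
  run:
    - python                  # hard dependency, always installed
  run_constraints:
    - somepkg >=1.0,<2.0a0    # no-op unless somepkg is installed anyway
```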
Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/19706
---
 conda/recipes/pylibcudf/recipe.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/conda/recipes/pylibcudf/recipe.yaml b/conda/recipes/pylibcudf/recipe.yaml
index 6ba6e189d0f..548a35da119 100644
--- a/conda/recipes/pylibcudf/recipe.yaml
+++ b/conda/recipes/pylibcudf/recipe.yaml
@@ -67,14 +67,15 @@ requirements:
     - python
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.4.0dev0
-    - numpy >=1.23,<3.0a0
-    - pyarrow>=14.0.0,<20.0.0a0
     - libcudf =${{ version }}
     - ${{ pin_compatible("rmm", upper_bound="x.x") }}
     - fsspec >=0.6.0
     - cuda-python >=12.9.1,<13.0a0
     - nvtx >=0.2.1
     - packaging
+  run_constraints:
+    - numpy >=1.23,<3.0a0
+    - pyarrow>=14.0.0,<20.0.0a0
   ignore_run_exports:
     from_package:
       - cuda-cudart-dev

From cef896de223d6c9d13faca0c45fe191e63d6eb9b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 15 Aug 2025 17:52:50 -0700
Subject: [PATCH 143/366] Move test_concat/test_reductions.py to new cudf
 classic directory structure (#19626)

Towards https://github.com/rapidsai/cudf/issues/9999
Towards https://github.com/rapidsai/cudf/issues/15723

Includes some reduction-themed tests in test_string.py and test_timedelta.py

Some tests were consolidated or removed because they were similar to
existing tests

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19626
---
 python/cudf/cudf/tests/conftest.py            |   28 +
 .../cudf/tests/dataframe/test_reductions.py   |   75 +
 python/cudf/cudf/tests/reshape/test_concat.py | 2035 ++++++++++++++++
 python/cudf/cudf/tests/test_concat.py         | 2060 -----------------
 4 files changed, 2138 insertions(+), 2060 deletions(-)
 create mode 100644 python/cudf/cudf/tests/dataframe/test_reductions.py
 delete mode 100644 python/cudf/cudf/tests/test_concat.py

diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py
index 88940a3ec47..772abdfba32 100644
--- a/python/cudf/cudf/tests/conftest.py
+++ b/python/cudf/cudf/tests/conftest.py
@@ -532,6 +532,34 @@ def ascending(request):
     return request.param
 
 
+axis_0s = [0, "index"]
+axis_1s = [1, "columns"]
+
+
+@pytest.fixture(params=axis_0s)
+def axis_0(request):
+    """Param for `axis=0` argument"""
+    return request.param
+
+
+@pytest.fixture(params=axis_1s)
+def axis_1(request):
+    """Param for `axis=1` argument"""
+    return request.param
+
+
+@pytest.fixture(params=axis_0s + axis_1s)
+def axis(request):
+    """Param for `axis` argument"""
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def sort(request):
+    """Param for `sort` argument"""
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def numeric_only(request):
+    """Param for `numeric_only` argument"""
+    return request.param
+
+
diff --git a/python/cudf/cudf/tests/dataframe/test_reductions.py b/python/cudf/cudf/tests/dataframe/test_reductions.py
new file mode 100644
index 00000000000..167ba2bd427
--- /dev/null
+++ b/python/cudf/cudf/tests/dataframe/test_reductions.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+ + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_single_q(): + q = 0.5 + + pdf = pd.DataFrame({"a": [4, 24, 13, 8, 7]}) + gdf = cudf.from_pandas(pdf) + + pdf_q = pdf.quantile(q, interpolation="nearest") + gdf_q = gdf.quantile(q, interpolation="nearest", method="table") + + assert_eq(pdf_q, gdf_q, check_index_type=False) + + +def test_with_index(): + q = [0, 0.5, 1] + + pdf = pd.DataFrame({"a": [7, 4, 4, 9, 13]}, index=[0, 4, 3, 2, 7]) + gdf = cudf.from_pandas(pdf) + + pdf_q = pdf.quantile(q, interpolation="nearest") + gdf_q = gdf.quantile(q, interpolation="nearest", method="table") + + assert_eq(pdf_q, gdf_q, check_index_type=False) + + +def test_with_multiindex(): + q = [0, 0.5, 1] + + pdf = pd.DataFrame( + { + "index_1": [3, 1, 9, 7, 5], + "index_2": [2, 4, 3, 5, 1], + "a": [8, 4, 2, 3, 8], + } + ) + pdf.set_index(["index_1", "index_2"], inplace=True) + + gdf = cudf.from_pandas(pdf) + + pdf_q = pdf.quantile(q, interpolation="nearest") + gdf_q = gdf.quantile(q, interpolation="nearest", method="table") + + assert_eq(pdf_q, gdf_q, check_index_type=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [10, 11, 12]}, + {"a": [1, 0, 3], "b": [10, 11, 12]}, + {"a": [1, 2, 3], "b": [10, 11, None]}, + { + "a": [], + }, + {}, + ], +) +@pytest.mark.parametrize("op", ["all", "any"]) +def test_any_all_axis_none(data, op): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = getattr(pdf, op)(axis=None) + actual = getattr(gdf, op)(axis=None) + + assert expected == actual diff --git a/python/cudf/cudf/tests/reshape/test_concat.py b/python/cudf/cudf/tests/reshape/test_concat.py index 8da43038159..e533bfac9df 100644 --- a/python/cudf/cudf/tests/reshape/test_concat.py +++ b/python/cudf/cudf/tests/reshape/test_concat.py @@ -1,11 +1,2046 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import warnings +from contextlib import contextmanager +from decimal import Decimal + import numpy as np import pandas as pd import pytest import cudf +from cudf.core._compat import PANDAS_GE_220 +from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal, expect_warning_if + + +@pytest.fixture(params=["outer", "inner"]) +def join(request): + return request.param + + +@contextmanager +def _hide_concat_empty_dtype_warning(): + with warnings.catch_warnings(): + # Ignoring warnings in this test as warnings are + # being caught and validated in other tests. 
+ warnings.filterwarnings( + "ignore", + "The behavior of array concatenation with empty entries " + "is deprecated.", + category=FutureWarning, + ) + yield + + +def make_frames(index=None, nulls="none"): + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": list("abcde") * 2, + } + ) + df.z = df.z.astype("category") + df2 = pd.DataFrame( + { + "x": range(10, 20), + "y": list(map(float, range(10, 20))), + "z": list("edcba") * 2, + } + ) + df2.z = df2.z.astype("category") + if nulls == "all": + df.y = np.full_like(df.y, np.nan) + df2.y = np.full_like(df2.y, np.nan) + if nulls == "some": + mask = np.arange(10) + rng.shuffle(mask) + mask = mask[:5] + df.loc[mask, "y"] = np.nan + df2.loc[mask, "y"] = np.nan + gdf = cudf.DataFrame.from_pandas(df) + gdf2 = cudf.DataFrame.from_pandas(df2) + if index: + df = df.set_index(index) + df2 = df2.set_index(index) + gdf = gdf.set_index(index) + gdf2 = gdf2.set_index(index) + return df, df2, gdf, gdf2 + + +@pytest.mark.parametrize("nulls", ["none", "some", "all"]) +@pytest.mark.parametrize("index", [False, "z", "y"]) +def test_concat_dataframe(index, nulls, axis_0): + if index == "y" and nulls in ("some", "all"): + pytest.skip("nulls in columns, dont index") + df, df2, gdf, gdf2 = make_frames(index, nulls=nulls) + # Make empty frame + gdf_empty1 = gdf2[:0] + assert len(gdf_empty1) == 0 + df_empty1 = gdf_empty1.to_pandas() + + # DataFrame + with _hide_concat_empty_dtype_warning(): + res = cudf.concat( + [gdf, gdf2, gdf, gdf_empty1], axis=axis_0 + ).to_pandas() + sol = pd.concat([df, df2, df, df_empty1], axis=axis_0) + assert_eq( + res, + sol, + check_names=False, + check_categorical=False, + check_index_type=True, + ) + + # Series + for c in [i for i in ("x", "y", "z") if i != index]: + res = cudf.concat([gdf[c], gdf2[c], gdf[c]], axis=axis_0).to_pandas() + sol = pd.concat([df[c], df2[c], df[c]], axis=axis_0) + assert_eq( + res, + sol, + check_names=False, + check_categorical=False, + check_index_type=True, + ) + + # Index + res = cudf.concat([gdf.index, gdf2.index], axis=axis_0).to_pandas() + sol = df.index.append(df2.index) + assert_eq(res, sol, check_names=False, check_categorical=False) + + +@pytest.mark.parametrize( + "values", + [["foo", "bar"], [1.0, 2.0], pd.Series(["one", "two"], dtype="category")], +) +def test_concat_all_nulls(values): + pa = pd.Series(values) + pb = pd.Series([None]) + ps = pd.concat([pa, pb]) + + ga = cudf.Series(values) + gb = cudf.Series([None]) + gs = cudf.concat([ga, gb]) + + assert_eq( + ps, + gs, + check_dtype=False, + check_categorical=False, + check_index_type=True, + ) + + +def test_concat_errors(): + df, df2, gdf, gdf2 = make_frames() + + # No objs + assert_exceptions_equal( + lfunc=pd.concat, + rfunc=cudf.concat, + lfunc_args_and_kwargs=([], {"objs": []}), + rfunc_args_and_kwargs=([], {"objs": []}), + ) + + # All None + assert_exceptions_equal( + lfunc=pd.concat, + rfunc=cudf.concat, + lfunc_args_and_kwargs=([], {"objs": [None, None]}), + rfunc_args_and_kwargs=([], {"objs": [None, None]}), + ) + + # Mismatched types + assert_exceptions_equal( + lfunc=pd.concat, + rfunc=cudf.concat, + lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}), + rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}), + ) + + # Unknown type + assert_exceptions_equal( + lfunc=pd.concat, + rfunc=cudf.concat, + lfunc_args_and_kwargs=([], {"objs": ["bar", "foo"]}), + rfunc_args_and_kwargs=([], {"objs": ["bar", "foo"]}), + ) + + # Mismatched index dtypes + gdf3 = 
gdf2.copy() + del gdf3["z"] + gdf4 = gdf2.set_index("z") + + with pytest.raises(ValueError, match="All columns must be the same type"): + cudf.concat([gdf3, gdf4]) + + # Bad axis value + assert_exceptions_equal( + lfunc=pd.concat, + rfunc=cudf.concat, + lfunc_args_and_kwargs=( + [], + {"objs": [gdf.to_pandas(), gdf2.to_pandas()], "axis": "bad_value"}, + ), + rfunc_args_and_kwargs=([], {"objs": [gdf, gdf2], "axis": "bad_value"}), + ) + + +def test_concat_misordered_columns(): + df, df2, gdf, gdf2 = make_frames(False) + gdf2 = gdf2[["z", "x", "y"]] + df2 = df2[["z", "x", "y"]] + + res = cudf.concat([gdf, gdf2]).to_pandas() + sol = pd.concat([df, df2], sort=False) + + assert_eq( + res, + sol, + check_names=False, + check_categorical=False, + check_index_type=True, + ) + + +def test_concat_columns(axis_1): + rng = np.random.default_rng(seed=0) + pdf1 = pd.DataFrame(rng.integers(10, size=(5, 3)), columns=[1, 2, 3]) + pdf2 = pd.DataFrame(rng.integers(10, size=(5, 4)), columns=[4, 5, 6, 7]) + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) + + expect = pd.concat([pdf1, pdf2], axis=axis_1) + got = cudf.concat([gdf1, gdf2], axis=axis_1) + + assert_eq(expect, got, check_index_type=True) + + +def test_concat_multiindex_dataframe(axis): + gdf = cudf.DataFrame( + { + "w": np.arange(4), + "x": np.arange(4), + "y": np.arange(4), + "z": np.arange(4), + } + ) + gdg = gdf.groupby(["w", "x"]).min() + pdg = gdg.to_pandas() + pdg1 = pdg.iloc[:, :1] + pdg2 = pdg.iloc[:, 1:] + gdg1 = cudf.from_pandas(pdg1) + gdg2 = cudf.from_pandas(pdg2) + expected = pd.concat([pdg1, pdg2], axis=axis) + result = cudf.concat([gdg1, gdg2], axis=axis) + assert_eq( + expected, + result, + check_index_type=True, + ) + + +def test_concat_multiindex_series(): + gdf = cudf.DataFrame( + { + "w": np.arange(4), + "x": np.arange(4), + "y": np.arange(4), + "z": np.arange(4), + } + ) + gdg = gdf.groupby(["w", "x"]).min() + pdg = gdg.to_pandas() + pdg1 = pdg["y"] + pdg2 = pdg["z"] + gdg1 = cudf.from_pandas(pdg1) + gdg2 = cudf.from_pandas(pdg2) + assert_eq( + cudf.concat([gdg1, gdg2]), + pd.concat([pdg1, pdg2]), + check_index_type=True, + ) + assert_eq( + cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1) + ) + + +def test_concat_multiindex_dataframe_and_series(): + gdf = cudf.DataFrame( + { + "w": np.arange(4), + "x": np.arange(4), + "y": np.arange(4), + "z": np.arange(4), + } + ) + gdg = gdf.groupby(["w", "x"]).min() + pdg = gdg.to_pandas() + pdg1 = pdg[["y", "z"]] + pdg2 = pdg["z"] + pdg2.name = "a" + gdg1 = cudf.from_pandas(pdg1) + gdg2 = cudf.from_pandas(pdg2) + assert_eq( + cudf.concat([gdg1, gdg2], axis=1), + pd.concat([pdg1, pdg2], axis=1), + check_index_type=True, + ) + + +def test_concat_multiindex_series_and_dataframe(): + gdf = cudf.DataFrame( + { + "w": np.arange(4), + "x": np.arange(4), + "y": np.arange(4), + "z": np.arange(4), + } + ) + gdg = gdf.groupby(["w", "x"]).min() + pdg = gdg.to_pandas() + pdg1 = pdg["z"] + pdg2 = pdg[["y", "z"]] + pdg1.name = "a" + gdg1 = cudf.from_pandas(pdg1) + gdg2 = cudf.from_pandas(pdg2) + assert_eq( + cudf.concat([gdg1, gdg2], axis=1), + pd.concat([pdg1, pdg2], axis=1), + check_index_type=True, + ) + + +@pytest.mark.parametrize("myindex", ["a", "b"]) +def test_concat_string_index_name(myindex): + # GH-Issue #3420 + data = {"a": [123, 456], "b": ["s1", "s2"]} + df1 = cudf.DataFrame(data).set_index(myindex) + df2 = df1.copy() + df3 = cudf.concat([df1, df2]) + + assert df3.index.name == myindex + + +def test_pandas_concat_compatibility_axis1(): + d1 = 
cudf.datasets.randomdata( + 3, dtypes={"a": float, "ind": float} + ).set_index("ind") + d2 = cudf.datasets.randomdata( + 3, dtypes={"b": float, "ind": float} + ).set_index("ind") + d3 = cudf.datasets.randomdata( + 3, dtypes={"c": float, "ind": float} + ).set_index("ind") + d4 = cudf.datasets.randomdata( + 3, dtypes={"d": float, "ind": float} + ).set_index("ind") + d5 = cudf.datasets.randomdata( + 3, dtypes={"e": float, "ind": float} + ).set_index("ind") + + pd1 = d1.to_pandas() + pd2 = d2.to_pandas() + pd3 = d3.to_pandas() + pd4 = d4.to_pandas() + pd5 = d5.to_pandas() + + expect = pd.concat([pd1, pd2, pd3, pd4, pd5], axis=1) + got = cudf.concat([d1, d2, d3, d4, d5], axis=1) + + assert_eq( + got.sort_index(), + expect.sort_index(), + check_index_type=True, + ) + + +@pytest.mark.parametrize("index", [[0, 1, 2], [2, 1, 0], [5, 9, 10]]) +@pytest.mark.parametrize("names", [False, (0, 1)]) +@pytest.mark.parametrize( + "data", + [ + (["a", "b", "c"], ["a", "b", "c"]), + (["a", "b", "c"], ["XX", "YY", "ZZ"]), + ], +) +def test_pandas_concat_compatibility_axis1_overlap(index, names, data): + s1 = cudf.Series(data[0], index=[0, 1, 2]) + s2 = cudf.Series(data[1], index=index) + if names: + s1.name = names[0] + s2.name = names[1] + ps1 = s1.to_pandas() + ps2 = s2.to_pandas() + got = cudf.concat([s1, s2], axis=1) + expect = pd.concat([ps1, ps2], axis=1) + assert_eq(got, expect, check_index_type=True) + + +def test_pandas_concat_compatibility_axis1_eq_index(): + s1 = cudf.Series(["a", "b", "c"], index=[0, 1, 2]) + s2 = cudf.Series(["a", "b", "c"], index=[1, 1, 1]) + ps1 = s1.to_pandas() + ps2 = s2.to_pandas() + + assert_exceptions_equal( + lfunc=pd.concat, + rfunc=cudf.concat, + lfunc_args_and_kwargs=([], {"objs": [ps1, ps2], "axis": 1}), + rfunc_args_and_kwargs=([], {"objs": [s1, s2], "axis": 1}), + ) + + +@pytest.mark.parametrize("name", [None, "a"]) +def test_pandas_concat_compatibility_axis1_single_column(name): + # Pandas renames series name `None` to 0 + # and preserves anything else + s = cudf.Series([1, 2, 3], name=name) + got = cudf.concat([s], axis=1) + expected = pd.concat([s.to_pandas()], axis=1) + assert_eq(expected, got) + + +def test_concat_duplicate_columns(): + cdf = cudf.DataFrame( + { + "id4": 4 * list(range(6)), + "id5": 4 * list(reversed(range(6))), + "v3": 6 * list(range(4)), + } + ) + cdf_std = cdf.groupby(["id4", "id5"])[["v3"]].std() + cdf_med = cdf.groupby(["id4", "id5"])[["v3"]].quantile(q=0.5) + with pytest.raises(NotImplementedError): + cudf.concat([cdf_med, cdf_std], axis=1) + + +def test_concat_mixed_input(): + pdf1 = pd.DataFrame({"a": [10, 20, 30]}) + pdf2 = pd.DataFrame({"a": [11, 22, 33]}) + + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) + + assert_eq( + pd.concat([pdf1, None, pdf2, None]), + cudf.concat([gdf1, None, gdf2, None]), + check_index_type=True, + ) + assert_eq( + pd.concat([pdf1, None]), + cudf.concat([gdf1, None]), + check_index_type=True, + ) + assert_eq( + pd.concat([None, pdf2]), + cudf.concat([None, gdf2]), + check_index_type=True, + ) + assert_eq( + pd.concat([None, pdf2, pdf1]), + cudf.concat([None, gdf2, gdf1]), + check_index_type=True, + ) + + +@pytest.mark.parametrize( + "objs", + [ + [pd.Series([1, 2, 3]), pd.DataFrame({"a": [1, 2]})], + [pd.Series([1, 2, 3]), pd.DataFrame({"a": []})], + [pd.Series([], dtype="float64"), pd.DataFrame({"a": []})], + [pd.Series([], dtype="float64"), pd.DataFrame({"a": [1, 2]})], + pytest.param( + [ + pd.Series([1, 2, 3.0, 1.2], name="abc"), + pd.DataFrame({"a": [1, 2]}), + ], + 
marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", + ), + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] + ), + pd.DataFrame({"a": [1, 2]}), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", + ), + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] + ), + pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", + ), + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2, 8, 100], + name="New name", + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame( + {"a": [1, 2, 4, 10, 11, 12]}, + index=["a", "b", "c", "d", "e", "f"], + ), + ], + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", + ), + ), + pytest.param( + [ + pd.Series( + [1, 2, 3.0, 1.2, 8, 100], + name="New name", + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame( + {"a": [1, 2, 4, 10, 11, 12]}, + index=["a", "b", "c", "d", "e", "f"], + ), + ] + * 7, + marks=pytest.mark.skipif( + not PANDAS_GE_220, + reason="https://github.com/pandas-dev/pandas/pull/56365", + ), + ), + ], +) +def test_concat_series_dataframe_input(objs): + pd_objs = objs + gd_objs = [cudf.from_pandas(obj) for obj in objs] + + with _hide_concat_empty_dtype_warning(): + expected = pd.concat(pd_objs) + actual = cudf.concat(gd_objs) + + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + "objs", + [ + [ + pd.Series(["a", "b", "c", "d"]), + pd.Series(["1", "2", "3", "4"]), + pd.DataFrame({"first col": ["10", "11", "12", "13"]}), + ], + [ + pd.Series(["a", "b", "c", "d"]), + pd.Series(["1", "2", "3", "4"]), + pd.DataFrame( + { + "first col": ["10", "11", "12", "13"], + "second col": ["a", "b", "c", "d"], + } + ), + ], + [ + pd.Series(["a", "b", "c"]), + pd.Series(["1", "2", "3", "4"]), + pd.DataFrame( + { + "first col": ["10", "11", "12", "13"], + "second col": ["a", "b", "c", "d"], + } + ), + ], + ], +) +def test_concat_series_dataframe_input_str(objs): + pd_objs = objs + gd_objs = [cudf.from_pandas(obj) for obj in objs] + + expected = pd.concat(pd_objs) + actual = cudf.concat(gd_objs) + assert_eq(expected, actual, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame(), + pd.DataFrame(index=[10, 20, 30]), + pd.DataFrame( + {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), + pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame({"l": [10]}), + pd.DataFrame({"l": [10]}, index=[200]), + pd.DataFrame([], index=[100]), + pd.DataFrame({"cat": pd.Series(["one", "two"], dtype="category")}), + ], +) +@pytest.mark.parametrize( + "other", + [ + [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], + [ + pd.DataFrame( + {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), + ], + [ + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame({"l": [10]}), + pd.DataFrame({"l": [10]}, index=[200]), + pd.DataFrame( + {"cat": pd.Series(["two", "three"], dtype="category")} + ), + ], + [ + pd.DataFrame([]), + pd.DataFrame([], index=[100]), + 
pd.DataFrame( + {"cat": pd.Series(["two", "three"], dtype="category")} + ), + ], + ], +) +def test_concat_empty_dataframes(df, other, ignore_index): + other_pd = [df, *other] + + gdf = cudf.from_pandas(df) + other_gd = [gdf] + [cudf.from_pandas(o) for o in other] + + expected = pd.concat(other_pd, ignore_index=ignore_index) + actual = cudf.concat(other_gd, ignore_index=ignore_index) + if expected.shape != df.shape: + for key, col in actual[actual.columns].items(): + if isinstance(col.dtype, cudf.CategoricalDtype): + if not isinstance(expected[key].dtype, pd.CategoricalDtype): + # TODO: Pandas bug: + # https://github.com/pandas-dev/pandas/issues/42840 + expected[key] = expected[key].fillna("-1").astype("str") + else: + expected[key] = ( + expected[key] + .cat.add_categories(["-1"]) + .fillna("-1") + .astype("str") + ) + actual[key] = col.astype("str").fillna("-1") + else: + expected[key] = expected[key].fillna(-1) + actual[key] = col.fillna(-1) + assert_eq(expected, actual, check_dtype=False, check_index_type=True) + else: + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=False, + ) + + +@pytest.mark.parametrize( + "data", + [ + (["a", "b", "c"], ["a", "b", "c"]), + (["a", "b", "c"], ["XX", "YY", "ZZ"]), + ], +) +def test_concat_empty_and_nonempty_series(ignore_index, data, axis_0): + s1 = cudf.Series() + s2 = cudf.Series(data[0]) + ps1 = s1.to_pandas() + ps2 = s2.to_pandas() + got = cudf.concat([s1, s2], axis=axis_0, ignore_index=ignore_index) + expect = pd.concat([ps1, ps2], axis=axis_0, ignore_index=ignore_index) + + assert_eq(got, expect, check_index_type=True) + + +def test_concat_two_empty_series(ignore_index, axis_0): + s1 = cudf.Series() + s2 = cudf.Series() + ps1 = s1.to_pandas() + ps2 = s2.to_pandas() + got = cudf.concat([s1, s2], axis=axis_0, ignore_index=ignore_index) + expect = pd.concat([ps1, ps2], axis=axis_0, ignore_index=ignore_index) + + assert_eq(got, expect, check_index_type=True) + + +@pytest.mark.parametrize( + "key2", [[0, 1], [1, 0]], ids=["matching", "different"] +) +def test_concat_dataframe_with_multiindex(key2): + gdf1 = cudf.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}) + gdf1 = gdf1.set_index(["k1", "k2"]) + + gdf2 = cudf.DataFrame({"k1": key2, "k2": [3, 2], "v2": [6, 7]}) + gdf2 = gdf2.set_index(["k1", "k2"]) + + pdf1 = gdf1.to_pandas() + pdf2 = gdf2.to_pandas() + + actual = cudf.concat([gdf1, gdf2], axis=1) + expected = pd.concat([pdf1, pdf2], axis=1) + + # Will need to sort_index before comparing as + # ordering is not deterministic in case of pandas + # multiIndex with concat. 
+ assert_eq( + expected.sort_index(), + actual.sort_index(), + check_index_type=True, + ) + + +@pytest.mark.parametrize( + "objs", + [ + [ + pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + ), + pd.DataFrame( + {"x": range(10, 20), "y": list(map(float, range(10, 20)))} + ), + ], + [ + pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + }, + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + ), + pd.DataFrame( + {"x": range(10, 20), "y": list(map(float, range(10, 20)))}, + index=["k", "l", "m", "n", "o", "p", "q", "r", "s", "t"], + ), + pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + }, + index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + ), + pd.DataFrame( + {"x": range(10, 20), "y": list(map(float, range(10, 20)))}, + index=["a", "b", "c", "d", "z", "f", "g", "h", "i", "w"], + ), + ], + ], +) +def test_concat_join(objs, ignore_index, sort, join, axis): + axis = 0 + gpu_objs = [cudf.from_pandas(o) for o in objs] + + assert_eq( + pd.concat( + objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis + ), + cudf.concat( + gpu_objs, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ), + check_index_type=True, + ) + + +@pytest.mark.parametrize( + "objs", + [ + [ + pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + ), + pd.DataFrame( + {"x": range(10, 20), "y": list(map(float, range(10, 20)))} + ), + ], + ], +) +def test_concat_join_axis_1_dup_error(objs): + gpu_objs = [cudf.from_pandas(o) for o in objs] + # we do not support duplicate columns + with pytest.raises(NotImplementedError): + assert_eq( + pd.concat( + objs, + axis=1, + ), + cudf.concat( + gpu_objs, + axis=1, + ), + ) + + +@pytest.mark.parametrize( + "objs", + [ + [ + pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + ), + pd.DataFrame( + {"l": range(10, 20), "m": list(map(float, range(10, 20)))} + ), + ], + ], +) +def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): + # no duplicate columns + axis = 1 + gpu_objs = [cudf.from_pandas(o) for o in objs] + expected = pd.concat( + objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis + ) + actual = cudf.concat( + gpu_objs, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + + assert_eq(expected, actual, check_index_type=True) + + +def test_concat_join_many_df_and_empty_df(ignore_index, sort, join, axis): + # no duplicate columns + pdf1 = pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + ) + pdf2 = pd.DataFrame( + {"l": range(10, 20), "m": list(map(float, range(10, 20)))} + ) + pdf3 = pd.DataFrame({"j": [1, 2], "k": [1, 2], "s": [1, 2], "t": [1, 2]}) + pdf_empty1 = pd.DataFrame() + + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) + gdf3 = cudf.from_pandas(pdf3) + gdf_empty1 = cudf.from_pandas(pdf_empty1) + + with _hide_concat_empty_dtype_warning(): + assert_eq( + pd.concat( + [pdf1, pdf2, pdf3, pdf_empty1], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ), + cudf.concat( + [gdf1, gdf2, gdf3, gdf_empty1], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ), + check_index_type=False, + ) + + +def test_concat_join_one_df(ignore_index, sort, join, axis): 
+ pdf1 = pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + ) + + gdf1 = cudf.from_pandas(pdf1) + expected = pd.concat( + [pdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis + ) + actual = cudf.concat( + [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis + ) + + assert_eq(expected, actual, check_index_type=True) + + +@pytest.mark.parametrize( + "pdf1,pdf2", + [ + ( + pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), + pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}), + ), + ( + pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["p", "q", "r"] + ), + pd.DataFrame( + {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["r", "p", "z"] + ), + ), + ], +) +def test_concat_join_no_overlapping_columns( + pdf1, pdf2, ignore_index, sort, join, axis +): + gdf1 = cudf.from_pandas(pdf1) + gdf2 = cudf.from_pandas(pdf2) + + expected = pd.concat( + [pdf1, pdf2], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = cudf.concat( + [gdf1, gdf2], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + + assert_eq(expected, actual, check_index_type=True) + + +def test_concat_join_no_overlapping_columns_many_and_empty( + ignore_index, sort, join, axis +): + pdf4 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + pdf5 = pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}) + pdf6 = pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + ) + pdf_empty = pd.DataFrame() + + gdf4 = cudf.from_pandas(pdf4) + gdf5 = cudf.from_pandas(pdf5) + gdf6 = cudf.from_pandas(pdf6) + gdf_empty = cudf.from_pandas(pdf_empty) + + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf4, pdf5, pdf6, pdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = cudf.concat( + [gdf4, gdf5, gdf6, gdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + assert_eq( + expected, + actual, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + "objs", + [ + [ + pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["z", "t", "k"] + ), + pd.DataFrame( + {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["z", "t", "k"] + ), + pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + }, + index=["z", "t", "k", "a", "b", "c", "d", "e", "f", "g"], + ), + pd.DataFrame(index=pd.Index([], dtype="str")), + ], + [ + pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), + pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}), + pd.DataFrame( + { + "x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + ), + pd.DataFrame(index=pd.Index([], dtype="str")), + ], + pytest.param( + [ + pd.DataFrame( + {"a": [1, 2, 3], "nb": [10, 11, 12]}, index=["Q", "W", "R"] + ), + None, + ], + ), + ], +) +def test_concat_join_no_overlapping_columns_many_and_empty2( + objs, ignore_index, sort, join, axis +): + objs_gd = [cudf.from_pandas(o) if o is not None else o for o in objs] + + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + objs, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = cudf.concat( + objs_gd, + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + assert_eq(expected, actual, check_index_type=False) + + +def test_concat_join_no_overlapping_columns_empty_df_basic( + ignore_index, sort, join, axis +): + pdf6 = pd.DataFrame( + { + 
"x": range(10), + "y": list(map(float, range(10))), + "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + ) + pdf_empty = pd.DataFrame() + + gdf6 = cudf.from_pandas(pdf6) + gdf_empty = cudf.from_pandas(pdf_empty) + + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf6, pdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = cudf.concat( + [gdf6, gdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + assert_eq( + expected, + actual, + check_index_type=True, + check_column_type=False, + ) + + +def test_concat_join_series(ignore_index, sort, join, axis): + s1 = cudf.Series(["a", "b", "c"]) + s2 = cudf.Series(["a", "b"]) + s3 = cudf.Series(["a", "b", "c", "d"]) + s4 = cudf.Series(dtype="str") + + ps1 = s1.to_pandas() + ps2 = s2.to_pandas() + ps3 = s3.to_pandas() + ps4 = s4.to_pandas() + + expected = pd.concat( + [ps1, ps2, ps3, ps4], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + with expect_warning_if(axis in {1, "columns"}): + actual = cudf.concat( + [s1, s2, s3, s4], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + + assert_eq( + expected, + actual, + check_index_type=True, + ) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame(), + pd.DataFrame(index=[10, 20, 30]), + pd.DataFrame( + {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), + pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame({"l": [10]}), + pd.DataFrame({"l": [10]}, index=[200]), + pd.DataFrame([], index=[100]), + pd.DataFrame({"cat": pd.Series(["one", "two"], dtype="category")}), + ], +) +@pytest.mark.parametrize( + "other", + [ + [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], + [ + pd.DataFrame( + {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), + ], + [ + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame({"l": [10]}), + pd.DataFrame({"k": [10]}, index=[200]), + pd.DataFrame( + {"cat": pd.Series(["two", "three"], dtype="category")} + ), + ], + [ + pd.DataFrame([]), + pd.DataFrame([], index=[100]), + pd.DataFrame( + {"cat": pd.Series(["two", "three"], dtype="category")} + ), + ], + ], +) +def test_concat_join_empty_dataframes( + request, df, other, ignore_index, join, sort +): + axis = 0 + other_pd = [df, *other] + gdf = cudf.from_pandas(df) + other_gd = [gdf] + [cudf.from_pandas(o) for o in other] + + expected = pd.concat( + other_pd, ignore_index=ignore_index, axis=axis, join=join, sort=sort + ) + actual = cudf.concat( + other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort + ) + if ( + join == "outer" + and any( + isinstance(dtype, pd.CategoricalDtype) + for dtype in df.dtypes.tolist() + ) + and any( + isinstance(dtype, pd.CategoricalDtype) + for other_df in other + for dtype in other_df.dtypes.tolist() + ) + ): + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/42840" + ) + ) + assert_eq( + expected, + actual, + check_dtype=False, + check_column_type=False, + ) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame(), + pd.DataFrame(index=[10, 20, 30]), + pd.DataFrame( + {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), + pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame({"l": 
[10]}), + pd.DataFrame({"m": [10]}, index=[200]), + pd.DataFrame([], index=[100]), + pd.DataFrame({"cat": pd.Series(["one", "two"], dtype="category")}), + ], +) +@pytest.mark.parametrize( + "other", + [ + [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], + [ + pd.DataFrame( + {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame([[5, 6], [7, 8]], columns=list("CD")), + ], + [ + pd.DataFrame({"g": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame({"h": [10]}), + pd.DataFrame({"k": [10]}, index=[200]), + pd.DataFrame( + {"dog": pd.Series(["two", "three"], dtype="category")} + ), + ], + [ + pd.DataFrame([]), + pd.DataFrame([], index=[100]), + pd.DataFrame( + {"bird": pd.Series(["two", "three"], dtype="category")} + ), + ], + ], +) +def test_concat_join_empty_dataframes_axis_1( + df, other, ignore_index, axis, join, sort +): + # no duplicate columns + axis = 1 + other_pd = [df, *other] + gdf = cudf.from_pandas(df) + other_gd = [gdf] + [cudf.from_pandas(o) for o in other] + + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + other_pd, + ignore_index=ignore_index, + axis=axis, + join=join, + sort=sort, + ) + actual = cudf.concat( + other_gd, + ignore_index=ignore_index, + axis=axis, + join=join, + sort=sort, + ) + if expected.shape != df.shape: + if axis == 0: + for key, col in actual[actual.columns].items(): + if isinstance(expected[key].dtype, pd.CategoricalDtype): + expected[key] = expected[key].fillna("-1") + actual[key] = col.astype("str").fillna("-1") + # if not expected.empty: + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_index_type=False + if len(expected) == 0 or actual.empty + else True, + check_column_type=False, + ) + else: + # no need to fill in if axis=1 + assert_eq( + expected, + actual, + check_index_type=False, + check_column_type=False, + ) + assert_eq( + expected, actual, check_index_type=False, check_column_type=False + ) + + +def test_concat_preserve_order(): + """Ensure that order is preserved on 'inner' concatenations.""" + df = pd.DataFrame([["d", 3, 4.0], ["c", 4, 5.0]], columns=["c", "b", "a"]) + dfs = [df, df] + + assert_eq( + pd.concat(dfs, join="inner"), + cudf.concat([cudf.DataFrame(df) for df in dfs], join="inner"), + check_index_type=True, + ) + + +@pytest.mark.parametrize("typ", [cudf.DataFrame, cudf.Series]) +def test_concat_single_object(ignore_index, typ): + """Ensure that concat on a single object does not change it.""" + obj = typ([1, 2, 3]) + assert_eq( + cudf.concat([obj], ignore_index=ignore_index, axis=0), + obj, + check_index_type=True, + ) + + +@pytest.mark.parametrize( + "ltype", + [Decimal64Dtype(3, 1), Decimal64Dtype(7, 2), Decimal64Dtype(8, 4)], +) +@pytest.mark.parametrize( + "rtype", + [ + Decimal64Dtype(3, 2), + Decimal64Dtype(8, 4), + cudf.Decimal128Dtype(3, 2), + cudf.Decimal32Dtype(8, 4), + ], +) +def test_concat_decimal_dataframe(ltype, rtype): + rng = np.random.default_rng(seed=0) + gdf1 = cudf.DataFrame( + {"id": rng.integers(0, 10, 3), "val": ["22.3", "59.5", "81.1"]} + ) + gdf2 = cudf.DataFrame( + {"id": rng.integers(0, 10, 3), "val": ["2.35", "5.59", "8.14"]} + ) + + gdf1["val"] = gdf1["val"].astype(ltype) + gdf2["val"] = gdf2["val"].astype(rtype) + + pdf1 = gdf1.to_pandas() + pdf2 = gdf2.to_pandas() + + got = cudf.concat([gdf1, gdf2]) + expected = pd.concat([pdf1, pdf2]) + + assert_eq(expected, got, check_index_type=True) + + +@pytest.mark.parametrize("ltype", [Decimal64Dtype(4, 1), 
Decimal64Dtype(8, 2)]) +@pytest.mark.parametrize( + "rtype", + [ + Decimal64Dtype(4, 3), + Decimal64Dtype(10, 4), + Decimal32Dtype(8, 3), + Decimal128Dtype(18, 3), + ], +) +def test_concat_decimal_series(ltype, rtype): + gs1 = cudf.Series(["228.3", "559.5", "281.1"]).astype(ltype) + gs2 = cudf.Series(["2.345", "5.259", "8.154"]).astype(rtype) + + ps1 = gs1.to_pandas() + ps2 = gs2.to_pandas() + + got = cudf.concat([gs1, gs2]) + expected = pd.concat([ps1, ps2]) + + assert_eq(expected, got, check_index_type=True) + + +@pytest.mark.parametrize( + "data1, dtype1, index1, data2, dtype2, index2, data3, dtype3, index3, expected_data, expected_dtype, expected_index", + [ + [ + {"val": [Decimal("42.5"), Decimal("8.7")]}, + Decimal64Dtype(5, 2), + None, + {"val": [Decimal("9.23"), Decimal("-67.49")]}, + Decimal64Dtype(6, 4), + None, + {"val": [8, -5]}, + "int32", + None, + { + "val": [ + Decimal("42.5"), + Decimal("8.7"), + Decimal("9.23"), + Decimal("-67.49"), + Decimal("8"), + Decimal("-5"), + ] + }, + Decimal32Dtype(7, 4), + [0, 1, 0, 1, 0, 1], + ], + [ + {"val": [Decimal("95.2"), Decimal("23.4")]}, + Decimal64Dtype(5, 2), + None, + {"val": [54, 509]}, + "uint16", + None, + {"val": [24, -48]}, + "int32", + None, + { + "val": [ + Decimal("95.2"), + Decimal("23.4"), + Decimal("54"), + Decimal("509"), + Decimal("24"), + Decimal("-48"), + ] + }, + Decimal32Dtype(5, 2), + [0, 1, 0, 1, 0, 1], + ], + [ + {"val": [Decimal("36.56"), Decimal("-59.24")]}, + Decimal64Dtype(9, 4), + None, + {"val": [403.21, 45.13]}, + "float32", + None, + {"val": [52.262, -49.25]}, + "float64", + None, + { + "val": [ + Decimal("36.56"), + Decimal("-59.24"), + Decimal("403.21"), + Decimal("45.13"), + Decimal("52.262"), + Decimal("-49.25"), + ] + }, + Decimal32Dtype(9, 4), + [0, 1, 0, 1, 0, 1], + ], + [ + {"val": [Decimal("9563.24"), Decimal("236.633")]}, + Decimal64Dtype(9, 4), + None, + {"val": [5393, -95832]}, + "int64", + None, + {"val": [-29.234, -31.945]}, + "float64", + None, + { + "val": [ + Decimal("9563.24"), + Decimal("236.633"), + Decimal("5393"), + Decimal("-95832"), + Decimal("-29.234"), + Decimal("-31.945"), + ] + }, + Decimal32Dtype(9, 4), + [0, 1, 0, 1, 0, 1], + ], + [ + {"val": [Decimal("95633.24"), Decimal("236.633")]}, + Decimal128Dtype(19, 4), + None, + {"val": [5393, -95832]}, + "int64", + None, + {"val": [-29.234, -31.945]}, + "float64", + None, + { + "val": [ + Decimal("95633.24"), + Decimal("236.633"), + Decimal("5393"), + Decimal("-95832"), + Decimal("-29.234"), + Decimal("-31.945"), + ] + }, + Decimal128Dtype(19, 4), + [0, 1, 0, 1, 0, 1], + ], + ], +) +def test_concat_decimal_numeric_dataframe( + data1, + dtype1, + index1, + data2, + dtype2, + index2, + data3, + dtype3, + index3, + expected_data, + expected_dtype, + expected_index, +): + df1 = cudf.DataFrame(data1, dtype=dtype1, index=index1) + df2 = cudf.DataFrame(data2, dtype=dtype2, index=index2) + df3 = cudf.DataFrame(data3, dtype=dtype3, index=index3) + expected = cudf.DataFrame( + expected_data, dtype=expected_dtype, index=expected_index + ) + df = cudf.concat([df1, df2, df3]) + assert_eq(df, expected, check_index_type=True) + assert_eq(df.val.dtype, expected.val.dtype) + + +@pytest.mark.parametrize( + "data1, dtype1, index1, data2, dtype2, index2, data3, dtype3, index3, expected_data, expected_dtype, expected_index", + [ + [ + [Decimal("32.8"), Decimal("-87.7")], + Decimal64Dtype(6, 2), + None, + [Decimal("101.243"), Decimal("-92.449")], + Decimal64Dtype(9, 6), + None, + [94, -22], + "int32", + None, + [ + Decimal("32.8"), + Decimal("-87.7"), + 
Decimal("101.243"), + Decimal("-92.449"), + Decimal("94"), + Decimal("-22"), + ], + Decimal64Dtype(10, 6), + [0, 1, 0, 1, 0, 1], + ], + [ + [Decimal("7.2"), Decimal("122.1")], + Decimal64Dtype(5, 2), + None, + [33, 984], + "uint32", + None, + [593, -702], + "int32", + None, + [ + Decimal("7.2"), + Decimal("122.1"), + Decimal("33"), + Decimal("984"), + Decimal("593"), + Decimal("-702"), + ], + Decimal32Dtype(5, 2), + [0, 1, 0, 1, 0, 1], + ], + [ + [Decimal("982.94"), Decimal("-493.626")], + Decimal64Dtype(9, 4), + None, + [847.98, 254.442], + "float32", + None, + [5299.262, -2049.25], + "float64", + None, + [ + Decimal("982.94"), + Decimal("-493.626"), + Decimal("847.98"), + Decimal("254.442"), + Decimal("5299.262"), + Decimal("-2049.25"), + ], + Decimal32Dtype(9, 4), + [0, 1, 0, 1, 0, 1], + ], + [ + [Decimal("492.204"), Decimal("-72824.455")], + Decimal64Dtype(9, 4), + None, + [8438, -27462], + "int64", + None, + [-40.292, 49202.953], + "float64", + None, + [ + Decimal("492.204"), + Decimal("-72824.455"), + Decimal("8438"), + Decimal("-27462"), + Decimal("-40.292"), + Decimal("49202.953"), + ], + Decimal32Dtype(9, 4), + [0, 1, 0, 1, 0, 1], + ], + [ + [Decimal("492.204"), Decimal("-72824.455")], + Decimal64Dtype(10, 4), + None, + [Decimal("8438"), Decimal("-27462")], + Decimal32Dtype(9, 4), + None, + [Decimal("-40.292"), Decimal("49202.953")], + Decimal128Dtype(19, 4), + None, + [ + Decimal("492.204"), + Decimal("-72824.455"), + Decimal("8438"), + Decimal("-27462"), + Decimal("-40.292"), + Decimal("49202.953"), + ], + Decimal128Dtype(19, 4), + [0, 1, 0, 1, 0, 1], + ], + ], +) +def test_concat_decimal_numeric_series( + data1, + dtype1, + index1, + data2, + dtype2, + index2, + data3, + dtype3, + index3, + expected_data, + expected_dtype, + expected_index, +): + s1 = cudf.Series(data1, dtype=dtype1, index=index1) + s2 = cudf.Series(data2, dtype=dtype2, index=index2) + s3 = cudf.Series(data3, dtype=dtype3, index=index3) + expected = cudf.Series( + expected_data, dtype=expected_dtype, index=expected_index + ) + s = cudf.concat([s1, s2, s3]) + assert_eq(s, expected, check_index_type=True) + + +@pytest.mark.parametrize( + "data1, dtype1, index1, data2, dtype2, index2, expected_data, expected_dtype, expected_index", + [ + [ + [Decimal("955.22"), Decimal("8.2")], + Decimal64Dtype(5, 2), + None, + ["2007-06-12", "2006-03-14"], + "datetime64[s]", + None, + [ + "955.22", + "8.20", + "2007-06-12 00:00:00", + "2006-03-14 00:00:00", + ], + None, + [0, 1, 0, 1], + ], + [ + [Decimal("-52.44"), Decimal("365.22")], + Decimal64Dtype(5, 2), + None, + np.arange( + "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" + ).astype("datetime64[s]"), + "datetime64[s]", + None, + [ + "-52.44", + "365.22", + "2005-02-01 12:00:00", + "2005-02-01 13:00:00", + "2005-02-01 14:00:00", + ], + None, + [0, 1, 0, 1, 2], + ], + [ + [Decimal("753.0"), Decimal("94.22")], + Decimal64Dtype(5, 2), + None, + [np.timedelta64(111, "s"), np.timedelta64(509, "s")], + None, + None, + [ + "753.00", + "94.22", + "0 days 00:01:51", + "0 days 00:08:29", + ], + None, + [0, 1, 0, 1], + ], + [ + [Decimal("753.0"), Decimal("94.22")], + Decimal64Dtype(5, 2), + None, + [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")], + None, + None, + [ + "753.00", + "94.22", + "10 days 21:10:52", + "8 days 18:39:45", + ], + None, + [0, 1, 0, 1], + ], + ], +) +def test_concat_decimal_non_numeric( + data1, + dtype1, + index1, + data2, + dtype2, + index2, + expected_data, + expected_dtype, + expected_index, +): + s1 = cudf.Series(data1, dtype=dtype1, 
index=index1) + s2 = cudf.Series(data2, dtype=dtype2, index=index2) + expected = cudf.Series( + expected_data, dtype=expected_dtype, index=expected_index + ) + s = cudf.concat([s1, s2]) + assert_eq(s, expected, check_index_type=True) + + +def test_concat_struct_column(): + s1 = cudf.Series([{"a": 5}, {"c": "hello"}, {"b": 7}]) + s2 = cudf.Series([{"a": 5, "c": "hello", "b": 7}]) + expected = cudf.Series( + [ + {"a": 5, "b": None, "c": None}, + {"a": None, "b": None, "c": "hello"}, + {"a": None, "b": 7, "c": None}, + {"a": 5, "b": 7, "c": "hello"}, + ], + index=[0, 1, 2, 0], + ) + s = cudf.concat([s1, s2]) + assert_eq(s, expected, check_index_type=True) + + +@pytest.mark.parametrize( + "frame1_cls, frame1_data, frame2_cls, frame2_data, expected_cls, expected_data", + [ + ( + cudf.Series, + {"data": [[{"b": 0}], [{"b": 1}], [{"b": 3}]]}, + cudf.Series, + {"data": [[{"b": 10}], [{"b": 12}], None]}, + cudf.Series, + { + "data": [ + [{"b": 0}], + [{"b": 1}], + [{"b": 3}], + [{"b": 10}], + [{"b": 12}], + None, + ], + "index": [0, 1, 2, 0, 1, 2], + }, + ), + ( + cudf.DataFrame, + {"data": {"a": [[{"b": 0}], [{"b": 1}], [{"b": 3}]]}}, + cudf.DataFrame, + {"data": {"a": [[{"b": 10}], [{"b": 12}], None]}}, + cudf.DataFrame, + { + "data": { + "a": [ + [{"b": 0}], + [{"b": 1}], + [{"b": 3}], + [{"b": 10}], + [{"b": 12}], + None, + ] + }, + "index": [0, 1, 2, 0, 1, 2], + }, + ), + ], +) +def test_concat_list_column( + frame1_cls, + frame1_data, + frame2_cls, + frame2_data, + expected_cls, + expected_data, +): + frame1 = frame1_cls(**frame1_data) + frame2 = frame2_cls(**frame2_data) + expected = expected_cls(**expected_data) + actual = cudf.concat([frame1, frame2]) + assert_eq(actual, expected, check_index_type=True) + + +def test_concat_categorical_ordering(): + # https://github.com/rapidsai/cudf/issues/11486 + sr = pd.Series( + ["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category" + ) + sr = sr.cat.set_categories(["d", "a", "b", "c", "e"]) + + df = pd.DataFrame({"a": sr}) + gdf = cudf.from_pandas(df) + + expect = pd.concat([df, df, df]) + got = cudf.concat([gdf, gdf, gdf]) + + assert_eq(expect, got) + + +@pytest.fixture(params=["rangeindex", "index"]) +def singleton_concat_index(request): + if request.param == "rangeindex": + return pd.RangeIndex(0, 4) + else: + return pd.Index(["a", "h", "g", "f"]) + + +@pytest.fixture(params=["dataframe", "series"]) +def singleton_concat_obj(request, singleton_concat_index): + if request.param == "dataframe": + return pd.DataFrame( + { + "b": [1, 2, 3, 4], + "d": [7, 8, 9, 10], + "a": [4, 5, 6, 7], + "c": [10, 11, 12, 13], + }, + index=singleton_concat_index, + ) + else: + return pd.Series([4, 5, 5, 6], index=singleton_concat_index) + + +def test_concat_singleton_sorting( + axis, sort, ignore_index, singleton_concat_obj +): + gobj = cudf.from_pandas(singleton_concat_obj) + gconcat = cudf.concat( + [gobj], axis=axis, sort=sort, ignore_index=ignore_index + ) + pconcat = pd.concat( + [singleton_concat_obj], axis=axis, sort=sort, ignore_index=ignore_index + ) + assert_eq(pconcat, gconcat) + + +@pytest.mark.parametrize("axis", [2, "invalid"]) +def test_concat_invalid_axis(axis): + s = cudf.Series([1, 2, 3]) + with pytest.raises(ValueError): + cudf.concat([s], axis=axis) + + +@pytest.mark.parametrize( + "s1,s2", + [ + ([1, 2], [[1, 2], [3, 4]]), + ], +) +def test_concat_mixed_list_types_error(s1, s2): + s1, s2 = cudf.Series(s1), cudf.Series(s2) + + with pytest.raises(NotImplementedError): + cudf.concat([s1, s2], ignore_index=True) + + 
+@pytest.mark.parametrize( + "axis", + [ + pytest.param( + 0, + marks=pytest.mark.xfail( + reason="concat dictionaries with axis=0 not implemented" + ), + ), + 1, + "columns", + ], +) +@pytest.mark.parametrize( + "d", + [ + {"first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}})}, + { + "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), + "second": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}), + "third": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), + }, + { + "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), + "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), + "third": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}), + }, + { + "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), + "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), + "third": (cudf.DataFrame, {"data": {"A": [5, 6], "C": [7, 8]}}), + "fourth": (cudf.DataFrame, {"data": {"B": [9, 10]}}), + }, + pytest.param( + { + "first": (cudf.DataFrame, {"data": {2.0: [1, 1]}}), + "second": (cudf.DataFrame, {"data": {"test": ["abc", "def"]}}), + }, + marks=pytest.mark.xfail( + reason=( + "Cannot construct a MultiIndex column with multiple " + "label types in cuDF at this time. You must convert " + "the labels to the same type." + ) + ), + ), + { + "first": (cudf.Series, {"data": [1, 2, 3]}), + "second": (cudf.Series, {"data": [4, 5, 6]}), + }, + { + "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), + "second": (cudf.Series, {"data": [5, 6], "name": "C"}), + }, + pytest.param( + { + "first": ( + cudf.DataFrame, + {"data": {("A", "B"): [1, 2], "C": [3, 4]}}, + ), + "second": ( + cudf.DataFrame, + {"data": {"D": [5, 6], ("A", "B"): [7, 8]}}, + ), + }, + marks=pytest.mark.xfail( + reason=( + "Cannot construct a MultiIndex column with multiple " + "label types in cuDF at this time. You must convert " + "the labels to the same type." + ) + ), + ), + pytest.param( + { + "first": ( + cudf.DataFrame, + {"data": {("A", "B"): [3, 4], 2.0: [1, 1]}}, + ), + "second": ( + cudf.DataFrame, + {"data": {("C", "D"): [3, 4], 3.0: [5, 6]}}, + ), + }, + marks=pytest.mark.xfail( + reason=( + "Cannot construct a MultiIndex column with multiple " + "label types in cuDF at this time. You must convert " + "the labels to the same type." + ) + ), + ), + { + "first": ( + cudf.DataFrame, + {"data": {(1, 2): [1, 2], (3, 4): [3, 4]}}, + ), + "second": ( + cudf.DataFrame, + {"data": {(1, 2): [5, 6], (5, 6): [7, 8]}}, + ), + }, + ], +) +def test_concat_dictionary(d, axis): + _dict = {k: c(**v) for k, (c, v) in d.items()} + result = cudf.concat(_dict, axis=axis) + expected = cudf.from_pandas( + pd.concat({k: df.to_pandas() for k, df in _dict.items()}, axis=axis) + ) + assert_eq(expected, result) + + +@pytest.mark.parametrize( + "idx_cls, idx_data", + [ + [cudf.Index, {"data": [1, 2, 3]}], + [ + cudf.MultiIndex, + { + "levels": [[1, 2], ["blue", "red"]], + "codes": [[0, 0, 1, 1], [1, 0, 1, 0]], + }, + ], + [cudf.CategoricalIndex, {"data": [1, 2, 3]}], + ], +) +def test_concat_dict_incorrect_type_index(idx_cls, idx_data): + idx = idx_cls(**idx_data) + with pytest.raises( + TypeError, + match="cannot concatenate a dictionary containing indices", + ): + cudf.concat({"first": idx}, axis=1) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py deleted file mode 100644 index a2657228b94..00000000000 --- a/python/cudf/cudf/tests/test_concat.py +++ /dev/null @@ -1,2060 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
- -import warnings -from contextlib import contextmanager -from decimal import Decimal - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_GE_220 -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -@pytest.fixture(params=[True, False]) -def ignore_index(request): - return request.param - - -@pytest.fixture(params=[True, False]) -def sort(request): - return request.param - - -@pytest.fixture(params=["outer", "inner"]) -def join(request): - return request.param - - -@pytest.fixture(params=[0, "index", 1, "columns"]) -def axis(request): - return request.param - - -@contextmanager -def _hide_concat_empty_dtype_warning(): - with warnings.catch_warnings(): - # Ignoring warnings in this test as warnings are - # being caught and validated in other tests. - warnings.filterwarnings( - "ignore", - "The behavior of array concatenation with empty entries " - "is deprecated.", - category=FutureWarning, - ) - yield - - -def make_frames(index=None, nulls="none"): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": list("abcde") * 2, - } - ) - df.z = df.z.astype("category") - df2 = pd.DataFrame( - { - "x": range(10, 20), - "y": list(map(float, range(10, 20))), - "z": list("edcba") * 2, - } - ) - df2.z = df2.z.astype("category") - if nulls == "all": - df.y = np.full_like(df.y, np.nan) - df2.y = np.full_like(df2.y, np.nan) - if nulls == "some": - mask = np.arange(10) - rng.shuffle(mask) - mask = mask[:5] - df.loc[mask, "y"] = np.nan - df2.loc[mask, "y"] = np.nan - gdf = cudf.DataFrame.from_pandas(df) - gdf2 = cudf.DataFrame.from_pandas(df2) - if index: - df = df.set_index(index) - df2 = df2.set_index(index) - gdf = gdf.set_index(index) - gdf2 = gdf2.set_index(index) - return df, df2, gdf, gdf2 - - -@pytest.mark.parametrize("nulls", ["none", "some", "all"]) -@pytest.mark.parametrize("index", [False, "z", "y"]) -@pytest.mark.parametrize("axis", [0, "index"]) -def test_concat_dataframe(index, nulls, axis): - if index == "y" and nulls in ("some", "all"): - pytest.skip("nulls in columns, dont index") - df, df2, gdf, gdf2 = make_frames(index, nulls=nulls) - # Make empty frame - gdf_empty1 = gdf2[:0] - assert len(gdf_empty1) == 0 - df_empty1 = gdf_empty1.to_pandas() - - # DataFrame - with _hide_concat_empty_dtype_warning(): - res = cudf.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas() - sol = pd.concat([df, df2, df, df_empty1], axis=axis) - assert_eq( - res, - sol, - check_names=False, - check_categorical=False, - check_index_type=True, - ) - - # Series - for c in [i for i in ("x", "y", "z") if i != index]: - res = cudf.concat([gdf[c], gdf2[c], gdf[c]], axis=axis).to_pandas() - sol = pd.concat([df[c], df2[c], df[c]], axis=axis) - assert_eq( - res, - sol, - check_names=False, - check_categorical=False, - check_index_type=True, - ) - - # Index - res = cudf.concat([gdf.index, gdf2.index], axis=axis).to_pandas() - sol = df.index.append(df2.index) - assert_eq(res, sol, check_names=False, check_categorical=False) - - -@pytest.mark.parametrize( - "values", - [["foo", "bar"], [1.0, 2.0], pd.Series(["one", "two"], dtype="category")], -) -def test_concat_all_nulls(values): - pa = pd.Series(values) - pb = pd.Series([None]) - ps = pd.concat([pa, pb]) - - ga = cudf.Series(values) - gb = cudf.Series([None]) - gs = cudf.concat([ga, gb]) - - 
assert_eq( - ps, - gs, - check_dtype=False, - check_categorical=False, - check_index_type=True, - ) - - -def test_concat_errors(): - df, df2, gdf, gdf2 = make_frames() - - # No objs - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": []}), - rfunc_args_and_kwargs=([], {"objs": []}), - ) - - # All None - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": [None, None]}), - rfunc_args_and_kwargs=([], {"objs": [None, None]}), - ) - - # Mismatched types - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}), - rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}), - ) - - # Unknown type - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": ["bar", "foo"]}), - rfunc_args_and_kwargs=([], {"objs": ["bar", "foo"]}), - ) - - # Mismatched index dtypes - gdf3 = gdf2.copy() - del gdf3["z"] - gdf4 = gdf2.set_index("z") - - with pytest.raises(ValueError, match="All columns must be the same type"): - cudf.concat([gdf3, gdf4]) - - # Bad axis value - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=( - [], - {"objs": [gdf.to_pandas(), gdf2.to_pandas()], "axis": "bad_value"}, - ), - rfunc_args_and_kwargs=([], {"objs": [gdf, gdf2], "axis": "bad_value"}), - ) - - -def test_concat_misordered_columns(): - df, df2, gdf, gdf2 = make_frames(False) - gdf2 = gdf2[["z", "x", "y"]] - df2 = df2[["z", "x", "y"]] - - res = cudf.concat([gdf, gdf2]).to_pandas() - sol = pd.concat([df, df2], sort=False) - - assert_eq( - res, - sol, - check_names=False, - check_categorical=False, - check_index_type=True, - ) - - -@pytest.mark.parametrize("axis", [1, "columns"]) -def test_concat_columns(axis): - rng = np.random.default_rng(seed=0) - pdf1 = pd.DataFrame(rng.integers(10, size=(5, 3)), columns=[1, 2, 3]) - pdf2 = pd.DataFrame(rng.integers(10, size=(5, 4)), columns=[4, 5, 6, 7]) - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - - expect = pd.concat([pdf1, pdf2], axis=axis) - got = cudf.concat([gdf1, gdf2], axis=axis) - - assert_eq(expect, got, check_index_type=True) - - -def test_concat_multiindex_dataframe(axis): - gdf = cudf.DataFrame( - { - "w": np.arange(4), - "x": np.arange(4), - "y": np.arange(4), - "z": np.arange(4), - } - ) - gdg = gdf.groupby(["w", "x"]).min() - pdg = gdg.to_pandas() - pdg1 = pdg.iloc[:, :1] - pdg2 = pdg.iloc[:, 1:] - gdg1 = cudf.from_pandas(pdg1) - gdg2 = cudf.from_pandas(pdg2) - expected = pd.concat([pdg1, pdg2], axis=axis) - result = cudf.concat([gdg1, gdg2], axis=axis) - assert_eq( - expected, - result, - check_index_type=True, - ) - - -def test_concat_multiindex_series(): - gdf = cudf.DataFrame( - { - "w": np.arange(4), - "x": np.arange(4), - "y": np.arange(4), - "z": np.arange(4), - } - ) - gdg = gdf.groupby(["w", "x"]).min() - pdg = gdg.to_pandas() - pdg1 = pdg["y"] - pdg2 = pdg["z"] - gdg1 = cudf.from_pandas(pdg1) - gdg2 = cudf.from_pandas(pdg2) - assert_eq( - cudf.concat([gdg1, gdg2]), - pd.concat([pdg1, pdg2]), - check_index_type=True, - ) - assert_eq( - cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1) - ) - - -def test_concat_multiindex_dataframe_and_series(): - gdf = cudf.DataFrame( - { - "w": np.arange(4), - "x": np.arange(4), - "y": np.arange(4), - "z": np.arange(4), - } - ) - gdg = gdf.groupby(["w", "x"]).min() - pdg = gdg.to_pandas() - pdg1 = pdg[["y", "z"]] - pdg2 = pdg["z"] - 
pdg2.name = "a" - gdg1 = cudf.from_pandas(pdg1) - gdg2 = cudf.from_pandas(pdg2) - assert_eq( - cudf.concat([gdg1, gdg2], axis=1), - pd.concat([pdg1, pdg2], axis=1), - check_index_type=True, - ) - - -def test_concat_multiindex_series_and_dataframe(): - gdf = cudf.DataFrame( - { - "w": np.arange(4), - "x": np.arange(4), - "y": np.arange(4), - "z": np.arange(4), - } - ) - gdg = gdf.groupby(["w", "x"]).min() - pdg = gdg.to_pandas() - pdg1 = pdg["z"] - pdg2 = pdg[["y", "z"]] - pdg1.name = "a" - gdg1 = cudf.from_pandas(pdg1) - gdg2 = cudf.from_pandas(pdg2) - assert_eq( - cudf.concat([gdg1, gdg2], axis=1), - pd.concat([pdg1, pdg2], axis=1), - check_index_type=True, - ) - - -@pytest.mark.parametrize("myindex", ["a", "b"]) -def test_concat_string_index_name(myindex): - # GH-Issue #3420 - data = {"a": [123, 456], "b": ["s1", "s2"]} - df1 = cudf.DataFrame(data).set_index(myindex) - df2 = df1.copy() - df3 = cudf.concat([df1, df2]) - - assert df3.index.name == myindex - - -def test_pandas_concat_compatibility_axis1(): - d1 = cudf.datasets.randomdata( - 3, dtypes={"a": float, "ind": float} - ).set_index("ind") - d2 = cudf.datasets.randomdata( - 3, dtypes={"b": float, "ind": float} - ).set_index("ind") - d3 = cudf.datasets.randomdata( - 3, dtypes={"c": float, "ind": float} - ).set_index("ind") - d4 = cudf.datasets.randomdata( - 3, dtypes={"d": float, "ind": float} - ).set_index("ind") - d5 = cudf.datasets.randomdata( - 3, dtypes={"e": float, "ind": float} - ).set_index("ind") - - pd1 = d1.to_pandas() - pd2 = d2.to_pandas() - pd3 = d3.to_pandas() - pd4 = d4.to_pandas() - pd5 = d5.to_pandas() - - expect = pd.concat([pd1, pd2, pd3, pd4, pd5], axis=1) - got = cudf.concat([d1, d2, d3, d4, d5], axis=1) - - assert_eq( - got.sort_index(), - expect.sort_index(), - check_index_type=True, - ) - - -@pytest.mark.parametrize("index", [[0, 1, 2], [2, 1, 0], [5, 9, 10]]) -@pytest.mark.parametrize("names", [False, (0, 1)]) -@pytest.mark.parametrize( - "data", - [ - (["a", "b", "c"], ["a", "b", "c"]), - (["a", "b", "c"], ["XX", "YY", "ZZ"]), - ], -) -def test_pandas_concat_compatibility_axis1_overlap(index, names, data): - s1 = cudf.Series(data[0], index=[0, 1, 2]) - s2 = cudf.Series(data[1], index=index) - if names: - s1.name = names[0] - s2.name = names[1] - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - got = cudf.concat([s1, s2], axis=1) - expect = pd.concat([ps1, ps2], axis=1) - assert_eq(got, expect, check_index_type=True) - - -def test_pandas_concat_compatibility_axis1_eq_index(): - s1 = cudf.Series(["a", "b", "c"], index=[0, 1, 2]) - s2 = cudf.Series(["a", "b", "c"], index=[1, 1, 1]) - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": [ps1, ps2], "axis": 1}), - rfunc_args_and_kwargs=([], {"objs": [s1, s2], "axis": 1}), - ) - - -@pytest.mark.parametrize("name", [None, "a"]) -def test_pandas_concat_compatibility_axis1_single_column(name): - # Pandas renames series name `None` to 0 - # and preserves anything else - s = cudf.Series([1, 2, 3], name=name) - got = cudf.concat([s], axis=1) - expected = pd.concat([s.to_pandas()], axis=1) - assert_eq(expected, got) - - -def test_concat_duplicate_columns(): - cdf = cudf.DataFrame( - { - "id4": 4 * list(range(6)), - "id5": 4 * list(reversed(range(6))), - "v3": 6 * list(range(4)), - } - ) - cdf_std = cdf.groupby(["id4", "id5"])[["v3"]].std() - cdf_med = cdf.groupby(["id4", "id5"])[["v3"]].quantile(q=0.5) - with pytest.raises(NotImplementedError): - cudf.concat([cdf_med, 
cdf_std], axis=1) - - -def test_concat_mixed_input(): - pdf1 = pd.DataFrame({"a": [10, 20, 30]}) - pdf2 = pd.DataFrame({"a": [11, 22, 33]}) - - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - - assert_eq( - pd.concat([pdf1, None, pdf2, None]), - cudf.concat([gdf1, None, gdf2, None]), - check_index_type=True, - ) - assert_eq( - pd.concat([pdf1, None]), - cudf.concat([gdf1, None]), - check_index_type=True, - ) - assert_eq( - pd.concat([None, pdf2]), - cudf.concat([None, gdf2]), - check_index_type=True, - ) - assert_eq( - pd.concat([None, pdf2, pdf1]), - cudf.concat([None, gdf2, gdf1]), - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [pd.Series([1, 2, 3]), pd.DataFrame({"a": [1, 2]})], - [pd.Series([1, 2, 3]), pd.DataFrame({"a": []})], - [pd.Series([], dtype="float64"), pd.DataFrame({"a": []})], - [pd.Series([], dtype="float64"), pd.DataFrame({"a": [1, 2]})], - pytest.param( - [ - pd.Series([1, 2, 3.0, 1.2], name="abc"), - pd.DataFrame({"a": [1, 2]}), - ], - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - pytest.param( - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] - ), - pd.DataFrame({"a": [1, 2]}), - ], - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - pytest.param( - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] - ), - pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), - ], - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - pytest.param( - [ - pd.Series( - [1, 2, 3.0, 1.2, 8, 100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], - ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], - ), - ], - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - pytest.param( - [ - pd.Series( - [1, 2, 3.0, 1.2, 8, 100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], - ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], - ), - ] - * 7, - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - ], -) -def test_concat_series_dataframe_input(objs): - pd_objs = objs - gd_objs = [cudf.from_pandas(obj) for obj in objs] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat(pd_objs) - actual = cudf.concat(gd_objs) - - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.Series(["a", "b", "c", "d"]), - pd.Series(["1", "2", "3", "4"]), - pd.DataFrame({"first col": ["10", "11", "12", "13"]}), - ], - [ - pd.Series(["a", "b", "c", "d"]), - pd.Series(["1", "2", "3", "4"]), - pd.DataFrame( - { - "first col": ["10", "11", "12", "13"], - "second col": ["a", "b", "c", "d"], - } - ), - ], - [ - pd.Series(["a", "b", "c"]), - pd.Series(["1", "2", "3", "4"]), - pd.DataFrame( - { - "first col": ["10", "11", "12", "13"], - "second col": ["a", "b", "c", "d"], - } - ), - ], - ], -) -def test_concat_series_dataframe_input_str(objs): - pd_objs = objs - gd_objs = [cudf.from_pandas(obj) for obj in objs] - - expected = pd.concat(pd_objs) - actual = cudf.concat(gd_objs) - assert_eq(expected, actual, check_dtype=False, check_index_type=False) - - -@pytest.mark.parametrize( - "df", - [ - 
pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame([], index=[100]), - pd.DataFrame({"cat": pd.Series(["one", "two"], dtype="category")}), - ], -) -@pytest.mark.parametrize( - "other", - [ - [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], - [ - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - ], - [ - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), - ], - [ - pd.DataFrame([]), - pd.DataFrame([], index=[100]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), - ], - ], -) -def test_concat_empty_dataframes(df, other, ignore_index): - other_pd = [df, *other] - - gdf = cudf.from_pandas(df) - other_gd = [gdf] + [cudf.from_pandas(o) for o in other] - - expected = pd.concat(other_pd, ignore_index=ignore_index) - actual = cudf.concat(other_gd, ignore_index=ignore_index) - if expected.shape != df.shape: - for key, col in actual[actual.columns].items(): - if isinstance(col.dtype, cudf.CategoricalDtype): - if not isinstance(expected[key].dtype, pd.CategoricalDtype): - # TODO: Pandas bug: - # https://github.com/pandas-dev/pandas/issues/42840 - expected[key] = expected[key].fillna("-1").astype("str") - else: - expected[key] = ( - expected[key] - .cat.add_categories(["-1"]) - .fillna("-1") - .astype("str") - ) - actual[key] = col.astype("str").fillna("-1") - else: - expected[key] = expected[key].fillna(-1) - actual[key] = col.fillna(-1) - assert_eq(expected, actual, check_dtype=False, check_index_type=True) - else: - assert_eq( - expected, - actual, - check_index_type=not gdf.empty, - check_column_type=False, - ) - - -@pytest.mark.parametrize("axis", [0, "index"]) -@pytest.mark.parametrize( - "data", - [ - (["a", "b", "c"], ["a", "b", "c"]), - (["a", "b", "c"], ["XX", "YY", "ZZ"]), - ], -) -def test_concat_empty_and_nonempty_series(ignore_index, data, axis): - s1 = cudf.Series() - s2 = cudf.Series(data[0]) - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - got = cudf.concat([s1, s2], axis=axis, ignore_index=ignore_index) - expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index) - - assert_eq(got, expect, check_index_type=True) - - -@pytest.mark.parametrize("axis", [0, "index"]) -def test_concat_two_empty_series(ignore_index, axis): - s1 = cudf.Series() - s2 = cudf.Series() - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - got = cudf.concat([s1, s2], axis=axis, ignore_index=ignore_index) - expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index) - - assert_eq(got, expect, check_index_type=True) - - -@pytest.mark.parametrize( - "key2", [[0, 1], [1, 0]], ids=["matching", "different"] -) -def test_concat_dataframe_with_multiindex(key2): - gdf1 = cudf.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}) - gdf1 = gdf1.set_index(["k1", "k2"]) - - gdf2 = cudf.DataFrame({"k1": key2, "k2": [3, 2], "v2": [6, 7]}) - gdf2 = gdf2.set_index(["k1", "k2"]) - - pdf1 = gdf1.to_pandas() - pdf2 = gdf2.to_pandas() - - actual = cudf.concat([gdf1, gdf2], axis=1) - expected = pd.concat([pdf1, 
pdf2], axis=1) - - # Will need to sort_index before comparing as - # ordering is not deterministic in case of pandas - # multiIndex with concat. - assert_eq( - expected.sort_index(), - actual.sort_index(), - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))} - ), - ], - [ - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - }, - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], - ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))}, - index=["k", "l", "m", "n", "o", "p", "q", "r", "s", "t"], - ), - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - }, - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], - ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))}, - index=["a", "b", "c", "d", "z", "f", "g", "h", "i", "w"], - ), - ], - ], -) -def test_concat_join(objs, ignore_index, sort, join, axis): - axis = 0 - gpu_objs = [cudf.from_pandas(o) for o in objs] - - assert_eq( - pd.concat( - objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis - ), - cudf.concat( - gpu_objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))} - ), - ], - ], -) -def test_concat_join_axis_1_dup_error(objs): - gpu_objs = [cudf.from_pandas(o) for o in objs] - # we do not support duplicate columns - with pytest.raises(NotImplementedError): - assert_eq( - pd.concat( - objs, - axis=1, - ), - cudf.concat( - gpu_objs, - axis=1, - ), - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ), - pd.DataFrame( - {"l": range(10, 20), "m": list(map(float, range(10, 20)))} - ), - ], - ], -) -def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): - # no duplicate columns - axis = 1 - gpu_objs = [cudf.from_pandas(o) for o in objs] - expected = pd.concat( - objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis - ) - actual = cudf.concat( - gpu_objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - - assert_eq(expected, actual, check_index_type=True) - - -def test_concat_join_many_df_and_empty_df(ignore_index, sort, join, axis): - # no duplicate columns - pdf1 = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ) - pdf2 = pd.DataFrame( - {"l": range(10, 20), "m": list(map(float, range(10, 20)))} - ) - pdf3 = pd.DataFrame({"j": [1, 2], "k": [1, 2], "s": [1, 2], "t": [1, 2]}) - pdf_empty1 = pd.DataFrame() - - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - gdf3 = cudf.from_pandas(pdf3) - gdf_empty1 = cudf.from_pandas(pdf_empty1) - - with _hide_concat_empty_dtype_warning(): - assert_eq( - pd.concat( - [pdf1, pdf2, pdf3, pdf_empty1], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - cudf.concat( - [gdf1, gdf2, gdf3, gdf_empty1], - sort=sort, - 
join=join, - ignore_index=ignore_index, - axis=axis, - ), - check_index_type=False, - ) - - -def test_concat_join_one_df(ignore_index, sort, join, axis): - pdf1 = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ) - - gdf1 = cudf.from_pandas(pdf1) - expected = pd.concat( - [pdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis - ) - actual = cudf.concat( - [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis - ) - - assert_eq(expected, actual, check_index_type=True) - - -@pytest.mark.parametrize( - "pdf1,pdf2", - [ - ( - pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), - pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}), - ), - ( - pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["p", "q", "r"] - ), - pd.DataFrame( - {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["r", "p", "z"] - ), - ), - ], -) -def test_concat_join_no_overlapping_columns( - pdf1, pdf2, ignore_index, sort, join, axis -): - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - - expected = pd.concat( - [pdf1, pdf2], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = cudf.concat( - [gdf1, gdf2], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - - assert_eq(expected, actual, check_index_type=True) - - -def test_concat_join_no_overlapping_columns_many_and_empty( - ignore_index, sort, join, axis -): - pdf4 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - pdf5 = pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}) - pdf6 = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ) - pdf_empty = pd.DataFrame() - - gdf4 = cudf.from_pandas(pdf4) - gdf5 = cudf.from_pandas(pdf5) - gdf6 = cudf.from_pandas(pdf6) - gdf_empty = cudf.from_pandas(pdf_empty) - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf4, pdf5, pdf6, pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = cudf.concat( - [gdf4, gdf5, gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - assert_eq( - expected, - actual, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["z", "t", "k"] - ), - pd.DataFrame( - {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["z", "t", "k"] - ), - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - }, - index=["z", "t", "k", "a", "b", "c", "d", "e", "f", "g"], - ), - pd.DataFrame(index=pd.Index([], dtype="str")), - ], - [ - pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), - pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}), - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ), - pd.DataFrame(index=pd.Index([], dtype="str")), - ], - pytest.param( - [ - pd.DataFrame( - {"a": [1, 2, 3], "nb": [10, 11, 12]}, index=["Q", "W", "R"] - ), - None, - ], - ), - ], -) -def test_concat_join_no_overlapping_columns_many_and_empty2( - objs, ignore_index, sort, join, axis -): - objs_gd = [cudf.from_pandas(o) if o is not None else o for o in objs] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = cudf.concat( - objs_gd, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - assert_eq(expected, actual, 
check_index_type=False) - - -def test_concat_join_no_overlapping_columns_empty_df_basic( - ignore_index, sort, join, axis -): - pdf6 = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ) - pdf_empty = pd.DataFrame() - - gdf6 = cudf.from_pandas(pdf6) - gdf_empty = cudf.from_pandas(pdf_empty) - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf6, pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = cudf.concat( - [gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - assert_eq( - expected, - actual, - check_index_type=True, - check_column_type=False, - ) - - -def test_concat_join_series(ignore_index, sort, join, axis): - s1 = cudf.Series(["a", "b", "c"]) - s2 = cudf.Series(["a", "b"]) - s3 = cudf.Series(["a", "b", "c", "d"]) - s4 = cudf.Series(dtype="str") - - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - ps3 = s3.to_pandas() - ps4 = s4.to_pandas() - - expected = pd.concat( - [ps1, ps2, ps3, ps4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - with expect_warning_if(axis in {1, "columns"}): - actual = cudf.concat( - [s1, s2, s3, s4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - - assert_eq( - expected, - actual, - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame([], index=[100]), - pd.DataFrame({"cat": pd.Series(["one", "two"], dtype="category")}), - ], -) -@pytest.mark.parametrize( - "other", - [ - [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], - [ - pd.DataFrame( - {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - ], - [ - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"k": [10]}, index=[200]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), - ], - [ - pd.DataFrame([]), - pd.DataFrame([], index=[100]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), - ], - ], -) -def test_concat_join_empty_dataframes( - request, df, other, ignore_index, join, sort -): - axis = 0 - other_pd = [df, *other] - gdf = cudf.from_pandas(df) - other_gd = [gdf] + [cudf.from_pandas(o) for o in other] - - expected = pd.concat( - other_pd, ignore_index=ignore_index, axis=axis, join=join, sort=sort - ) - actual = cudf.concat( - other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort - ) - if ( - join == "outer" - and any( - isinstance(dtype, pd.CategoricalDtype) - for dtype in df.dtypes.tolist() - ) - and any( - isinstance(dtype, pd.CategoricalDtype) - for other_df in other - for dtype in other_df.dtypes.tolist() - ) - ): - request.applymarker( - pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/42840" - ) - ) - assert_eq( - expected, - actual, - check_dtype=False, - check_column_type=False, - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 
11, 20] - ), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"m": [10]}, index=[200]), - pd.DataFrame([], index=[100]), - pd.DataFrame({"cat": pd.Series(["one", "two"], dtype="category")}), - ], -) -@pytest.mark.parametrize( - "other", - [ - [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], - [ - pd.DataFrame( - {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame([[5, 6], [7, 8]], columns=list("CD")), - ], - [ - pd.DataFrame({"g": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"h": [10]}), - pd.DataFrame({"k": [10]}, index=[200]), - pd.DataFrame( - {"dog": pd.Series(["two", "three"], dtype="category")} - ), - ], - [ - pd.DataFrame([]), - pd.DataFrame([], index=[100]), - pd.DataFrame( - {"bird": pd.Series(["two", "three"], dtype="category")} - ), - ], - ], -) -def test_concat_join_empty_dataframes_axis_1( - df, other, ignore_index, axis, join, sort -): - # no duplicate columns - axis = 1 - other_pd = [df, *other] - gdf = cudf.from_pandas(df) - other_gd = [gdf] + [cudf.from_pandas(o) for o in other] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - other_pd, - ignore_index=ignore_index, - axis=axis, - join=join, - sort=sort, - ) - actual = cudf.concat( - other_gd, - ignore_index=ignore_index, - axis=axis, - join=join, - sort=sort, - ) - if expected.shape != df.shape: - if axis == 0: - for key, col in actual[actual.columns].items(): - if isinstance(expected[key].dtype, pd.CategoricalDtype): - expected[key] = expected[key].fillna("-1") - actual[key] = col.astype("str").fillna("-1") - # if not expected.empty: - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_index_type=False - if len(expected) == 0 or actual.empty - else True, - check_column_type=False, - ) - else: - # no need to fill in if axis=1 - assert_eq( - expected, - actual, - check_index_type=False, - check_column_type=False, - ) - assert_eq( - expected, actual, check_index_type=False, check_column_type=False - ) - - -def test_concat_preserve_order(): - """Ensure that order is preserved on 'inner' concatenations.""" - df = pd.DataFrame([["d", 3, 4.0], ["c", 4, 5.0]], columns=["c", "b", "a"]) - dfs = [df, df] - - assert_eq( - pd.concat(dfs, join="inner"), - cudf.concat([cudf.DataFrame(df) for df in dfs], join="inner"), - check_index_type=True, - ) - - -@pytest.mark.parametrize("typ", [cudf.DataFrame, cudf.Series]) -def test_concat_single_object(ignore_index, typ): - """Ensure that concat on a single object does not change it.""" - obj = typ([1, 2, 3]) - assert_eq( - cudf.concat([obj], ignore_index=ignore_index, axis=0), - obj, - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "ltype", - [Decimal64Dtype(3, 1), Decimal64Dtype(7, 2), Decimal64Dtype(8, 4)], -) -@pytest.mark.parametrize( - "rtype", - [ - Decimal64Dtype(3, 2), - Decimal64Dtype(8, 4), - cudf.Decimal128Dtype(3, 2), - cudf.Decimal32Dtype(8, 4), - ], -) -def test_concat_decimal_dataframe(ltype, rtype): - rng = np.random.default_rng(seed=0) - gdf1 = cudf.DataFrame( - {"id": rng.integers(0, 10, 3), "val": ["22.3", "59.5", "81.1"]} - ) - gdf2 = cudf.DataFrame( - {"id": rng.integers(0, 10, 3), "val": ["2.35", "5.59", "8.14"]} - ) - - gdf1["val"] = gdf1["val"].astype(ltype) - gdf2["val"] = gdf2["val"].astype(rtype) - - pdf1 = gdf1.to_pandas() - pdf2 = gdf2.to_pandas() - - got = cudf.concat([gdf1, gdf2]) - 
expected = pd.concat([pdf1, pdf2]) - - assert_eq(expected, got, check_index_type=True) - - -@pytest.mark.parametrize("ltype", [Decimal64Dtype(4, 1), Decimal64Dtype(8, 2)]) -@pytest.mark.parametrize( - "rtype", - [ - Decimal64Dtype(4, 3), - Decimal64Dtype(10, 4), - Decimal32Dtype(8, 3), - Decimal128Dtype(18, 3), - ], -) -def test_concat_decimal_series(ltype, rtype): - gs1 = cudf.Series(["228.3", "559.5", "281.1"]).astype(ltype) - gs2 = cudf.Series(["2.345", "5.259", "8.154"]).astype(rtype) - - ps1 = gs1.to_pandas() - ps2 = gs2.to_pandas() - - got = cudf.concat([gs1, gs2]) - expected = pd.concat([ps1, ps2]) - - assert_eq(expected, got, check_index_type=True) - - -@pytest.mark.parametrize( - "data1, dtype1, index1, data2, dtype2, index2, data3, dtype3, index3, expected_data, expected_dtype, expected_index", - [ - [ - {"val": [Decimal("42.5"), Decimal("8.7")]}, - Decimal64Dtype(5, 2), - None, - {"val": [Decimal("9.23"), Decimal("-67.49")]}, - Decimal64Dtype(6, 4), - None, - {"val": [8, -5]}, - "int32", - None, - { - "val": [ - Decimal("42.5"), - Decimal("8.7"), - Decimal("9.23"), - Decimal("-67.49"), - Decimal("8"), - Decimal("-5"), - ] - }, - Decimal32Dtype(7, 4), - [0, 1, 0, 1, 0, 1], - ], - [ - {"val": [Decimal("95.2"), Decimal("23.4")]}, - Decimal64Dtype(5, 2), - None, - {"val": [54, 509]}, - "uint16", - None, - {"val": [24, -48]}, - "int32", - None, - { - "val": [ - Decimal("95.2"), - Decimal("23.4"), - Decimal("54"), - Decimal("509"), - Decimal("24"), - Decimal("-48"), - ] - }, - Decimal32Dtype(5, 2), - [0, 1, 0, 1, 0, 1], - ], - [ - {"val": [Decimal("36.56"), Decimal("-59.24")]}, - Decimal64Dtype(9, 4), - None, - {"val": [403.21, 45.13]}, - "float32", - None, - {"val": [52.262, -49.25]}, - "float64", - None, - { - "val": [ - Decimal("36.56"), - Decimal("-59.24"), - Decimal("403.21"), - Decimal("45.13"), - Decimal("52.262"), - Decimal("-49.25"), - ] - }, - Decimal32Dtype(9, 4), - [0, 1, 0, 1, 0, 1], - ], - [ - {"val": [Decimal("9563.24"), Decimal("236.633")]}, - Decimal64Dtype(9, 4), - None, - {"val": [5393, -95832]}, - "int64", - None, - {"val": [-29.234, -31.945]}, - "float64", - None, - { - "val": [ - Decimal("9563.24"), - Decimal("236.633"), - Decimal("5393"), - Decimal("-95832"), - Decimal("-29.234"), - Decimal("-31.945"), - ] - }, - Decimal32Dtype(9, 4), - [0, 1, 0, 1, 0, 1], - ], - [ - {"val": [Decimal("95633.24"), Decimal("236.633")]}, - Decimal128Dtype(19, 4), - None, - {"val": [5393, -95832]}, - "int64", - None, - {"val": [-29.234, -31.945]}, - "float64", - None, - { - "val": [ - Decimal("95633.24"), - Decimal("236.633"), - Decimal("5393"), - Decimal("-95832"), - Decimal("-29.234"), - Decimal("-31.945"), - ] - }, - Decimal128Dtype(19, 4), - [0, 1, 0, 1, 0, 1], - ], - ], -) -def test_concat_decimal_numeric_dataframe( - data1, - dtype1, - index1, - data2, - dtype2, - index2, - data3, - dtype3, - index3, - expected_data, - expected_dtype, - expected_index, -): - df1 = cudf.DataFrame(data1, dtype=dtype1, index=index1) - df2 = cudf.DataFrame(data2, dtype=dtype2, index=index2) - df3 = cudf.DataFrame(data3, dtype=dtype3, index=index3) - expected = cudf.DataFrame( - expected_data, dtype=expected_dtype, index=expected_index - ) - df = cudf.concat([df1, df2, df3]) - assert_eq(df, expected, check_index_type=True) - assert_eq(df.val.dtype, expected.val.dtype) - - -@pytest.mark.parametrize( - "data1, dtype1, index1, data2, dtype2, index2, data3, dtype3, index3, expected_data, expected_dtype, expected_index", - [ - [ - [Decimal("32.8"), Decimal("-87.7")], - Decimal64Dtype(6, 2), - None, - 
[Decimal("101.243"), Decimal("-92.449")], - Decimal64Dtype(9, 6), - None, - [94, -22], - "int32", - None, - [ - Decimal("32.8"), - Decimal("-87.7"), - Decimal("101.243"), - Decimal("-92.449"), - Decimal("94"), - Decimal("-22"), - ], - Decimal64Dtype(10, 6), - [0, 1, 0, 1, 0, 1], - ], - [ - [Decimal("7.2"), Decimal("122.1")], - Decimal64Dtype(5, 2), - None, - [33, 984], - "uint32", - None, - [593, -702], - "int32", - None, - [ - Decimal("7.2"), - Decimal("122.1"), - Decimal("33"), - Decimal("984"), - Decimal("593"), - Decimal("-702"), - ], - Decimal32Dtype(5, 2), - [0, 1, 0, 1, 0, 1], - ], - [ - [Decimal("982.94"), Decimal("-493.626")], - Decimal64Dtype(9, 4), - None, - [847.98, 254.442], - "float32", - None, - [5299.262, -2049.25], - "float64", - None, - [ - Decimal("982.94"), - Decimal("-493.626"), - Decimal("847.98"), - Decimal("254.442"), - Decimal("5299.262"), - Decimal("-2049.25"), - ], - Decimal32Dtype(9, 4), - [0, 1, 0, 1, 0, 1], - ], - [ - [Decimal("492.204"), Decimal("-72824.455")], - Decimal64Dtype(9, 4), - None, - [8438, -27462], - "int64", - None, - [-40.292, 49202.953], - "float64", - None, - [ - Decimal("492.204"), - Decimal("-72824.455"), - Decimal("8438"), - Decimal("-27462"), - Decimal("-40.292"), - Decimal("49202.953"), - ], - Decimal32Dtype(9, 4), - [0, 1, 0, 1, 0, 1], - ], - [ - [Decimal("492.204"), Decimal("-72824.455")], - Decimal64Dtype(10, 4), - None, - [Decimal("8438"), Decimal("-27462")], - Decimal32Dtype(9, 4), - None, - [Decimal("-40.292"), Decimal("49202.953")], - Decimal128Dtype(19, 4), - None, - [ - Decimal("492.204"), - Decimal("-72824.455"), - Decimal("8438"), - Decimal("-27462"), - Decimal("-40.292"), - Decimal("49202.953"), - ], - Decimal128Dtype(19, 4), - [0, 1, 0, 1, 0, 1], - ], - ], -) -def test_concat_decimal_numeric_series( - data1, - dtype1, - index1, - data2, - dtype2, - index2, - data3, - dtype3, - index3, - expected_data, - expected_dtype, - expected_index, -): - s1 = cudf.Series(data1, dtype=dtype1, index=index1) - s2 = cudf.Series(data2, dtype=dtype2, index=index2) - s3 = cudf.Series(data3, dtype=dtype3, index=index3) - expected = cudf.Series( - expected_data, dtype=expected_dtype, index=expected_index - ) - s = cudf.concat([s1, s2, s3]) - assert_eq(s, expected, check_index_type=True) - - -@pytest.mark.parametrize( - "data1, dtype1, index1, data2, dtype2, index2, expected_data, expected_dtype, expected_index", - [ - [ - [Decimal("955.22"), Decimal("8.2")], - Decimal64Dtype(5, 2), - None, - ["2007-06-12", "2006-03-14"], - "datetime64[s]", - None, - [ - "955.22", - "8.20", - "2007-06-12 00:00:00", - "2006-03-14 00:00:00", - ], - None, - [0, 1, 0, 1], - ], - [ - [Decimal("-52.44"), Decimal("365.22")], - Decimal64Dtype(5, 2), - None, - np.arange( - "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" - ).astype("datetime64[s]"), - "datetime64[s]", - None, - [ - "-52.44", - "365.22", - "2005-02-01 12:00:00", - "2005-02-01 13:00:00", - "2005-02-01 14:00:00", - ], - None, - [0, 1, 0, 1, 2], - ], - [ - [Decimal("753.0"), Decimal("94.22")], - Decimal64Dtype(5, 2), - None, - [np.timedelta64(111, "s"), np.timedelta64(509, "s")], - None, - None, - [ - "753.00", - "94.22", - "0 days 00:01:51", - "0 days 00:08:29", - ], - None, - [0, 1, 0, 1], - ], - [ - [Decimal("753.0"), Decimal("94.22")], - Decimal64Dtype(5, 2), - None, - [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")], - None, - None, - [ - "753.00", - "94.22", - "10 days 21:10:52", - "8 days 18:39:45", - ], - None, - [0, 1, 0, 1], - ], - ], -) -def test_concat_decimal_non_numeric( - data1, 
- dtype1, - index1, - data2, - dtype2, - index2, - expected_data, - expected_dtype, - expected_index, -): - s1 = cudf.Series(data1, dtype=dtype1, index=index1) - s2 = cudf.Series(data2, dtype=dtype2, index=index2) - expected = cudf.Series( - expected_data, dtype=expected_dtype, index=expected_index - ) - s = cudf.concat([s1, s2]) - assert_eq(s, expected, check_index_type=True) - - -def test_concat_struct_column(): - s1 = cudf.Series([{"a": 5}, {"c": "hello"}, {"b": 7}]) - s2 = cudf.Series([{"a": 5, "c": "hello", "b": 7}]) - expected = cudf.Series( - [ - {"a": 5, "b": None, "c": None}, - {"a": None, "b": None, "c": "hello"}, - {"a": None, "b": 7, "c": None}, - {"a": 5, "b": 7, "c": "hello"}, - ], - index=[0, 1, 2, 0], - ) - s = cudf.concat([s1, s2]) - assert_eq(s, expected, check_index_type=True) - - -@pytest.mark.parametrize( - "frame1_cls, frame1_data, frame2_cls, frame2_data, expected_cls, expected_data", - [ - ( - cudf.Series, - {"data": [[{"b": 0}], [{"b": 1}], [{"b": 3}]]}, - cudf.Series, - {"data": [[{"b": 10}], [{"b": 12}], None]}, - cudf.Series, - { - "data": [ - [{"b": 0}], - [{"b": 1}], - [{"b": 3}], - [{"b": 10}], - [{"b": 12}], - None, - ], - "index": [0, 1, 2, 0, 1, 2], - }, - ), - ( - cudf.DataFrame, - {"data": {"a": [[{"b": 0}], [{"b": 1}], [{"b": 3}]]}}, - cudf.DataFrame, - {"data": {"a": [[{"b": 10}], [{"b": 12}], None]}}, - cudf.DataFrame, - { - "data": { - "a": [ - [{"b": 0}], - [{"b": 1}], - [{"b": 3}], - [{"b": 10}], - [{"b": 12}], - None, - ] - }, - "index": [0, 1, 2, 0, 1, 2], - }, - ), - ], -) -def test_concat_list_column( - frame1_cls, - frame1_data, - frame2_cls, - frame2_data, - expected_cls, - expected_data, -): - frame1 = frame1_cls(**frame1_data) - frame2 = frame2_cls(**frame2_data) - expected = expected_cls(**expected_data) - actual = cudf.concat([frame1, frame2]) - assert_eq(actual, expected, check_index_type=True) - - -def test_concat_categorical_ordering(): - # https://github.com/rapidsai/cudf/issues/11486 - sr = pd.Series( - ["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category" - ) - sr = sr.cat.set_categories(["d", "a", "b", "c", "e"]) - - df = pd.DataFrame({"a": sr}) - gdf = cudf.from_pandas(df) - - expect = pd.concat([df, df, df]) - got = cudf.concat([gdf, gdf, gdf]) - - assert_eq(expect, got) - - -@pytest.fixture(params=["rangeindex", "index"]) -def singleton_concat_index(request): - if request.param == "rangeindex": - return pd.RangeIndex(0, 4) - else: - return pd.Index(["a", "h", "g", "f"]) - - -@pytest.fixture(params=["dataframe", "series"]) -def singleton_concat_obj(request, singleton_concat_index): - if request.param == "dataframe": - return pd.DataFrame( - { - "b": [1, 2, 3, 4], - "d": [7, 8, 9, 10], - "a": [4, 5, 6, 7], - "c": [10, 11, 12, 13], - }, - index=singleton_concat_index, - ) - else: - return pd.Series([4, 5, 5, 6], index=singleton_concat_index) - - -def test_concat_singleton_sorting( - axis, sort, ignore_index, singleton_concat_obj -): - gobj = cudf.from_pandas(singleton_concat_obj) - gconcat = cudf.concat( - [gobj], axis=axis, sort=sort, ignore_index=ignore_index - ) - pconcat = pd.concat( - [singleton_concat_obj], axis=axis, sort=sort, ignore_index=ignore_index - ) - assert_eq(pconcat, gconcat) - - -@pytest.mark.parametrize("axis", [2, "invalid"]) -def test_concat_invalid_axis(axis): - s = cudf.Series([1, 2, 3]) - with pytest.raises(ValueError): - cudf.concat([s], axis=axis) - - -@pytest.mark.parametrize( - "s1,s2", - [ - ([1, 2], [[1, 2], [3, 4]]), - ], -) -def test_concat_mixed_list_types_error(s1, s2): - s1, s2 = 
cudf.Series(s1), cudf.Series(s2) - - with pytest.raises(NotImplementedError): - cudf.concat([s1, s2], ignore_index=True) - - -@pytest.mark.parametrize( - "axis", - [ - pytest.param( - 0, - marks=pytest.mark.xfail( - reason="concat dictionaries with axis=0 not implemented" - ), - ), - 1, - "columns", - ], -) -@pytest.mark.parametrize( - "d", - [ - {"first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}})}, - { - "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), - "second": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}), - "third": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), - }, - { - "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), - "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), - "third": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}), - }, - { - "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), - "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), - "third": (cudf.DataFrame, {"data": {"A": [5, 6], "C": [7, 8]}}), - "fourth": (cudf.DataFrame, {"data": {"B": [9, 10]}}), - }, - pytest.param( - { - "first": (cudf.DataFrame, {"data": {2.0: [1, 1]}}), - "second": (cudf.DataFrame, {"data": {"test": ["abc", "def"]}}), - }, - marks=pytest.mark.xfail( - reason=( - "Cannot construct a MultiIndex column with multiple " - "label types in cuDF at this time. You must convert " - "the labels to the same type." - ) - ), - ), - { - "first": (cudf.Series, {"data": [1, 2, 3]}), - "second": (cudf.Series, {"data": [4, 5, 6]}), - }, - { - "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), - "second": (cudf.Series, {"data": [5, 6], "name": "C"}), - }, - pytest.param( - { - "first": ( - cudf.DataFrame, - {"data": {("A", "B"): [1, 2], "C": [3, 4]}}, - ), - "second": ( - cudf.DataFrame, - {"data": {"D": [5, 6], ("A", "B"): [7, 8]}}, - ), - }, - marks=pytest.mark.xfail( - reason=( - "Cannot construct a MultiIndex column with multiple " - "label types in cuDF at this time. You must convert " - "the labels to the same type." - ) - ), - ), - pytest.param( - { - "first": ( - cudf.DataFrame, - {"data": {("A", "B"): [3, 4], 2.0: [1, 1]}}, - ), - "second": ( - cudf.DataFrame, - {"data": {("C", "D"): [3, 4], 3.0: [5, 6]}}, - ), - }, - marks=pytest.mark.xfail( - reason=( - "Cannot construct a MultiIndex column with multiple " - "label types in cuDF at this time. You must convert " - "the labels to the same type." 
- ) - ), - ), - { - "first": ( - cudf.DataFrame, - {"data": {(1, 2): [1, 2], (3, 4): [3, 4]}}, - ), - "second": ( - cudf.DataFrame, - {"data": {(1, 2): [5, 6], (5, 6): [7, 8]}}, - ), - }, - ], -) -def test_concat_dictionary(d, axis): - _dict = {k: c(**v) for k, (c, v) in d.items()} - result = cudf.concat(_dict, axis=axis) - expected = cudf.from_pandas( - pd.concat({k: df.to_pandas() for k, df in _dict.items()}, axis=axis) - ) - assert_eq(expected, result) - - -@pytest.mark.parametrize( - "idx_cls, idx_data", - [ - [cudf.Index, {"data": [1, 2, 3]}], - [ - cudf.MultiIndex, - { - "levels": [[1, 2], ["blue", "red"]], - "codes": [[0, 0, 1, 1], [1, 0, 1, 0]], - }, - ], - [cudf.CategoricalIndex, {"data": [1, 2, 3]}], - ], -) -def test_concat_dict_incorrect_type_index(idx_cls, idx_data): - idx = idx_cls(**idx_data) - with pytest.raises( - TypeError, - match="cannot concatenate a dictionary containing indices", - ): - cudf.concat({"first": idx}, axis=1) From d71e00debdcd91e0eb7a5d258a84fb09c0d49cbc Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Sat, 16 Aug 2025 02:05:31 +0100 Subject: [PATCH 144/366] [FEA] Switch to NVIDIA's JITIFY2 (#19561) This merge request switches CUDF's JITIFY2 upstream from the RAPIDSAI fork (https://github.com/rapidsai/jitify/tree/jitify2) to the NVIDIA fork (https://github.com/NVIDIA/jitify/tree/jitify2). The NVIDIA fork has newer features & improvements, including NVTX ranges, which are needed to properly benchmark each compilation/cache step of a specific JIT workload. This was previously impossible to track accurately and concisely. Follows up: https://github.com/rapidsai/cudf/issues/18023 Authors: - Basit Ayantunde (https://github.com/lamarrr) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19561 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 1 + conda/environments/all_cuda-129_arch-x86_64.yaml | 1 + conda/recipes/libcudf/recipe.yaml | 1 + cpp/cmake/Modules/JitifyPreprocessKernels.cmake | 5 +++-- cpp/cmake/thirdparty/get_jitify.cmake | 6 +++--- cpp/src/binaryop/jit/kernel.cu | 2 +- cpp/src/jit/accessors.cuh | 4 ++-- cpp/src/jit/cache.hpp | 2 ++ cpp/src/jit/helpers.hpp | 3 +-- cpp/src/rolling/jit/kernel.cu | 6 +++--- cpp/src/rolling/jit/operation.hpp | 6 +++--- cpp/src/stream_compaction/filter/filter.cu | 11 +++++------ cpp/src/stream_compaction/filter/jit/kernel.cu | 6 +++--- cpp/src/transform/jit/kernel.cu | 6 +++--- cpp/src/transform/transform.cpp | 15 +++++++-------- dependencies.yaml | 1 + 16 files changed, 40 insertions(+), 36 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index b53ce1f224e..ce67a5abbca 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -41,6 +41,7 @@ dependencies: - libcurand-dev - libkvikio==25.10.*,>=0.0.0a0 - libnvcomp-dev==4.2.0.11 +- libnvjitlink-dev - librdkafka>=2.8.0,<2.9.0a0 - librmm==25.10.*,>=0.0.0a0 - make diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 4d1af2746ac..04178f88b83 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -42,6 +42,7 @@ dependencies: - libcurand-dev - libkvikio==25.10.*,>=0.0.0a0 - libnvcomp-dev==4.2.0.11 +- 
libnvjitlink-dev - librdkafka>=2.8.0,<2.9.0a0 - librmm==25.10.*,>=0.0.0a0 - make diff --git a/conda/recipes/libcudf/recipe.yaml b/conda/recipes/libcudf/recipe.yaml index 814a304ebef..3e97dc84a7d 100644 --- a/conda/recipes/libcudf/recipe.yaml +++ b/conda/recipes/libcudf/recipe.yaml @@ -68,6 +68,7 @@ cache: - cuda-nvrtc-dev - cuda-nvtx-dev - libcurand-dev + - libnvjitlink-dev - if: linux and x86_64 then: - libcufile-dev diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 388c748c694..7c27920ee28 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -33,6 +33,8 @@ function(jit_preprocess_files) get_filename_component(jit_output_directory "${ARG_OUTPUT}" DIRECTORY) list(APPEND JIT_PREPROCESSED_FILES "${ARG_OUTPUT}") + get_filename_component(ARG_OUTPUT_DIR "${ARG_OUTPUT}" DIRECTORY) + # Note: need to pass _FILE_OFFSET_BITS=64 in COMMAND due to a limitation in how conda builds # glibc add_custom_command( @@ -43,8 +45,7 @@ function(jit_preprocess_files) COMMAND ${CMAKE_COMMAND} -E make_directory "${jit_output_directory}" COMMAND "${CMAKE_COMMAND}" -E env LD_LIBRARY_PATH=${CUDAToolkit_LIBRARY_DIR} - $ ${ARG_FILE} -o - ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -std=c++20 + $ ${ARG_FILE} -o ${ARG_OUTPUT_DIR} -i -std=c++20 -remove-unused-globals -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -DCUDF_RUNTIME_JIT -I${CUDF_SOURCE_DIR}/include -I${CUDF_SOURCE_DIR}/src ${includes} --no-preinclude-workarounds --no-replace-pragma-once diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index d98abdf8824..b6f11e30d28 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -18,8 +18,8 @@ function(find_and_configure_jitify) rapids_cpm_find( jitify 2.0.0 - GIT_REPOSITORY https://github.com/rapidsai/jitify.git - GIT_TAG jitify2 + GIT_REPOSITORY https://github.com/NVIDIA/jitify.git + GIT_TAG 70783a3ad7b0cad2992a26a1ebf8fbe3d6b44e25 # jitify2 branch as of 5th Aug 2025 GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE ) diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu index dc021ea99a6..2951178af97 100644 --- a/cpp/src/binaryop/jit/kernel.cu +++ b/cpp/src/binaryop/jit/kernel.cu @@ -28,7 +28,7 @@ #include // clang-format off -#include "binaryop/jit/operation-udf.hpp" +#include // clang-format on namespace cudf { diff --git a/cpp/src/jit/accessors.cuh b/cpp/src/jit/accessors.cuh index 705028ddedd..adfb9ea7328 100644 --- a/cpp/src/jit/accessors.cuh +++ b/cpp/src/jit/accessors.cuh @@ -15,13 +15,13 @@ */ #pragma once -#include "jit/span.cuh" - #include #include #include +#include + #include namespace cudf { diff --git a/cpp/src/jit/cache.hpp b/cpp/src/jit/cache.hpp index 1772134bb90..00ad23ace5a 100644 --- a/cpp/src/jit/cache.hpp +++ b/cpp/src/jit/cache.hpp @@ -15,6 +15,8 @@ */ #pragma once +#pragma GCC diagnostic ignored "-Wignored-attributes" // Work-around for JITIFY2's false-positive + // warnings when compiled with GCC13 #include diff --git a/cpp/src/jit/helpers.hpp b/cpp/src/jit/helpers.hpp index 5b2427575b0..65e28e9c39a 100644 --- a/cpp/src/jit/helpers.hpp +++ b/cpp/src/jit/helpers.hpp @@ -15,12 +15,11 @@ */ #pragma once -#include "jit/span.cuh" - #include #include #include +#include #include namespace cudf { diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu index 12219a286f7..9b4f11ebdb6 100644 --- a/cpp/src/rolling/jit/kernel.cu +++ b/cpp/src/rolling/jit/kernel.cu @@ -14,13 +14,13 @@ * limitations under the License. */ -#include "rolling/detail/rolling_jit.hpp" -#include "rolling/jit/operation.hpp" - #include #include #include +#include +#include + namespace cudf { namespace rolling { namespace jit { diff --git a/cpp/src/rolling/jit/operation.hpp b/cpp/src/rolling/jit/operation.hpp index 3be739ec5bf..51cd0a6b121 100644 --- a/cpp/src/rolling/jit/operation.hpp +++ b/cpp/src/rolling/jit/operation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,10 @@ #pragma once -#include "rolling/jit/operation-udf.hpp" - #include +#include + struct rolling_udf_ptx { template static OutType operate(InType const* in_col, cudf::size_type start, cudf::size_type count) diff --git a/cpp/src/stream_compaction/filter/filter.cu b/cpp/src/stream_compaction/filter/filter.cu index 9369eee30d4..a518b566c97 100644 --- a/cpp/src/stream_compaction/filter/filter.cu +++ b/cpp/src/stream_compaction/filter/filter.cu @@ -14,11 +14,6 @@ * limitations under the License. 
*/
-#include "jit/cache.hpp"
-#include "jit/helpers.hpp"
-#include "jit/parser.hpp"
-#include "jit/span.cuh"
-
 #include
 #include
 #include
@@ -37,6 +32,10 @@
 #include
 #include
+#include
+#include
+#include
+#include
 #include
 #include
@@ -175,7 +174,7 @@ void launch_filter_kernel(jitify2::ConfiguredKernel& kernel,
 std::array args{&outputs_ptr, &inputs_ptr, &p_user_data};
- kernel->launch(args.data());
+ kernel->launch_raw(args.data());
 }
 void perform_checks(column_view base_column,
diff --git a/cpp/src/stream_compaction/filter/jit/kernel.cu b/cpp/src/stream_compaction/filter/jit/kernel.cu
index 524c389aaae..07aeebee688 100644
--- a/cpp/src/stream_compaction/filter/jit/kernel.cu
+++ b/cpp/src/stream_compaction/filter/jit/kernel.cu
@@ -14,9 +14,6 @@
 * limitations under the License.
 */
-#include "jit/accessors.cuh"
-#include "jit/span.cuh"
-
 #include
 #include
 #include
@@ -26,6 +23,9 @@
 #include
+#include
+#include
+
 // clang-format off
 // This header is an inlined header that defines the GENERIC_FILTER_OP function. It is placed here
 // so the symbols in the headers above can be used by it.
diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu
index 5c8068f0bc7..b3647e374c5 100644
--- a/cpp/src/transform/jit/kernel.cu
+++ b/cpp/src/transform/jit/kernel.cu
@@ -14,9 +14,6 @@
 * limitations under the License.
 */
-#include "jit/accessors.cuh"
-#include "jit/span.cuh"
-
 #include
 #include
 #include
@@ -26,6 +23,9 @@
 #include
+#include
+#include
+
 // clang-format off
 // This header is an inlined header that defines the GENERIC_FILTER_OP function. It is placed here
 // so the symbols in the headers above can be used by it.
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 9c7fc4772b4..ad3c6bf15c6 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -14,12 +14,6 @@
 * limitations under the License.
 */
-#include "jit/cache.hpp"
-#include "jit/helpers.hpp"
-#include "jit/parser.hpp"
-#include "jit/span.cuh"
-#include "jit/util.hpp"
-
 #include
 #include
 #include
@@ -32,6 +26,11 @@
 #include
+#include
+#include
+#include
+#include
+#include
 #include
 namespace cudf {
@@ -125,7 +124,7 @@ void launch_column_output_kernel(jitify2::ConfiguredKernel& kernel,
 std::array args{&outputs_ptr, &inputs_ptr, &p_user_data};
- kernel->launch(args.data());
+ kernel->launch_raw(args.data());
 }
 template
@@ -152,7 +151,7 @@ void launch_span_kernel(jitify2::ConfiguredKernel& kernel,
 std::array args{&outputs_ptr, &inputs_ptr, &p_user_data};
- kernel->launch(args.data());
+ kernel->launch_raw(args.data());
 }
 std::tuple make_transform_null_mask(
diff --git a/dependencies.yaml b/dependencies.yaml
index 37883b03b4f..8411fa111a7 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -569,6 +569,7 @@ dependencies:
 - cuda-nvrtc-dev
 - cuda-nvtx-dev
 - libcurand-dev
+ - libnvjitlink-dev
 - output_types: conda
 matrices:
 - matrix:

From 063697cc47f5a6427bd402e7f10d0dd2de346772 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Sun, 17 Aug 2025 13:34:35 -0400
Subject: [PATCH 145/366] Run cudf-polars tests with all supported polars versions (#19353)

Runs the cudf-polars tests against all supported polars minor versions. When a minor version has multiple patch releases, we test against the latest patch release of that minor version.
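For example, with the supported range `polars>=1.28,<1.32`, the new
`ci/utils/fetch_polars_versions.py` helper emits one entry per supported
minor version. The patch numbers below are illustrative only, since the
real list depends on what is published to PyPI at run time:

```shell
$ python ci/utils/fetch_polars_versions.py --latest-patch-only dependencies.yaml
1.28.1 1.29.0 1.30.0 1.31.0
```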
- Needs #19352 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19353 --- ci/test_wheel_cudf_polars.sh | 48 ++++++++-- ci/utils/fetch_polars_versions.py | 93 +++++++++++++++++++ dependencies.yaml | 1 + python/cudf_polars/pyproject.toml | 1 + .../tests/expressions/test_stringfunction.py | 8 +- 5 files changed, 141 insertions(+), 10 deletions(-) create mode 100644 ci/utils/fetch_polars_versions.py diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 5ec9558e0de..b2683eeadbc 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -35,6 +35,8 @@ rapids-pip-retry install \ rapids-logger "Run cudf_polars tests" +POLARS_VERSIONS=$(python ci/utils/fetch_polars_versions.py --latest-patch-only dependencies.yaml) + # shellcheck disable=SC2317 function set_exitcode() { @@ -44,16 +46,50 @@ EXITCODE=0 trap set_exitcode ERR set +e -./ci/run_cudf_polars_pytests.sh \ - --cov=cudf_polars \ - --cov-fail-under=100 \ - --cov-report=term-missing:skip-covered \ - --cov-config=./pyproject.toml \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" +PASSED=() +FAILED=() + +read -r -a VERSIONS <<< "${POLARS_VERSIONS}" +LATEST_VERSION="${VERSIONS[-1]}" + +for version in "${VERSIONS[@]}"; do + rapids-logger "Installing polars==${version}" + pip install -U "polars==${version}" + + rapids-logger "Running tests for polars==${version}" + + if [ "${version}" == "${LATEST_VERSION}" ]; then + COVERAGE_ARGS=( + --cov=cudf_polars + --cov-fail-under=100 + --cov-report=term-missing:skip-covered + --cov-config=./pyproject.toml + ) + else + COVERAGE_ARGS=(--no-cov) + fi + + ./ci/run_cudf_polars_pytests.sh \ + "${COVERAGE_ARGS[@]}" \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars-${version}.xml" + + if [ $? -ne 0 ]; then + EXITCODE=1 + FAILED+=("${version}") + rapids-logger "Tests failed for polars==${version}" + else + PASSED+=("${version}") + rapids-logger "Tests passed for polars==${version}" + fi +done trap ERR set -e +rapids-logger "Polars test summary:" +rapids-logger "PASSED: ${PASSED[*]:-none}" +rapids-logger "FAILED: ${FAILED[*]:-none}" + if [ ${EXITCODE} != 0 ]; then rapids-logger "Testing FAILED: exitcode ${EXITCODE}" else diff --git a/ci/utils/fetch_polars_versions.py b/ci/utils/fetch_polars_versions.py new file mode 100644 index 00000000000..643081c5642 --- /dev/null +++ b/ci/utils/fetch_polars_versions.py @@ -0,0 +1,93 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import argparse +import json +import ssl +import urllib.request + +import certifi +import yaml +from packaging.specifiers import SpecifierSet +from packaging.version import Version + + +def get_polars_specifier(deps_yaml_path): + with open(deps_yaml_path, "r") as f: + deps = yaml.safe_load(f) + + try: + includes = deps["files"]["all"]["includes"] + if "run_cudf_polars" not in includes: + raise KeyError() + except KeyError: + raise RuntimeError("run_cudf_polars not found in dependencies.yaml") + + try: + pkgs = deps["dependencies"]["run_cudf_polars"]["common"] + for entry in pkgs: + for pkg in entry.get("packages", []): + if isinstance(pkg, str) and pkg.startswith("polars"): + spec = pkg.removeprefix("polars").strip() + if spec: + return spec + except KeyError: + pass + + raise RuntimeError("Polars specifier not found in dependencies.yaml") + + +def get_latest_versions_per_minor(versions): + latest = {} + for v in versions: + key = (v.major, v.minor) + if key not in latest or v > latest[key]: + latest[key] = v + return sorted(latest.values()) + + +def get_polars_versions(polars_range, latest_only=False): + url = "https://pypi.org/pypi/polars/json" + # Set a timeout for the request to avoid hanging + timeout = 10 # seconds + + try: + context = ssl.create_default_context(cafile=certifi.where()) + with urllib.request.urlopen( + url, timeout=timeout, context=context + ) as response: + data = json.loads(response.read()) + except Exception as e: + raise RuntimeError(f"Failed to fetch polars metadata from PyPI: {e}") + + all_versions = [Version(v) for v in data["releases"]] + specifier = SpecifierSet(polars_range) + matching = [v for v in all_versions if v in specifier] + + if latest_only: + matching = get_latest_versions_per_minor(matching) + + return [str(v) for v in sorted(matching)] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Filter polars versions by dependencies.yaml." 
+ ) + parser.add_argument( + "deps_yaml", + nargs="?", + default="./dependencies.yaml", + help="Path to dependencies.yaml", + ) + parser.add_argument( + "--latest-patch-only", + action="store_true", + help="Return only the latest patch per minor version", + ) + args = parser.parse_args() + + polars_range = get_polars_specifier(args.deps_yaml) + versions = get_polars_versions( + polars_range, latest_only=args.latest_patch_only + ) + print(" ".join(versions)) diff --git a/dependencies.yaml b/dependencies.yaml index 8411fa111a7..de83eae1276 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -726,6 +726,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - nvidia-ml-py + - packaging - polars>=1.28,<1.32 specific: - output_types: [requirements, pyproject] diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 472520df984..5fadd6b8656 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -20,6 +20,7 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "nvidia-ml-py", + "packaging", "polars>=1.28,<1.32", "pylibcudf==25.10.*,>=0.0.0a0", "typing-extensions; python_version < '3.11'", diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 9923f031300..3f1874df702 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -543,10 +543,10 @@ def test_string_zfill(fill, input_strings): "fill", [ 5 - if not POLARS_VERSION_LT_130 + if not POLARS_VERSION_LT_131 else pytest.param(5, marks=pytest.mark.xfail(reason="fixed in Polars 1.30")), 999 - if not POLARS_VERSION_LT_130 + if not POLARS_VERSION_LT_131 else pytest.param(999, marks=pytest.mark.xfail(reason="fixed in Polars 1.30")), ], ) @@ -563,10 +563,10 @@ def test_string_zfill_pl_129(fill): 1, 2, 5 - if not POLARS_VERSION_LT_130 + if not POLARS_VERSION_LT_131 else pytest.param(5, marks=pytest.mark.xfail(reason="fixed in Polars 1.30")), 999 - if not POLARS_VERSION_LT_130 + if not POLARS_VERSION_LT_131 else pytest.param(999, marks=pytest.mark.xfail(reason="fixed in Polars 1.30")), -1, pytest.param(None, marks=pytest.mark.xfail(reason="None dtype")), From 1bb30c5251522d4c042cc1cd6a41cb4fc7cf7353 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 18 Aug 2025 07:33:35 -0500 Subject: [PATCH 146/366] RapidsMPF "single" shuffle integration (#19530) Depends on https://github.com/rapidsai/rapidsmpf/pull/380 Enables the `"rapidsmpf"` shuffle options to work with cudf-polars when the `"synchronous"` scheduler is active. 
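For illustration, the new path can be requested through the regular
engine options, mirroring the unit tests added in this PR (the frame
contents below are placeholders):

```python
import polars as pl

engine = pl.GPUEngine(
    executor="streaming",
    executor_options={
        # "rapidsmpf" is translated to the private "rapidsmpf-single"
        # shuffle method when the synchronous scheduler is active.
        "shuffle_method": "rapidsmpf",
        "scheduler": "synchronous",
    },
)
left = pl.LazyFrame({"x": range(6), "y": [1, 2, 3] * 2})
right = pl.LazyFrame({"xx": range(6), "y": [2, 4, 3] * 2})
result = left.join(right, on="y", how="inner").collect(engine=engine)
```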
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19530 --- .../cudf_polars/experimental/shuffle.py | 30 ++++++-- .../cudf_polars/cudf_polars/utils/config.py | 61 ++++++++++++---- .../tests/experimental/test_rapidsmpf.py | 69 +++++++++++++++++++ python/cudf_polars/tests/test_config.py | 65 +++++++++++------ 4 files changed, 183 insertions(+), 42 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/shuffle.py b/python/cudf_polars/cudf_polars/experimental/shuffle.py index b6ecfc35fa8..9a4e63e4b76 100644 --- a/python/cudf_polars/cudf_polars/experimental/shuffle.py +++ b/python/cudf_polars/cudf_polars/experimental/shuffle.py @@ -5,7 +5,7 @@ from __future__ import annotations import operator -from typing import TYPE_CHECKING, Any, TypedDict +from typing import TYPE_CHECKING, Any, Literal, TypedDict import pylibcudf as plc from rmm.pylibrmm.stream import DEFAULT_STREAM @@ -39,6 +39,7 @@ class ShuffleOptions(TypedDict): on: Sequence[str] column_names: Sequence[str] dtypes: Sequence[DataType] + cluster_kind: str # Experimental rapidsmpf shuffler integration @@ -57,7 +58,12 @@ def insert_partition( ) -> None: """Add cudf-polars DataFrame chunks to an RMP shuffler.""" from rapidsmpf.integrations.cudf.partition import partition_and_pack - from rapidsmpf.integrations.dask.core import get_worker_context + + if options["cluster_kind"] == "dask": + from rapidsmpf.integrations.dask import get_worker_context + + else: + from rapidsmpf.integrations.single import get_worker_context context = get_worker_context() @@ -90,7 +96,12 @@ def extract_partition( unpack_and_concat, unspill_partitions, ) - from rapidsmpf.integrations.dask.core import get_worker_context + + if options["cluster_kind"] == "dask": + from rapidsmpf.integrations.dask import get_worker_context + + else: + from rapidsmpf.integrations.single import get_worker_context context = get_worker_context() if context.br is None: # pragma: no cover @@ -286,10 +297,18 @@ def _( # Try using rapidsmpf shuffler if we have "simple" shuffle # keys, and the "shuffle_method" config is set to "rapidsmpf" _keys: list[Col] - if shuffle_method == "rapidsmpf" and len( + if shuffle_method in ("rapidsmpf", "rapidsmpf-single") and len( _keys := [ne.value for ne in ir.keys if isinstance(ne.value, Col)] ) == len(ir.keys): # pragma: no cover - from rapidsmpf.integrations.dask import rapidsmpf_shuffle_graph + cluster_kind: Literal["dask", "single"] + if shuffle_method == "rapidsmpf-single": + from rapidsmpf.integrations.single import rapidsmpf_shuffle_graph + + cluster_kind = "single" + else: + from rapidsmpf.integrations.dask import rapidsmpf_shuffle_graph + + cluster_kind = "dask" shuffle_on = [k.name for k in _keys] @@ -304,6 +323,7 @@ def _( "on": shuffle_on, "column_names": list(ir.schema.keys()), "dtypes": list(ir.schema.values()), + "cluster_kind": cluster_kind, }, ) except ValueError as err: diff --git a/python/cudf_polars/cudf_polars/utils/config.py b/python/cudf_polars/cudf_polars/utils/config.py index 578dfd0694e..e65cf877b4e 100644 --- a/python/cudf_polars/cudf_polars/utils/config.py +++ b/python/cudf_polars/cudf_polars/utils/config.py @@ -82,8 +82,17 @@ def get_total_device_memory() -> int | None: @functools.cache -def rapidsmpf_available() -> bool: # pragma: no cover - """Query whether rapidsmpf is available as a shuffle method.""" 
+def rapidsmpf_single_available() -> bool: # pragma: no cover + """Query whether rapidsmpf is available as a single-process shuffle method.""" + try: + return importlib.util.find_spec("rapidsmpf.integrations.single") is not None + except (ImportError, ValueError): + return False + + +@functools.cache +def rapidsmpf_distributed_available() -> bool: # pragma: no cover + """Query whether rapidsmpf is available as a distributed shuffle method.""" try: return importlib.util.find_spec("rapidsmpf.integrations.dask") is not None except (ImportError, ValueError): @@ -129,15 +138,21 @@ class ShuffleMethod(str, enum.Enum): The method to use for shuffling data between workers with the streaming executor. * ``ShuffleMethod.TASKS`` : Use the task-based shuffler. - * ``ShuffleMethod.RAPIDSMPF`` : Use the rapidsmpf scheduler. + * ``ShuffleMethod.RAPIDSMPF`` : Use the rapidsmpf shuffler. + * ``ShuffleMethod._RAPIDSMPF_SINGLE`` : Use the single-process rapidsmpf shuffler. - With :class:`cudf_polars.utils.config.StreamingExecutor`, the default of ``None`` will attempt to use - ``ShuffleMethod.RAPIDSMPF``, but will fall back to ``ShuffleMethod.TASKS`` - if rapidsmpf is not installed. + With :class:`cudf_polars.utils.config.StreamingExecutor`, the default of ``None`` + will attempt to use ``ShuffleMethod.RAPIDSMPF`` for the distributed scheduler, + but will fall back to ``ShuffleMethod.TASKS`` if rapidsmpf is not installed. + + The user should **not** specify ``ShuffleMethod._RAPIDSMPF_SINGLE`` directly. + A setting of ``ShuffleMethod.RAPIDSMPF`` will be converted to the single-process + shuffler automatically when the 'synchronous' scheduler is active. """ TASKS = "tasks" RAPIDSMPF = "rapidsmpf" + _RAPIDSMPF_SINGLE = "rapidsmpf-single" T = TypeVar("T") @@ -420,24 +435,32 @@ class StreamingExecutor: def __post_init__(self) -> None: # noqa: D105 # Handle shuffle_method defaults for streaming executor if self.shuffle_method is None: - if self.scheduler == "distributed" and rapidsmpf_available(): + if self.scheduler == "distributed" and rapidsmpf_distributed_available(): # For distributed scheduler, prefer rapidsmpf if available object.__setattr__(self, "shuffle_method", "rapidsmpf") else: + # Otherwise, use task-based shuffle for now. + # TODO: Evaluate single-process shuffle by default. object.__setattr__(self, "shuffle_method", "tasks") - else: + elif self.shuffle_method == "rapidsmpf-single": + # The user should NOT specify "rapidsmpf-single" directly. + raise ValueError("rapidsmpf-single is not a supported shuffle method.") + elif self.shuffle_method == "rapidsmpf": + # Check that we have rapidsmpf installed if ( self.scheduler == "distributed" - and self.shuffle_method == "rapidsmpf" - and not rapidsmpf_available() + and not rapidsmpf_distributed_available() ): raise ValueError( - "rapidsmpf shuffle method requested, but rapidsmpf is not installed" + "rapidsmpf shuffle method requested, but rapidsmpf.integrations.dask is not installed." ) - if self.scheduler == "synchronous" and self.shuffle_method == "rapidsmpf": - raise ValueError( - "rapidsmpf shuffle method is not supported for synchronous scheduler" - ) + elif self.scheduler == "synchronous" and not rapidsmpf_single_available(): + raise ValueError( + "rapidsmpf shuffle method requested, but rapidsmpf is not installed." 
+ )
+ # Select "rapidsmpf-single" for the synchronous scheduler
+ if self.scheduler == "synchronous":
+ object.__setattr__(self, "shuffle_method", "rapidsmpf-single")

 # frozen dataclass, so use object.__setattr__
 object.__setattr__(
@@ -482,6 +505,14 @@ def __post_init__(self) -> None: # noqa: D105
 if not isinstance(self.sink_to_directory, bool):
 raise TypeError("sink_to_directory must be bool")

+ # RapidsMPF spill is only supported for the distributed scheduler for now.
+ # This is because the spilling API is still within the RMPF-Dask integration.
+ # (See https://github.com/rapidsai/rapidsmpf/issues/439)
+ if self.scheduler == "synchronous" and self.rapidsmpf_spill: # pragma: no cover
+ raise ValueError(
+ "rapidsmpf_spill is not supported for the synchronous scheduler."
+ )
+
 def __hash__(self) -> int: # noqa: D105
 # cardinality factory, a dict, isn't natively hashable. We'll dump it
 # to json and hash that.
diff --git a/python/cudf_polars/tests/experimental/test_rapidsmpf.py b/python/cudf_polars/tests/experimental/test_rapidsmpf.py
index 4b611355b12..c550a5e2d9f 100644
--- a/python/cudf_polars/tests/experimental/test_rapidsmpf.py
+++ b/python/cudf_polars/tests/experimental/test_rapidsmpf.py
@@ -8,6 +8,7 @@
 import polars as pl

 from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.utils.config import ConfigOptions


 @pytest.mark.parametrize("rapidsmpf_spill", [False, True])
@@ -68,3 +69,71 @@ def test_join_rapidsmpf(
 q = left.join(right, on="y", how="inner")

 assert_gpu_result_equal(q, engine=engine, check_row_order=False)
+
+
+@pytest.mark.parametrize("max_rows_per_partition", [1, 5])
+def test_join_rapidsmpf_single(max_rows_per_partition: int) -> None:
+ # skip when rapidsmpf is not installed
+ pytest.importorskip("rapidsmpf")
+
+ # Setup the GPUEngine config
+ engine = pl.GPUEngine(
+ raise_on_fail=True,
+ executor="streaming",
+ executor_options={
+ "max_rows_per_partition": max_rows_per_partition,
+ "broadcast_join_limit": 2,
+ "shuffle_method": "rapidsmpf",
+ "scheduler": "synchronous",
+ },
+ )
+
+ left = pl.LazyFrame(
+ {
+ "x": range(15),
+ "y": [1, 2, 3] * 5,
+ "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 3,
+ }
+ )
+ right = pl.LazyFrame(
+ {
+ "xx": range(6),
+ "y": [2, 4, 3] * 2,
+ "zz": [1, 2] * 3,
+ }
+ )
+ q = left.join(right, on="y", how="inner")
+
+ assert_gpu_result_equal(q, engine=engine, check_row_order=False)
+
+
+def test_join_rapidsmpf_single_private_config() -> None:
+ # The user may not specify "rapidsmpf-single" directly
+ engine = pl.GPUEngine(
+ raise_on_fail=True,
+ executor="streaming",
+ executor_options={
+ "shuffle_method": "rapidsmpf-single",
+ "scheduler": "synchronous",
+ },
+ )
+ with pytest.raises(ValueError, match="not a supported shuffle method"):
+ ConfigOptions.from_polars_engine(engine)
+
+
+def test_rapidsmpf_spill_synchronous_unsupported() -> None:
+ # skip when rapidsmpf is not installed
+ pytest.importorskip("rapidsmpf")
+
+ # rapidsmpf_spill=True is not yet supported with the synchronous scheduler.
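+ # (The spill hooks currently live in the RapidsMPF-Dask integration;
+ # see https://github.com/rapidsai/rapidsmpf/issues/439.)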
+ engine = pl.GPUEngine( + raise_on_fail=True, + executor="streaming", + executor_options={ + "shuffle_method": "rapidsmpf", + "scheduler": "synchronous", + "rapidsmpf_spill": True, + }, + ) + with pytest.raises(ValueError, match="rapidsmpf_spill.*not supported.*synchronous"): + ConfigOptions.from_polars_engine(engine) diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 3de18e0a4e4..795f35c5f8e 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -24,10 +24,22 @@ from cudf_polars.utils.versions import POLARS_VERSION_LT_130 -@pytest.fixture(params=[False, True], ids=["norapidsmpf", "rapidsmpf"]) -def rapidsmpf_available(request, monkeypatch): +@pytest.fixture(params=[False, True], ids=["norapidsmpf.single", "rapidsmpf.single"]) +def rapidsmpf_single_available(request, monkeypatch): monkeypatch.setattr( - cudf_polars.utils.config, "rapidsmpf_available", lambda: request.param + cudf_polars.utils.config, + "rapidsmpf_single_available", + lambda: request.param, + ) + return request.param + + +@pytest.fixture(params=[False, True], ids=["norapidsmpf.dask", "rapidsmpf.dask"]) +def rapidsmpf_distributed_available(request, monkeypatch): + monkeypatch.setattr( + cudf_polars.utils.config, + "rapidsmpf_distributed_available", + lambda: request.param, ) return request.param @@ -151,7 +163,9 @@ def test_parquet_options(executor: str) -> None: assert config.parquet_options.n_output_chunks == 16 -def test_validate_streaming_executor_shuffle_method(rapidsmpf_available) -> None: +def test_validate_streaming_executor_shuffle_method( + *, rapidsmpf_distributed_available: bool, rapidsmpf_single_available: bool +) -> None: config = ConfigOptions.from_polars_engine( pl.GPUEngine( executor="streaming", @@ -161,30 +175,34 @@ def test_validate_streaming_executor_shuffle_method(rapidsmpf_available) -> None assert config.executor.name == "streaming" assert config.executor.shuffle_method == "tasks" + # rapidsmpf with distributed scheduler engine = pl.GPUEngine( executor="streaming", executor_options={"shuffle_method": "rapidsmpf", "scheduler": "distributed"}, ) - if rapidsmpf_available: + if rapidsmpf_distributed_available: config = ConfigOptions.from_polars_engine(engine) assert config.executor.name == "streaming" assert config.executor.shuffle_method == "rapidsmpf" else: - with pytest.raises(ValueError, match="rapidsmpf is not installed"): + with pytest.raises( + ValueError, match="rapidsmpf.integrations.dask is not installed" + ): ConfigOptions.from_polars_engine(engine) - # rapidsmpf with sync is not allowed + # rapidsmpf with sync scheduler + engine = pl.GPUEngine( + executor="streaming", + executor_options={"shuffle_method": "rapidsmpf", "scheduler": "synchronous"}, + ) - with pytest.raises(ValueError, match="rapidsmpf shuffle method"): - ConfigOptions.from_polars_engine( - pl.GPUEngine( - executor="streaming", - executor_options={ - "shuffle_method": "rapidsmpf", - "scheduler": "synchronous", - }, - ) - ) + if rapidsmpf_single_available: + config = ConfigOptions.from_polars_engine(engine) + assert config.executor.name == "streaming" + assert config.executor.shuffle_method == "rapidsmpf-single" + else: + with pytest.raises(ValueError, match="rapidsmpf is not installed"): + ConfigOptions.from_polars_engine(engine) @pytest.mark.parametrize("executor", ["in-memory", "streaming"]) @@ -233,7 +251,10 @@ def test_validate_scheduler() -> None: ) -def test_validate_shuffle_method_defaults(rapidsmpf_available) -> None: +def 
test_validate_shuffle_method_defaults( + *, + rapidsmpf_distributed_available: bool, +) -> None: config = ConfigOptions.from_polars_engine( pl.GPUEngine( executor="streaming", @@ -252,7 +273,7 @@ def test_validate_shuffle_method_defaults(rapidsmpf_available) -> None: ) ) assert config.executor.name == "streaming" - if rapidsmpf_available: + if rapidsmpf_distributed_available: # Should be "rapidsmpf" if available, otherwise "tasks" assert config.executor.shuffle_method == "rapidsmpf" else: @@ -330,7 +351,7 @@ def test_parquet_options_from_env(monkeypatch: pytest.MonkeyPatch) -> None: def test_config_option_from_env( - monkeypatch: pytest.MonkeyPatch, *, rapidsmpf_available: bool + monkeypatch: pytest.MonkeyPatch, *, rapidsmpf_distributed_available: bool ) -> None: with monkeypatch.context() as m: m.setenv("CUDF_POLARS__EXECUTOR__SCHEDULER", "distributed") @@ -343,7 +364,7 @@ def test_config_option_from_env( m.setenv("CUDF_POLARS__EXECUTOR__RAPIDSMPF_SPILL", "1") m.setenv("CUDF_POLARS__EXECUTOR__SINK_TO_DIRECTORY", "1") - if rapidsmpf_available: + if rapidsmpf_distributed_available: m.setenv("CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", "rapidsmpf") else: m.setenv("CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", "tasks") @@ -361,7 +382,7 @@ def test_config_option_from_env( assert config.executor.rapidsmpf_spill is True assert config.executor.sink_to_directory is True - if rapidsmpf_available: + if rapidsmpf_distributed_available: assert config.executor.shuffle_method == "rapidsmpf" else: assert config.executor.shuffle_method == "tasks" From 17d1837ef83e25ea4026ea5183bdefe4bd805174 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 18 Aug 2025 10:49:41 -0400 Subject: [PATCH 147/366] Handle `TIMESTAMP_DAYS` in rolling window offsets (#19689) Contributes to #18633 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19689 --- python/cudf_polars/cudf_polars/dsl/utils/windows.py | 8 +++++++- python/cudf_polars/tests/expressions/test_rolling.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/utils/windows.py b/python/cudf_polars/cudf_polars/dsl/utils/windows.py index 91ea566f683..49aebc57fdf 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/windows.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/windows.py @@ -110,8 +110,14 @@ def duration_to_scalar(dtype: plc.DataType, value: int) -> plc.Scalar: return plc.Scalar.from_py( value // 1_000_000, plc.DataType(plc.TypeId.DURATION_MILLISECONDS) ) + elif tid == plc.TypeId.TIMESTAMP_DAYS: + return plc.Scalar.from_py( + value // 86_400_000_000_000, plc.DataType(plc.TypeId.DURATION_DAYS) + ) else: - raise NotImplementedError("Unsupported data type in rolling window offset") + raise NotImplementedError( + "Unsupported data type in rolling window offset" + ) # pragma: no cover; polars raises first def offsets_to_windows( diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py index 0253f14c416..373e4903cef 100644 --- a/python/cudf_polars/tests/expressions/test_rolling.py +++ b/python/cudf_polars/tests/expressions/test_rolling.py @@ -38,7 +38,7 @@ def test_rolling_datetime(time_unit): assert_gpu_result_equal(q) -def test_rolling_date_raises(): +def test_rolling_date(): dates = [ "2020-01-01", "2020-01-01", @@ -56,7 +56,7 @@ def test_rolling_date_raises(): 
max_a=pl.max("a").rolling(index_column="dt", period="10d", offset="2d"), ) - assert_ir_translation_raises(q, NotImplementedError) + assert_gpu_result_equal(q) @pytest.mark.parametrize("dtype", [pl.Int32, pl.UInt32, pl.Int64, pl.UInt64]) From 916c64aeac23525352b58b480261adc4715a2cea Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 18 Aug 2025 10:56:25 -0500 Subject: [PATCH 148/366] Enable casting `pl.Datetime` to integer types in `cudf-polars` (#19647) Part of https://github.com/rapidsai/cudf/issues/17060 Authors: - https://github.com/brandon-b-miller Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19647 --- .../cudf_polars/containers/column.py | 15 ++++++++ .../cudf_polars/cudf_polars/utils/dtypes.py | 5 +++ .../tests/expressions/test_casting.py | 6 ++-- .../tests/expressions/test_datetime_basic.py | 36 +++++++++++++++++++ 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 31e4c66a02c..a278ba32b43 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -307,6 +307,21 @@ def astype(self, dtype: DataType) -> Column: upcasted.children(), ) return Column(result, dtype=dtype).sorted_like(self) + elif plc.traits.is_integral_not_bool(plc_dtype) and plc.traits.is_timestamp( + self.obj.type() + ): + result = plc.column.Column( + plc.DataType(plc.TypeId.INT64), + self.obj.size(), + self.obj.data(), + self.obj.null_mask(), + self.obj.null_count(), + self.obj.offset(), + self.obj.children(), + ) + return Column(plc.unary.cast(result, plc_dtype), dtype=dtype).sorted_like( + self + ) else: result = Column(plc.unary.cast(self.obj, plc_dtype), dtype=dtype) if is_order_preserving_cast(self.obj.type(), plc_dtype): diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 23a94cf2a67..4756b2a0692 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -69,6 +69,11 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: and not to_is_empty and plc.traits.is_timestamp(to) ) + or ( + plc.traits.is_integral_not_bool(to) + and not to_is_empty + and plc.traits.is_timestamp(from_) + ) ) diff --git a/python/cudf_polars/tests/expressions/test_casting.py b/python/cudf_polars/tests/expressions/test_casting.py index 0722a0f198a..a8438509ec6 100644 --- a/python/cudf_polars/tests/expressions/test_casting.py +++ b/python/cudf_polars/tests/expressions/test_casting.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -14,7 +14,7 @@ _supported_dtypes = [(pl.Int8(), pl.Int64())] _unsupported_dtypes = [ - (pl.Datetime("ns"), pl.Int64()), + (pl.Boolean(), pl.Datetime("ns")), ] @@ -28,6 +28,8 @@ def tests(dtypes): fromtype, totype = dtypes if fromtype == pl.String(): data = ["a", "b", "c"] + elif fromtype == pl.Boolean(): + data = [True, False, True] else: data = [1, 2, 3] return pl.DataFrame( diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 5d98165a419..b9f13b9e9b1 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -386,3 +386,39 @@ def test_datetime_from_integer(datetime_dtype, integer_dtype): ) else: assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "datetime_dtype", + [ + pl.Datetime("ms"), + pl.Datetime("us"), + pl.Datetime("ns"), + ], +) +@pytest.mark.parametrize( + "integer_dtype", + [ + pl.Int64(), + pytest.param( + pl.UInt64(), marks=pytest.mark.xfail(reason="INT64 can not fit max(UINT64)") + ), + pl.Int32(), + pl.UInt32(), + pl.Int16(), + pl.UInt16(), + pl.Int8(), + pl.UInt8(), + ], +) +def test_integer_from_datetime(datetime_dtype, integer_dtype): + values = [ + 0, + 1, + 100, + pl.select(integer_dtype.max()).item(), + pl.select(integer_dtype.min()).item(), + ] + df = pl.LazyFrame({"data": pl.Series(values, dtype=datetime_dtype)}) + q = df.select(pl.col("data").cast(integer_dtype).alias("int_from_datetime")) + assert_gpu_result_equal(q) From fd7e0821ae7852a5f88bfb185703ea2f708ebf36 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 18 Aug 2025 12:03:43 -0500 Subject: [PATCH 149/366] Add API to "initialize" column statistics (#19447) Closes https://github.com/rapidsai/cudf/issues/19390 - Adds simple `StatsCollector` API - Adds `collect_base_stats` API (tested in this PR, but not *used* anywhere internally yet) - Adds `initialize_column_stats` dispatch functions and registers IR-specific logic for various IR sub-classes (this dispatch function is used by `collect_base_stats`). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Matthew Murray (https://github.com/Matt711) - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19447 --- .../cudf_polars/experimental/base.py | 51 ++- .../cudf_polars/experimental/dispatch.py | 37 +- .../cudf_polars/experimental/statistics.py | 208 ++++++++++ python/cudf_polars/docs/overview.md | 85 +++++ .../tests/experimental/test_dataframescan.py | 45 --- .../tests/experimental/test_scan.py | 121 ------ .../tests/experimental/test_stats.py | 357 ++++++++++++++++++ 7 files changed, 735 insertions(+), 169 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/experimental/statistics.py create mode 100644 python/cudf_polars/tests/experimental/test_stats.py diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py index ec4d2109133..01fc4d1e1fc 100644 --- a/python/cudf_polars/cudf_polars/experimental/base.py +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -11,6 +11,7 @@ from collections.abc import Generator, Iterator from cudf_polars.dsl.expr import NamedExpr + from cudf_polars.dsl.ir import IR from cudf_polars.dsl.nodebase import Node @@ -124,7 +125,9 @@ class ColumnStats: ---------- name Column name. - source + children + Child ColumnStats objects. 
+ source_info Datasource information. source_name Source-column name. @@ -132,9 +135,10 @@ class ColumnStats: Unique-value statistics. """ - __slots__ = ("name", "source_info", "source_name", "unique_stats") + __slots__ = ("children", "name", "source_info", "source_name", "unique_stats") name: str + children: tuple[ColumnStats, ...] source_info: DataSourceInfo source_name: str unique_stats: UniqueStats @@ -143,11 +147,54 @@ def __init__( self, name: str, *, + children: tuple[ColumnStats, ...] = (), source_info: DataSourceInfo | None = None, source_name: str | None = None, unique_stats: UniqueStats | None = None, ) -> None: self.name = name + self.children = children self.source_info = source_info or DataSourceInfo() self.source_name = source_name or name self.unique_stats = unique_stats or UniqueStats() + + def new_parent( + self, + *, + name: str | None = None, + ) -> ColumnStats: + """ + Initialize a new parent ColumnStats object. + + Parameters + ---------- + name + The new column name. + + Returns + ------- + A new ColumnStats object. + + Notes + ----- + This API preserves the original DataSourceInfo reference. + """ + return ColumnStats( + name=name or self.name, + children=(self,), + # Want to reference the same DataSourceInfo + source_info=self.source_info, + source_name=self.source_name, + # Want fresh UniqueStats so we can mutate in place + unique_stats=UniqueStats(), + ) + + +class StatsCollector: + """Column statistics collector.""" + + __slots__ = ("column_stats", "row_count") + + def __init__(self) -> None: + self.row_count: dict[IR, ColumnStat[int]] = {} + self.column_stats: dict[IR, dict[str, ColumnStats]] = {} diff --git a/python/cudf_polars/cudf_polars/experimental/dispatch.py b/python/cudf_polars/cudf_polars/experimental/dispatch.py index 4ea4460fcb3..b48ab73d671 100644 --- a/python/cudf_polars/cudf_polars/experimental/dispatch.py +++ b/python/cudf_polars/cudf_polars/experimental/dispatch.py @@ -14,7 +14,11 @@ from cudf_polars.dsl import ir from cudf_polars.dsl.ir import IR - from cudf_polars.experimental.base import PartitionInfo + from cudf_polars.experimental.base import ( + ColumnStats, + PartitionInfo, + StatsCollector, + ) from cudf_polars.utils.config import ConfigOptions @@ -97,3 +101,34 @@ def generate_ir_tasks( task_graph """ raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover + + +@singledispatch +def initialize_column_stats( + ir: IR, stats: StatsCollector, config_options: ConfigOptions +) -> dict[str, ColumnStats]: + """ + Initialize column statistics for an IR node. + + Parameters + ---------- + ir + The IR node to collect source statistics for. + stats + The `StatsCollector` object containing known source statistics. + config_options + GPUEngine configuration options. + + Returns + ------- + base_stats_mapping + Mapping between column names and base ``ColumnStats`` objects. + + Notes + ----- + Base column stats correspond to ``ColumnStats`` objects **without** + populated ``unique_stats`` information. The purpose of this function + is to propagate ``DataSourceInfo`` references and set ``children`` + attributes for each column of each IR node. 
+ """ + raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover diff --git a/python/cudf_polars/cudf_polars/experimental/statistics.py b/python/cudf_polars/cudf_polars/experimental/statistics.py new file mode 100644 index 00000000000..7c302fe59d7 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/statistics.py @@ -0,0 +1,208 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Utilities for tracking column statistics.""" + +from __future__ import annotations + +import itertools +from typing import TYPE_CHECKING + +from cudf_polars.dsl.ir import ( + IR, + DataFrameScan, + Distinct, + GroupBy, + HConcat, + Join, + Scan, + Union, +) +from cudf_polars.dsl.traversal import post_traversal +from cudf_polars.experimental.base import ( + ColumnStats, + StatsCollector, +) +from cudf_polars.experimental.dispatch import initialize_column_stats + +if TYPE_CHECKING: + from collections.abc import Sequence + + from cudf_polars.utils.config import ConfigOptions + + +def collect_base_stats(root: IR, config_options: ConfigOptions) -> StatsCollector: + """ + Collect base datasource statistics. + + Parameters + ---------- + root + Root IR node for collecting base datasource statistics. + config_options + GPUEngine configuration options. + + Returns + ------- + A new StatsCollector object with populated datasource statistics. + """ + stats: StatsCollector = StatsCollector() + for node in post_traversal([root]): + stats.column_stats[node] = initialize_column_stats(node, stats, config_options) + return stats + + +def _update_unique_stats_columns( + child_column_stats: dict[str, ColumnStats], + key_names: Sequence[str], + config_options: ConfigOptions, +) -> None: + """Update set of unique-stats columns in datasource.""" + assert config_options.executor.name == "streaming", ( + "'in-memory' executor not supported in 'add_source_stats'" + ) + unique_fraction = config_options.executor.unique_fraction + for name in key_names: + if ( + name not in unique_fraction + and (column_stats := child_column_stats.get(name)) is not None + and (source_stats := column_stats.source_info) is not None + ): + source_stats.add_unique_stats_column(column_stats.source_name or name) + + +@initialize_column_stats.register(IR) +def _default_initialize_column_stats( + ir: IR, stats: StatsCollector, config_options: ConfigOptions +) -> dict[str, ColumnStats]: + # Default `initialize_column_stats` implementation. + if len(ir.children) == 1: + (child,) = ir.children + child_column_stats = stats.column_stats.get(child, {}) + return { + name: child_column_stats.get(name, ColumnStats(name=name)).new_parent() + for name in ir.schema + } + else: # pragma: no cover + # Multi-child nodes loose all information by default. + return {name: ColumnStats(name=name) for name in ir.schema} + + +@initialize_column_stats.register(Distinct) +def _( + ir: Distinct, stats: StatsCollector, config_options: ConfigOptions +) -> dict[str, ColumnStats]: + # Use default initialize_column_stats after updating + # the known unique-stats columns. 
+ (child,) = ir.children
+ child_column_stats = stats.column_stats.get(child, {})
+ key_names = ir.subset or ir.schema
+ _update_unique_stats_columns(child_column_stats, list(key_names), config_options)
+ return _default_initialize_column_stats(ir, stats, config_options)
+
+
+@initialize_column_stats.register(Join)
+def _(
+ ir: Join, stats: StatsCollector, config_options: ConfigOptions
+) -> dict[str, ColumnStats]:
+ # Copy column statistics from both the left and right children.
+ # Special cases to consider:
+ # - If a column name appears in both sides of the join,
+ # we take it from the "primary" column (right for "Right"
+ # joins, left for all other joins).
+ # - If a column name doesn't appear in either child, it
+ # corresponds to a non-"primary" column with a suffix.
+
+ children, on = ir.children, (ir.left_on, ir.right_on)
+ how = ir.options[0]
+ suffix = ir.options[3]
+ if how == "Right":
+ children, on = children[::-1], on[::-1]
+ primary, other = children
+ primary_child_stats = stats.column_stats.get(primary, {})
+ other_child_stats = stats.column_stats.get(other, {})
+
+ # Build output column statistics
+ column_stats: dict[str, ColumnStats] = {}
+ for name in ir.schema:
+ if name in primary.schema:
+ # "Primary" child stats take preference.
+ column_stats[name] = primary_child_stats[name].new_parent()
+ elif name in other.schema:
+ # "Other" column stats apply to everything else.
+ column_stats[name] = other_child_stats[name].new_parent()
+ else:
+ # If the column name was not in either child table,
+ # a suffix was added to a column in "other".
+ _name = name.removesuffix(suffix)
+ column_stats[name] = other_child_stats[_name].new_parent(name=name)
+
+ # Update children
+ for p_key, o_key in zip(*on, strict=True):
+ column_stats[p_key.name].children = (
+ primary_child_stats[p_key.name],
+ other_child_stats[o_key.name],
+ )
+
+ return column_stats
+
+
+@initialize_column_stats.register(GroupBy)
+def _(
+ ir: GroupBy, stats: StatsCollector, config_options: ConfigOptions
+) -> dict[str, ColumnStats]:
+ (child,) = ir.children
+ child_column_stats = stats.column_stats.get(child, {})
+
+ # Update set of source columns we may lazily sample
+ _update_unique_stats_columns(
+ child_column_stats, [n.name for n in ir.keys], config_options
+ )
+ return _default_initialize_column_stats(ir, stats, config_options)
+
+
+@initialize_column_stats.register(HConcat)
+def _(
+ ir: HConcat, stats: StatsCollector, config_options: ConfigOptions
+) -> dict[str, ColumnStats]:
+ child_column_stats = dict(
+ itertools.chain.from_iterable(
+ stats.column_stats.get(c, {}).items() for c in ir.children
+ )
+ )
+ return {
+ name: child_column_stats.get(name, ColumnStats(name=name)).new_parent()
+ for name in ir.schema
+ }
+
+
+@initialize_column_stats.register(Union)
+def _(
+ ir: IR, stats: StatsCollector, config_options: ConfigOptions
+) -> dict[str, ColumnStats]:
+ # Union loses source information for now.
+ return {
+ name: ColumnStats(
+ name=name,
+ children=tuple(stats.column_stats[child][name] for child in ir.children),
+ )
+ for name in ir.schema
+ }
+
+
+@initialize_column_stats.register(Scan)
+def _(
+ ir: Scan, stats: StatsCollector, config_options: ConfigOptions
+) -> dict[str, ColumnStats]:
+ from cudf_polars.experimental.io import _extract_scan_stats
+
+ return _extract_scan_stats(ir, config_options)
+
+
+@initialize_column_stats.register(DataFrameScan)
+def _(
+ ir: DataFrameScan, stats: StatsCollector, config_options: ConfigOptions
+) -> dict[str, ColumnStats]:
+ from cudf_polars.experimental.io import _extract_dataframescan_stats
+
+ return _extract_dataframescan_stats(ir)
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index df6af87bbf0..0eaf182762b 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -383,6 +383,91 @@ def rename(e: Expr, mapping: Mapping[str, str]) -> Expr:
 return mapper(e)
 ```
+
+# Estimated column statistics
+
+:::{note}
+Column-statistics estimation is experimental and the details are
+likely to change in the future.
+:::
+
+The `cudf-polars` streaming executor (enabled by default) may use
+estimated column statistics to transform translated logical-plan
+IR nodes into the final "physical-plan" IR nodes.
+
+## Storing statistics
+
+The following classes are used to store column statistics (listed
+in order of decreasing granularity):
+
+- `ColumnStat`: This class is used to store an individual column
+statistic (e.g. row count or unique-value count). Each object
+has two important attributes:
+ - `ColumnStat.value`: Returns the actual column-statistic value
+ (e.g. an `int` if the statistic is a row-count) or `None` if no
+ estimate is available.
+ - `ColumnStat.exact`: Whether the statistic is known "exactly".
+- `UniqueStats`: Since we usually sample both the unique-value
+**count** and the unique-value **fraction** of a column at once,
+we use `UniqueStats` to group these `ColumnStat`s into one object.
+- `DataSourceInfo`: This class is used to sample and store
+`ColumnStat`/`UniqueStats` objects associated with a single
+datasource (e.g. a Parquet dataset or in-memory `DataFrame`).
+ - Since it can be expensive to sample datasource statistics,
+ this class is specifically designed to enable **lazy** and
+ **aggregated** column sampling via sub-classing. For example,
+ the `ParquetSourceInfo` sub-class uses caching to avoid
+ redundant file-system access.
+- `ColumnStats`: This class is used to group together the "base"
+`DataSourceInfo` reference and the current `UniqueStats` estimates
+for a specific IR + column combination. We bundle these references
+together to simplify the design and maintenance of `StatsCollector`.
+**NOTE:** The current `UniqueStats` estimates are not yet populated.
+- `StatsCollector`: This class is used to collect and store
+statistics for all IR nodes within a single query. The statistics
+attached to each IR node refer to the **output** columns of the
+IR node in question. The `StatsCollector` class is especially important,
+because it is used to organize **all** statistics within a logical plan.
+Each object has two important attributes:
+ - `StatsCollector.row_count`: Returns a mapping between each IR
+ node and the row-count `ColumnStat` estimate for that node.
+ **NOTE:** This attribute is not yet populated.
+ - `StatsCollector.column_stats`: Returns a mapping between each IR
+ node and the `dict[str, ColumnStats]` mapping for that node.
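+
+A rough sketch of how these containers nest, using the
+`collect_base_stats` helper described in the next section (the
+query `q`, the `engine` object, and the column name `"x"` below
+are placeholders, mirroring the project's unit tests):
+
+```python
+from cudf_polars import Translator
+from cudf_polars.experimental.statistics import collect_base_stats
+from cudf_polars.utils.config import ConfigOptions
+
+ir = Translator(q._ldf.visit(), engine).translate_ir()
+stats = collect_base_stats(ir, ConfigOptions.from_polars_engine(engine))
+
+# Output-column statistics for the root IR node
+column_stats = stats.column_stats[ir]
+
+# Shared datasource info and (possibly inexact) statistics
+source_info = column_stats["x"].source_info
+row_count = source_info.row_count.value  # None if no estimate is available
+unique_count = source_info.unique_stats("x").count.value
+```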
+
+## Collecting and using statistics
+
+:::{note}
+Column-statistics collection is under active development.
+The existing APIs only support the sampling of base
+datasource statistics. The current row-count and unique-value
+statistics for each IR node are not populated by any traversal
+logic yet.
+:::
+
+### Collecting base statistics
+
+The top-level API for sampling base datasource statistics is
+`cudf_polars.experimental.statistics.collect_base_stats`. This
+function calls into the `initialize_column_stats` single-dispatch
+function to collect a `dict[str, ColumnStats]` mapping for each
+IR node in the logical plan.
+
+The IR-specific logic for each `initialize_column_stats` dispatch is
+relatively simple, because the only goal is to collect and propagate
+the underlying `DataSourceInfo` reference and child-`ColumnStats`
+references for each column. This means that `Scan` and `DataFrameScan`
+are the only IR classes needing specialized sampling logic. All other
+IR classes typically just propagate references from their child IR nodes.
+
+### Using base statistics
+
+Base `DataSourceInfo` references are currently used to calculate
+the partition count when a Parquet-based `Scan` node is lowered
+by the `cudf-polars` streaming executor.
+
+In the future, these statistics will also be used for
+parallel-algorithm selection and intermediate repartitioning.
+
 # Containers
 Containers should be constructed as relatively lightweight objects
diff --git a/python/cudf_polars/tests/experimental/test_dataframescan.py b/python/cudf_polars/tests/experimental/test_dataframescan.py
index 67348b2c30d..aea526a4459 100644
--- a/python/cudf_polars/tests/experimental/test_dataframescan.py
+++ b/python/cudf_polars/tests/experimental/test_dataframescan.py
@@ -3,8 +3,6 @@
 from __future__ import annotations

-import math
-
 import pytest

 import polars as pl
@@ -60,46 +58,3 @@ def test_dataframescan_concat(df):
 )
 df2 = pl.concat([df, df])
 assert_gpu_result_equal(df2, engine=engine)
-
-
-def test_source_statistics(df):
- from cudf_polars.experimental.io import _extract_dataframescan_stats
-
- row_count = df.collect().height
- engine = pl.GPUEngine(
- raise_on_fail=True,
- executor="streaming",
- executor_options={
- "max_rows_per_partition": 1_000,
- "scheduler": DEFAULT_SCHEDULER,
- },
- )
- ir = Translator(df._ldf.visit(), engine).translate_ir()
- column_stats = _extract_dataframescan_stats(ir)
-
- # Source info is the same for all columns
- source_info = column_stats["x"].source_info
- assert source_info is column_stats["y"].source_info
- assert source_info is column_stats["z"].source_info
- assert source_info.row_count.value == row_count
- assert source_info.row_count.exact
-
- # Storage stats should not be available
- assert source_info.storage_size("x").value is None
-
- # Check unique stats
- assert math.isclose(
- source_info.unique_stats("x").count.value, row_count, rel_tol=1e-2
- )
- assert math.isclose(source_info.unique_stats("x").fraction.value, 1.0, abs_tol=1e-2)
- assert not source_info.unique_stats("x").count.exact
- assert math.isclose(source_info.unique_stats("y").count.value, 3, rel_tol=1e-2)
- assert math.isclose(
- source_info.unique_stats("y").fraction.value, 3 / row_count, abs_tol=1e-2
- )
- assert not source_info.unique_stats("y").count.exact
- assert math.isclose(source_info.unique_stats("z").count.value, 5, rel_tol=1e-2)
- assert math.isclose(
- source_info.unique_stats("z").fraction.value, 5 / row_count, abs_tol=1e-2
- )
- assert not source_info.unique_stats("z").count.exact
diff --git
a/python/cudf_polars/tests/experimental/test_scan.py b/python/cudf_polars/tests/experimental/test_scan.py index 4123c47bce8..c9ede627ae6 100644 --- a/python/cudf_polars/tests/experimental/test_scan.py +++ b/python/cudf_polars/tests/experimental/test_scan.py @@ -84,124 +84,3 @@ def test_split_scan_predicate(tmp_path, df, mask): }, ) assert_gpu_result_equal(q, engine=engine) - - -@pytest.mark.parametrize("n_files", [1, 3]) -@pytest.mark.parametrize("row_group_size", [None, 10_000]) -@pytest.mark.parametrize("max_footer_samples", [3, 0]) -@pytest.mark.parametrize("max_row_group_samples", [1, 0]) -def test_source_statistics( - tmp_path, - df, - n_files, - row_group_size, - max_footer_samples, - max_row_group_samples, -): - from cudf_polars.experimental.io import ( - _clear_source_info_cache, - _extract_scan_stats, - ) - - _clear_source_info_cache() - make_partitioned_source( - df, - tmp_path, - "parquet", - n_files=n_files, - row_group_size=row_group_size, - ) - q = pl.scan_parquet(tmp_path) - engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "target_partition_size": 10_000, - "scheduler": DEFAULT_SCHEDULER, - }, - parquet_options={ - "max_footer_samples": max_footer_samples, - "max_row_group_samples": max_row_group_samples, - }, - ) - ir = Translator(q._ldf.visit(), engine).translate_ir() - column_stats = _extract_scan_stats(ir, ConfigOptions.from_polars_engine(engine)) - - # Source info is the same for all columns - source_info = column_stats["x"].source_info - assert source_info is column_stats["y"].source_info - assert source_info is column_stats["z"].source_info - if max_footer_samples: - assert source_info.row_count.value == df.height - assert source_info.row_count.exact - else: - assert source_info.row_count.value is None - - # Storage stats should be available - if max_footer_samples: - assert source_info.storage_size("x").value > 0 - assert source_info.storage_size("y").value > 0 - else: - assert source_info.storage_size("x").value is None - assert source_info.storage_size("y").value is None - - # Check that we can query a missing column name - assert source_info.storage_size("foo").value is None - assert source_info.unique_stats("foo").count.value is None - assert source_info.unique_stats("foo").fraction.value is None - - # source._unique_stats should be empty - assert set(source_info._unique_stats) == set() - - if max_footer_samples and max_row_group_samples: - assert source_info.unique_stats("x").count.value == df.height - assert source_info.unique_stats("x").fraction.value == 1.0 - else: - assert source_info.unique_stats("x").count.value is None - assert source_info.unique_stats("x").fraction.value is None - - # source_info._unique_stats should only contain 'x' - if max_footer_samples and max_row_group_samples: - assert set(source_info._unique_stats) == {"x"} - else: - assert set(source_info._unique_stats) == set() - - # Check add_unique_stats_column behavior - if max_footer_samples and max_row_group_samples: - # Can add a "bad"/missing key column - source_info.add_unique_stats_column("foo") - assert set(source_info._unique_stats) == {"x"} - - # Mark 'z' as a key column, and query 'y' stats - source_info.add_unique_stats_column("z") - if n_files == 1 and row_group_size == 10_000: - assert source_info.unique_stats("y").count.value == 3 - else: - assert source_info.unique_stats("y").count.value is None - assert source_info.unique_stats("y").fraction.value < 1.0 - - # source_info._unique_stats should contain all columns now - assert 
set(source_info._unique_stats) == {"x", "y", "z"} - - -def test_source_statistics_csv(tmp_path, df): - from cudf_polars.experimental.io import _extract_scan_stats - - make_partitioned_source(df, tmp_path, "csv", n_files=3) - q = pl.scan_csv(tmp_path) - engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "target_partition_size": 10_000, - "scheduler": DEFAULT_SCHEDULER, - }, - ) - ir = Translator(q._ldf.visit(), engine).translate_ir() - column_stats = _extract_scan_stats(ir, ConfigOptions.from_polars_engine(engine)) - - # Source info should be empty for CSV - source_info = column_stats["x"].source_info - assert source_info.row_count.value is None - assert source_info.unique_stats("x").count.value is None - assert source_info.unique_stats("x").fraction.value is None diff --git a/python/cudf_polars/tests/experimental/test_stats.py b/python/cudf_polars/tests/experimental/test_stats.py new file mode 100644 index 00000000000..147cfc98555 --- /dev/null +++ b/python/cudf_polars/tests/experimental/test_stats.py @@ -0,0 +1,357 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import math + +import pytest + +import polars as pl + +from cudf_polars import Translator +from cudf_polars.experimental.io import _clear_source_info_cache +from cudf_polars.experimental.statistics import collect_base_stats +from cudf_polars.testing.asserts import DEFAULT_SCHEDULER, assert_gpu_result_equal +from cudf_polars.testing.io import make_partitioned_source +from cudf_polars.utils.config import ConfigOptions + + +@pytest.fixture(scope="module") +def df(): + return pl.DataFrame( + { + "x": range(3_000), + "y": ["cat", "dog", "fish"] * 1_000, + "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 600, + } + ) + + +def test_base_stats_dataframescan(df): + row_count = df.height + q = pl.LazyFrame(df) + engine = pl.GPUEngine( + raise_on_fail=True, + executor="streaming", + executor_options={ + "max_rows_per_partition": 1_000, + "scheduler": DEFAULT_SCHEDULER, + }, + ) + ir = Translator(q._ldf.visit(), engine).translate_ir() + stats = collect_base_stats(ir, ConfigOptions.from_polars_engine(engine)) + column_stats = stats.column_stats[ir] + + # Source info is the same for all columns + source_info = column_stats["x"].source_info + assert source_info is column_stats["y"].source_info + assert source_info is column_stats["z"].source_info + assert source_info.row_count.value == row_count + assert source_info.row_count.exact + + # Storage stats should not be available + assert source_info.storage_size("x").value is None + + # Check unique stats + assert math.isclose( + source_info.unique_stats("x").count.value, row_count, rel_tol=5e-2 + ) + assert math.isclose(source_info.unique_stats("x").fraction.value, 1.0, abs_tol=1e-2) + assert not source_info.unique_stats("x").count.exact + assert math.isclose(source_info.unique_stats("y").count.value, 3, rel_tol=5e-2) + assert math.isclose( + source_info.unique_stats("y").fraction.value, 3 / row_count, abs_tol=1e-2 + ) + assert not source_info.unique_stats("y").count.exact + assert math.isclose(source_info.unique_stats("z").count.value, 5, rel_tol=5e-2) + assert math.isclose( + source_info.unique_stats("z").fraction.value, 5 / row_count, abs_tol=1e-2 + ) + assert not source_info.unique_stats("z").count.exact + + +@pytest.mark.parametrize("n_files", [1, 3]) +@pytest.mark.parametrize("row_group_size", [None, 10_000]) 
+@pytest.mark.parametrize("max_footer_samples", [3, 0]) +@pytest.mark.parametrize("max_row_group_samples", [1, 0]) +def test_base_stats_parquet( + tmp_path, + df, + n_files, + row_group_size, + max_footer_samples, + max_row_group_samples, +): + _clear_source_info_cache() + make_partitioned_source( + df, + tmp_path, + "parquet", + n_files=n_files, + row_group_size=row_group_size, + ) + q = pl.scan_parquet(tmp_path) + engine = pl.GPUEngine( + raise_on_fail=True, + executor="streaming", + executor_options={ + "target_partition_size": 10_000, + "scheduler": DEFAULT_SCHEDULER, + }, + parquet_options={ + "max_footer_samples": max_footer_samples, + "max_row_group_samples": max_row_group_samples, + }, + ) + ir = Translator(q._ldf.visit(), engine).translate_ir() + stats = collect_base_stats(ir, ConfigOptions.from_polars_engine(engine)) + column_stats = stats.column_stats[ir] + + # Source info is the same for all columns + source_info = column_stats["x"].source_info + assert source_info is column_stats["y"].source_info + assert source_info is column_stats["z"].source_info + if max_footer_samples: + assert source_info.row_count.value == df.height + assert source_info.row_count.exact + else: + assert source_info.row_count.value is None + + # Storage stats should be available + if max_footer_samples: + assert source_info.storage_size("x").value > 0 + assert source_info.storage_size("y").value > 0 + else: + assert source_info.storage_size("x").value is None + assert source_info.storage_size("y").value is None + + # Check that we can query a missing column name + assert source_info.storage_size("foo").value is None + assert source_info.unique_stats("foo").count.value is None + assert source_info.unique_stats("foo").fraction.value is None + + # source._unique_stats should be empty + assert set(source_info._unique_stats) == set() + + if max_footer_samples and max_row_group_samples: + assert source_info.unique_stats("x").count.value == df.height + assert source_info.unique_stats("x").fraction.value == 1.0 + else: + assert source_info.unique_stats("x").count.value is None + assert source_info.unique_stats("x").fraction.value is None + + # source_info._unique_stats should only contain 'x' + if max_footer_samples and max_row_group_samples: + assert set(source_info._unique_stats) == {"x"} + else: + assert set(source_info._unique_stats) == set() + + # Check add_unique_stats_column behavior + if max_footer_samples and max_row_group_samples: + # Can add a "bad"/missing key column + source_info.add_unique_stats_column("foo") + assert set(source_info._unique_stats) == {"x"} + + # Mark 'z' as a key column, and query 'y' stats + source_info.add_unique_stats_column("z") + if n_files == 1 and row_group_size == 10_000: + assert source_info.unique_stats("y").count.value == 3 + else: + assert source_info.unique_stats("y").count.value is None + assert source_info.unique_stats("y").fraction.value < 1.0 + + # source_info._unique_stats should contain all columns now + assert set(source_info._unique_stats) == {"x", "y", "z"} + + +def test_base_stats_csv(tmp_path, df): + make_partitioned_source(df, tmp_path, "csv", n_files=3) + q = pl.scan_csv(tmp_path) + engine = pl.GPUEngine( + raise_on_fail=True, + executor="streaming", + executor_options={ + "target_partition_size": 10_000, + "scheduler": DEFAULT_SCHEDULER, + }, + ) + ir = Translator(q._ldf.visit(), engine).translate_ir() + stats = collect_base_stats(ir, ConfigOptions.from_polars_engine(engine)) + column_stats = stats.column_stats[ir] + + # Source info should be empty for 
CSV + source_info = column_stats["x"].source_info + assert source_info.row_count.value is None + assert source_info.unique_stats("x").count.value is None + assert source_info.unique_stats("x").fraction.value is None + + +@pytest.mark.parametrize("max_footer_samples", [1, 3]) +@pytest.mark.parametrize("max_row_group_samples", [1, 2]) +def test_base_stats_parquet_groupby( + tmp_path, + df, + max_footer_samples, + max_row_group_samples, +): + n_files = 3 + _clear_source_info_cache() + make_partitioned_source(df, tmp_path, "parquet", n_files=n_files) + q = pl.scan_parquet(tmp_path) + engine = pl.GPUEngine( + raise_on_fail=True, + executor="streaming", + executor_options={ + "target_partition_size": 10_000, + "scheduler": DEFAULT_SCHEDULER, + }, + parquet_options={ + "max_footer_samples": max_footer_samples, + "max_row_group_samples": max_row_group_samples, + }, + ) + + # Check simple selection + q1 = q.select(pl.col("x"), pl.col("y")) + qir1 = Translator(q1._ldf.visit(), engine).translate_ir() + stats = collect_base_stats(qir1, ConfigOptions.from_polars_engine(engine)) + source_info_y = stats.column_stats[qir1]["y"].source_info + unique_stats_y = source_info_y.unique_stats("y") + y_unique_fraction = unique_stats_y.fraction + y_row_count = source_info_y.row_count + assert y_unique_fraction.value < 1.0 + assert y_unique_fraction.value > 0.0 + assert unique_stats_y.count.value is None + if max_footer_samples >= n_files: + # We should have "exact" row-count statistics + assert y_row_count.value == df.height + assert y_row_count.exact + else: + # We should have "estimated" row-count statistics + assert y_row_count.value > 0 + assert not y_row_count.exact + assert_gpu_result_equal(q1.sort(pl.col("x")).slice(0, 2), engine=engine) + + # Source statistics of "y" should match after GroupBy/Select/HStack/etc + q2 = ( + pl.concat( + [ + q.select(pl.col("x")), + q.select(pl.col("y")), + ], + how="horizontal", + ) + .group_by(pl.col("y")) + .sum() + .select(pl.col("x").max(), pl.col("y")) + .with_columns((pl.col("x") * pl.col("x")).alias("x2")) + ) + qir2 = Translator(q2._ldf.visit(), engine).translate_ir() + stats = collect_base_stats(qir2, ConfigOptions.from_polars_engine(engine)) + source_info_y = stats.column_stats[qir2]["y"].source_info + assert source_info_y.unique_stats("y").fraction == y_unique_fraction + assert y_row_count == source_info_y.row_count + assert_gpu_result_equal(q2.sort(pl.col("y")).slice(0, 2), engine=engine) + + +@pytest.mark.parametrize("how", ["inner", "left", "right"]) +def test_base_stats_join(how): + engine = pl.GPUEngine( + raise_on_fail=True, + executor="streaming", + executor_options={ + "scheduler": DEFAULT_SCHEDULER, + "shuffle_method": "tasks", + }, + ) + left = pl.LazyFrame( + { + "x": range(15), + "y": [1, 2, 3] * 5, + "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 3, + } + ) + right = pl.LazyFrame( + { + "xx": range(9), + "y": [2, 4, 3] * 3, + "z": [1, 2, 3] * 3, + } + ) + q = left.join(right, on="y", how=how) + ir = Translator(q._ldf.visit(), engine).translate_ir() + stats = collect_base_stats(ir, ConfigOptions.from_polars_engine(engine)) + + ir_column_stats = stats.column_stats[ir] + left_count, right_count = 15, 9 + if how == "left": + assert ir_column_stats["x"].source_info.row_count.value == left_count + assert ir_column_stats["y"].source_info.row_count.value == left_count + assert ir_column_stats["z"].source_info.row_count.value == left_count + if how == "inner": + assert ir_column_stats["x"].source_info.row_count.value == left_count + assert 
ir_column_stats["y"].source_info.row_count.value == left_count
+        assert ir_column_stats["z"].source_info.row_count.value == left_count
+        assert ir_column_stats["xx"].source_info.row_count.value == right_count
+        assert ir_column_stats["z_right"].source_info.row_count.value == right_count
+    if how == "right":
+        assert ir_column_stats["xx"].source_info.row_count.value == right_count
+        assert ir_column_stats["y"].source_info.row_count.value == right_count
+        assert ir_column_stats["z"].source_info.row_count.value == right_count
+
+
+def test_base_stats_union():
+    engine = pl.GPUEngine(
+        raise_on_fail=True,
+        executor="streaming",
+        executor_options={
+            "scheduler": DEFAULT_SCHEDULER,
+            "shuffle_method": "tasks",
+        },
+    )
+    left = pl.LazyFrame(
+        {
+            "x": range(15),
+            "y": [1, 2, 3] * 5,
+            "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 3,
+        }
+    )
+    right = pl.LazyFrame(
+        {
+            "x": range(9),
+            "y": [2, 4, 3] * 3,
+            "z": [1.0, 2.0, 3.0] * 3,
+        }
+    )
+
+    q = pl.concat([left, right])
+    ir = Translator(q._ldf.visit(), engine).translate_ir()
+    stats = collect_base_stats(ir, ConfigOptions.from_polars_engine(engine))
+    column_stats = stats.column_stats[ir]
+
+    # We lose source info after a Union, but we will be able to set
+    # accurate row-count and unique-value estimates for the current
+    # IR in #19392
+    source_info = column_stats["x"].source_info
+    assert source_info.row_count.value is None
+
+
+def test_base_stats_distinct(df):
+    row_count = df.height
+    engine = pl.GPUEngine(
+        raise_on_fail=True,
+        executor="streaming",
+        executor_options={
+            "scheduler": DEFAULT_SCHEDULER,
+            "shuffle_method": "tasks",
+        },
+    )
+    q = pl.LazyFrame(df).unique(subset=["y"])
+    ir = Translator(q._ldf.visit(), engine).translate_ir()
+    stats = collect_base_stats(ir, ConfigOptions.from_polars_engine(engine))
+    column_stats = stats.column_stats[ir]
+
+    source_info = column_stats["y"].source_info
+    assert source_info.row_count.value == row_count
+    assert source_info.row_count.exact

From a4d6e0566556631e73c3f064945bf17b01d67d09 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Mon, 18 Aug 2025 11:45:45 -0700
Subject: [PATCH 150/366] Use the nvCOMP 5.0 API to better estimate
 decompression memory requirements (#19616)

nvCOMP 5 adds synchronous APIs that more accurately determine the scratch
size requirements for decompression (ZSTD only for now). This PR integrates
the new API into `batched_decompress`, as well as into the chunked reader
(where it is used to determine the memory overhead from decompression).

The new API is expensive to call, as it launches a kernel. For this reason,
it is used differently than the old API in the chunked reader. The result
may be imprecise when blocks have very uneven compression ratios, but the
error should be minor given that the new API reduces the memory overhead
significantly*. Another change: the temp sizes in the chunked reader are
now stored between subpasses so we don't perform redundant computation.

*Measured up to 2.2x lower peak memory use in micro-benchmarks. No
performance overhead from the new API was observed in the non-chunked
read_parquet benchmarks.
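
For illustration only (this snippet is not part of the patch), the
selection logic added here boils down to the following, where
`compression`, `inputs` (a device span of compressed chunks), `di`
(an aggregated `decompression_info`), and the two size limits are
placeholder names:

    // Sketch: prefer the precise nvCOMP 5 sync API when it is available
    size_t temp_size = 0;
    if (cudf::io::detail::is_decompression_scratch_size_ex_supported(compression)) {
      // The sync API may launch a kernel, so it is only worth calling
      // when the tighter estimate is actually reused
      temp_size = cudf::io::detail::get_decompression_scratch_size_ex(
        compression, inputs, max_uncomp_chunk_size, max_total_uncomp_size, stream);
    } else {
      // Legacy path: conservative upper bound from decompression_info
      temp_size = cudf::io::detail::get_decompression_scratch_size(di);
    }
    rmm::device_buffer scratch(temp_size, stream);
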
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/19616 --- cpp/src/io/comp/common.cpp | 13 -- cpp/src/io/comp/common_internal.hpp | 14 +- cpp/src/io/comp/decompression.cpp | 50 +++++- cpp/src/io/comp/decompression.hpp | 23 ++- cpp/src/io/comp/nvcomp_adapter.cpp | 142 +++++++++++++++++- cpp/src/io/comp/nvcomp_adapter.cu | 17 +++ cpp/src/io/comp/nvcomp_adapter.cuh | 6 + cpp/src/io/comp/nvcomp_adapter.hpp | 25 +++ cpp/src/io/parquet/reader_impl_chunking.cu | 6 +- cpp/src/io/parquet/reader_impl_chunking.hpp | 1 + .../io/parquet/reader_impl_chunking_utils.cu | 116 ++++++++++++-- .../io/parquet/reader_impl_chunking_utils.cuh | 11 +- 12 files changed, 381 insertions(+), 43 deletions(-) diff --git a/cpp/src/io/comp/common.cpp b/cpp/src/io/comp/common.cpp index 1db3e48f823..99805f693ed 100644 --- a/cpp/src/io/comp/common.cpp +++ b/cpp/src/io/comp/common.cpp @@ -23,19 +23,6 @@ namespace cudf::io::detail { -[[nodiscard]] std::optional to_nvcomp_compression( - compression_type compression) -{ - switch (compression) { - case compression_type::GZIP: return nvcomp::compression_type::GZIP; - case compression_type::LZ4: return nvcomp::compression_type::LZ4; - case compression_type::SNAPPY: return nvcomp::compression_type::SNAPPY; - case compression_type::ZLIB: return nvcomp::compression_type::DEFLATE; - case compression_type::ZSTD: return nvcomp::compression_type::ZSTD; - default: return std::nullopt; - } -} - [[nodiscard]] std::string compression_type_name(compression_type compression) { switch (compression) { diff --git a/cpp/src/io/comp/common_internal.hpp b/cpp/src/io/comp/common_internal.hpp index 3bf430b9a1e..d6b45a18da3 100644 --- a/cpp/src/io/comp/common_internal.hpp +++ b/cpp/src/io/comp/common_internal.hpp @@ -49,8 +49,18 @@ constexpr double default_host_device_decompression_work_ratio = 100; // single GPU block; higher values lead to more host compression in HYBRID mode constexpr double default_host_device_compression_work_ratio = 100; -[[nodiscard]] std::optional to_nvcomp_compression( - compression_type compression); +[[nodiscard]] constexpr std::optional to_nvcomp_compression( + compression_type compression) +{ + switch (compression) { + case compression_type::GZIP: return nvcomp::compression_type::GZIP; + case compression_type::LZ4: return nvcomp::compression_type::LZ4; + case compression_type::SNAPPY: return nvcomp::compression_type::SNAPPY; + case compression_type::ZLIB: return nvcomp::compression_type::DEFLATE; + case compression_type::ZSTD: return nvcomp::compression_type::ZSTD; + default: return std::nullopt; + } +} struct sorted_codec_parameters { rmm::device_uvector> inputs; diff --git a/cpp/src/io/comp/decompression.cpp b/cpp/src/io/comp/decompression.cpp index 8cf49506d17..7362e41a5fa 100644 --- a/cpp/src/io/comp/decompression.cpp +++ b/cpp/src/io/comp/decompression.cpp @@ -37,7 +37,6 @@ #include #include // memset #include -#include #include namespace cudf::io::detail { @@ -537,10 +536,10 @@ void device_decompress(compression_type compression, CUDF_FUNC_RANGE(); if (compression == compression_type::NONE or inputs.empty()) { return; } - auto const nvcomp_type = to_nvcomp_compression(compression); - auto nvcomp_disabled_reason = nvcomp_type.has_value() - ? 
nvcomp::is_decompression_disabled(*nvcomp_type) - : "invalid compression type"; + auto const nvcomp_type = to_nvcomp_compression(compression); + auto const nvcomp_disabled_reason = nvcomp_type.has_value() + ? nvcomp::is_decompression_disabled(*nvcomp_type) + : "invalid compression type"; if (not nvcomp_disabled_reason) { return nvcomp::batched_decompress( *nvcomp_type, inputs, outputs, results, max_uncomp_chunk_size, max_total_uncomp_size, stream); @@ -638,9 +637,10 @@ size_t get_uncompressed_size(compression_type compression, host_span const> inputs, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size, + rmm::cuda_stream_view stream) +{ + if (compression == compression_type::NONE or + get_host_engine_state(compression) == host_engine_state::ON) { + return 0; + } + + auto const nvcomp_type = to_nvcomp_compression(compression); + auto const nvcomp_disabled = nvcomp_type.has_value() + ? nvcomp::is_decompression_disabled(*nvcomp_type) + : "invalid compression type"; + if (nvcomp_disabled) { + CUDF_FAIL("Cannot compute decompression scratch size for " + + compression_type_name(compression)); + } + return nvcomp::batched_decompress_temp_size_ex( + nvcomp_type.value(), inputs, max_uncomp_chunk_size, max_total_uncomp_size, stream); +} + +[[nodiscard]] bool is_decompression_scratch_size_ex_supported(compression_type compression) +{ + auto const nvcomp_type = to_nvcomp_compression(compression); + auto const nvcomp_disabled = nvcomp_type.has_value() + ? nvcomp::is_decompression_disabled(*nvcomp_type) + : "invalid compression type"; + if (nvcomp_disabled) { return false; } + return nvcomp::is_batched_decompress_temp_size_ex_supported(nvcomp_type.value()); +} + size_t decompress(compression_type compression, host_span src, host_span dst) diff --git a/cpp/src/io/comp/decompression.hpp b/cpp/src/io/comp/decompression.hpp index 932bf182936..08700d65663 100644 --- a/cpp/src/io/comp/decompression.hpp +++ b/cpp/src/io/comp/decompression.hpp @@ -48,12 +48,29 @@ struct decompression_info { }; /** - * @brief Functor which returns total scratch space required based on computed decompression_info - * data. - * + * @brief Returns total scratch space required based on computed decompression_info data. */ [[nodiscard]] size_t get_decompression_scratch_size(decompression_info const& di); +/** + * @brief Returns total scratch space required based on the compressed input data. + * + * Might launch a kernel. Should be used only if is_decompression_scratch_size_ex_supported returns + * true. + */ +[[nodiscard]] size_t get_decompression_scratch_size_ex( + compression_type compression, + device_span const> inputs, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size, + rmm::cuda_stream_view stream); + +/** + * @brief Checks if the decompression scratch size can be computed using the extended API of the + * nvcomp library. + */ +[[nodiscard]] bool is_decompression_scratch_size_ex_supported(compression_type compression); + /** * @brief Computes the uncompressed sizes of Snappy-compressed input data. 
* diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 2887072fce6..5dd78515bcf 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -16,9 +16,9 @@ #include "nvcomp_adapter.hpp" -#include "io/utilities/getenv_or.hpp" #include "nvcomp_adapter.cuh" +#include #include #include #include @@ -558,6 +558,115 @@ std::optional is_decompression_disabled_impl(compression_type compr return "Unsupported compression type"; } +#if NVCOMP_VER_MAJOR >= 5 +// Dispatcher for nvcompBatchedDecompressGetTempSizeSync +auto batched_decompress_get_temp_size_sync(compression_type compression, + void const* const* device_compressed_chunk_ptrs, + size_t const* device_compressed_chunk_bytes, + size_t num_chunks, + size_t max_uncompressed_chunk_bytes, + size_t* temp_bytes, + size_t max_total_uncompressed_bytes, + nvcompStatus_t* device_statuses, + cudaStream_t stream) +{ + switch (compression) { + case compression_type::SNAPPY: + return nvcompBatchedSnappyDecompressGetTempSizeSync(device_compressed_chunk_ptrs, + device_compressed_chunk_bytes, + num_chunks, + max_uncompressed_chunk_bytes, + temp_bytes, + max_total_uncompressed_bytes, + nvcompBatchedSnappyDecompressDefaultOpts, + device_statuses, + stream); + case compression_type::ZSTD: + return nvcompBatchedZstdDecompressGetTempSizeSync(device_compressed_chunk_ptrs, + device_compressed_chunk_bytes, + num_chunks, + max_uncompressed_chunk_bytes, + temp_bytes, + max_total_uncompressed_bytes, + nvcompBatchedZstdDecompressDefaultOpts, + device_statuses, + stream); + case compression_type::LZ4: + return nvcompBatchedLZ4DecompressGetTempSizeSync(device_compressed_chunk_ptrs, + device_compressed_chunk_bytes, + num_chunks, + max_uncompressed_chunk_bytes, + temp_bytes, + max_total_uncompressed_bytes, + nvcompBatchedLZ4DecompressDefaultOpts, + device_statuses, + stream); + case compression_type::DEFLATE: + return nvcompBatchedDeflateDecompressGetTempSizeSync( + device_compressed_chunk_ptrs, + device_compressed_chunk_bytes, + num_chunks, + max_uncompressed_chunk_bytes, + temp_bytes, + max_total_uncompressed_bytes, + nvcompBatchedDeflateDecompressDefaultOpts, + device_statuses, + stream); + case compression_type::GZIP: + return nvcompBatchedGzipDecompressGetTempSizeSync(device_compressed_chunk_ptrs, + device_compressed_chunk_bytes, + num_chunks, + max_uncompressed_chunk_bytes, + temp_bytes, + max_total_uncompressed_bytes, + nvcompBatchedGzipDecompressDefaultOpts, + device_statuses, + stream); + default: UNSUPPORTED_COMPRESSION(compression); + } +} + +#endif + +// Overload for internal use that takes device pointers and sizes directly +size_t batched_decompress_temp_size_ex(compression_type compression, + device_span input_data_ptrs, + device_span input_data_sizes, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size, + rmm::cuda_stream_view stream) +{ +#if NVCOMP_VER_MAJOR >= 5 + if (is_batched_decompress_temp_size_ex_supported(compression)) { + size_t temp_size = 0; + auto d_statuses = rmm::device_uvector(input_data_ptrs.size(), stream); + nvcompStatus_t const nvcomp_status = + batched_decompress_get_temp_size_sync(compression, + input_data_ptrs.data(), + input_data_sizes.data(), + input_data_ptrs.size(), + max_uncomp_chunk_size, + &temp_size, + max_total_uncomp_size, + d_statuses.data(), + stream.value()); + if (nvcomp_status == nvcompStatus_t::nvcompSuccess) { + auto const h_statuses = cudf::detail::make_host_vector(d_statuses, stream); + auto const are_all_success = + 
std::all_of(h_statuses.begin(), h_statuses.end(), [](nvcompStatus_t status) { + return status == nvcompStatus_t::nvcompSuccess; + }); + if (are_all_success) { return temp_size; } + } + CUDF_LOG_WARN( + "batched_decompress_get_temp_size_sync failed, falling back to batched_decompress_temp_size"); + } +#endif + // Fallback to the original batched decompress temp size calculation + return batched_decompress_temp_size( + compression, input_data_ptrs.size(), max_uncomp_chunk_size, max_total_uncomp_size); +} + } // namespace size_t batched_decompress_temp_size(compression_type compression, @@ -577,6 +686,27 @@ size_t batched_decompress_temp_size(compression_type compression, return temp_size; } +bool is_batched_decompress_temp_size_ex_supported(compression_type compression) +{ +#if NVCOMP_VER_MAJOR >= 5 + return compression == compression_type::ZSTD; +#else + return false; +#endif +} + +size_t batched_decompress_temp_size_ex(compression_type compression, + device_span const> inputs, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size, + rmm::cuda_stream_view stream) +{ + auto const [d_input_ptrs, d_input_sizes] = create_get_temp_size_args(inputs, stream); + + return batched_decompress_temp_size_ex( + compression, d_input_ptrs, d_input_sizes, max_uncomp_chunk_size, max_total_uncomp_size, stream); +} + void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, @@ -591,10 +721,16 @@ void batched_decompress(compression_type compression, auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); rmm::device_uvector actual_uncompressed_data_sizes(num_chunks, stream); rmm::device_uvector nvcomp_statuses(num_chunks, stream); + // Temporary space required for decompression - auto const temp_size = batched_decompress_temp_size( - compression, num_chunks, max_uncomp_chunk_size, max_total_uncomp_size); + auto const temp_size = batched_decompress_temp_size_ex(compression, + nvcomp_args.input_data_ptrs, + nvcomp_args.input_data_sizes, + max_uncomp_chunk_size, + max_total_uncomp_size, + stream); rmm::device_buffer scratch(temp_size, stream); + auto const nvcomp_status = batched_decompress_async(compression, #if NVCOMP_VER_MAJOR >= 5 use_hw_decompression(), diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index 8c32ffe6c77..f699b7214ba 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -60,6 +60,23 @@ batched_args create_batched_nvcomp_args(device_span c std::move(output_data_sizes)}; } +std::pair, rmm::device_uvector> create_get_temp_size_args( + device_span const> inputs, rmm::cuda_stream_view stream) +{ + rmm::device_uvector input_data_ptrs(inputs.size(), stream); + rmm::device_uvector input_data_sizes(inputs.size(), stream); + + auto ins_it = thrust::make_zip_iterator(input_data_ptrs.begin(), input_data_sizes.begin()); + thrust::transform( + rmm::exec_policy_nosync(stream), + inputs.begin(), + inputs.end(), + ins_it, + [] __device__(auto const& in) { return thrust::make_tuple(in.data(), in.size()); }); + + return {std::move(input_data_ptrs), std::move(input_data_sizes)}; +} + void update_compression_results(device_span nvcomp_stats, device_span actual_output_sizes, device_span results, diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh index dd0d9cf7843..a6222d2d4b4 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cuh +++ b/cpp/src/io/comp/nvcomp_adapter.cuh @@ -47,6 +47,12 @@ batched_args create_batched_nvcomp_args(device_span 
c device_span const> outputs, rmm::cuda_stream_view stream); +/** + * @brief Prepares device arrays of input pointers and sizes for use with nvCOMP temp size APIs. + */ +std::pair, rmm::device_uvector> create_get_temp_size_args( + device_span const> inputs, rmm::cuda_stream_view stream); + /** * @brief Convert nvcomp statuses and output sizes into cuIO compression results. */ diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 4e586234b94..bd50faee1de 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -19,6 +19,7 @@ #include "io/comp/compression.hpp" #include +#include #include #include @@ -63,6 +64,30 @@ size_t batched_decompress_temp_size(compression_type compression, size_t max_uncomp_chunk_size, size_t max_total_uncomp_size); +/** + * @brief Return the amount of temporary space required in bytes for a given decompression + * operation using synchronous nvcomp APIs. + * + * The size returned reflects the size of the scratch buffer to be passed to + * `batched_decompress_async`. This version uses the sync APIs which are more precise, but + * potentially require a kernel launch. + * + * @param[in] compression Compression type + * @param[in] inputs Device span of compressed data chunks + * @param[in] max_uncomp_chunk_size Maximum size of any single uncompressed chunk + * @param[in] max_total_uncomp_size Maximum total size of uncompressed data + * @param[in] stream CUDA stream to use + * @returns The total required size in bytes + */ +[[nodiscard]] size_t batched_decompress_temp_size_ex( + compression_type compression, + device_span const> inputs, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size, + rmm::cuda_stream_view stream); + +[[nodiscard]] bool is_batched_decompress_temp_size_ex_supported(compression_type compression); + /** * @brief Gets the maximum size any chunk could compress to in the batch. * diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 4b42ef36436..15e0ced9fa1 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -276,7 +276,11 @@ void reader_impl::setup_next_subpass(read_mode mode) // include scratch space needed for decompression. for certain codecs (eg ZSTD) this // can be considerable. 
- include_decompression_scratch_size(pass.chunks, pass.pages, c_info, _stream); + if (is_first_subpass) { + pass.decomp_scratch_sizes = + compute_decompression_scratch_sizes(pass.chunks, pass.pages, _stream); + } + include_decompression_scratch_size(pass.decomp_scratch_sizes, c_info, _stream); auto iter = thrust::make_counting_iterator(0); auto const pass_max_row = pass.skip_rows + pass.num_rows; diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 294eaf9ac16..660e51b38c8 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -133,6 +133,7 @@ struct pass_intermediate_data { rmm::device_uvector page_offsets{0, cudf::get_default_stream()}; rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; + rmm::device_uvector decomp_scratch_sizes{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; int level_type_size{0}; diff --git a/cpp/src/io/parquet/reader_impl_chunking_utils.cu b/cpp/src/io/parquet/reader_impl_chunking_utils.cu index 4ae82faf312..38fd132acf7 100644 --- a/cpp/src/io/parquet/reader_impl_chunking_utils.cu +++ b/cpp/src/io/parquet/reader_impl_chunking_utils.cu @@ -37,6 +37,8 @@ #include #include +#include +#include #include #include @@ -684,14 +686,11 @@ void detect_malformed_pages(device_span pages, } } -void include_decompression_scratch_size(device_span chunks, - device_span pages, - device_span c_info, - rmm::cuda_stream_view stream) +rmm::device_uvector compute_decompression_scratch_sizes( + device_span chunks, + device_span pages, + rmm::cuda_stream_view stream) { - CUDF_EXPECTS(pages.size() == c_info.size(), - "Encountered page/cumulative_page_info size mismatch"); - auto page_keys = make_page_key_iterator(pages); // per-codec page counts and decompression sizes @@ -712,17 +711,112 @@ void include_decompression_scratch_size(device_span chunk return cudf::io::detail::get_decompression_scratch_size(d); }); - // add to the cumulative_page_info data rmm::device_uvector d_temp_cost = cudf::detail::make_device_uvector_async( temp_cost, stream, cudf::get_current_device_resource_ref()); + + std::array codecs{compression_type::BROTLI, + compression_type::GZIP, + compression_type::LZ4, + compression_type::SNAPPY, + compression_type::ZSTD}; + for (auto const codec : codecs) { + if (cudf::io::detail::is_decompression_scratch_size_ex_supported(codec)) { + auto const total_decomp_info = thrust::transform_reduce( + rmm::exec_policy(stream), + decomp_iter, + decomp_iter + pages.size(), + cuda::proclaim_return_type( + [codec] __device__(decompression_info const& d) { + return d.type == codec ? 
d : decompression_info{codec, 0, 0, 0}; + }), + decompression_info{codec, 0, 0, 0}, + decomp_sum{}); + + // Collect pages with matching codecs + rmm::device_uvector> temp_spans(pages.size(), stream); + auto iter = thrust::make_counting_iterator(size_t{0}); + thrust::for_each( + rmm::exec_policy_nosync(stream), + iter, + iter + pages.size(), + [pages = pages.begin(), + chunks = chunks.begin(), + temp_spans = temp_spans.begin(), + codec] __device__(size_t i) { + auto const& page = pages[i]; + if (parquet_compression_support(chunks[page.chunk_idx].codec).first == codec) { + temp_spans[i] = {page.page_data, static_cast(page.compressed_page_size)}; + } else { + temp_spans[i] = {nullptr, 0}; // Mark pages with other codecs as empty + } + }); + // Copy only non-null spans + rmm::device_uvector> page_spans(pages.size(), stream); + auto end_iter = + thrust::copy_if(rmm::exec_policy_nosync(stream), + temp_spans.begin(), + temp_spans.end(), + page_spans.begin(), + [] __device__(auto const& span) { return span.data() != nullptr; }); + if (end_iter == page_spans.begin()) { + // No pages compressed with this codec, skip + continue; + } + page_spans.resize(end_iter - page_spans.begin(), stream); + + auto const total_temp_size = get_decompression_scratch_size(total_decomp_info); + auto const total_temp_size_ex = cudf::io::detail::get_decompression_scratch_size_ex( + total_decomp_info.type, + page_spans, + total_decomp_info.max_page_decompressed_size, + total_decomp_info.total_decompressed_size, + stream); + + // Make use of the extended API if it provides a more accurate estimate + if (total_temp_size_ex < total_temp_size) { + // The new extended API provides a more accurate (smaller) estimate than the legacy API. + // We cannot efficiently use the extended API to get per-page scratch sizes, so we adjust + // the per-page scratch sizes to on-average reflect the better estimate. This means that + // the scratch size might not be accurate for each page, but it will in aggregate. 
+        auto const adjustment_ratio = static_cast<double>(total_temp_size_ex) / total_temp_size;
+
+        // Apply the adjustment ratio to each page's temporary cost
+        thrust::for_each(rmm::exec_policy_nosync(stream),
+                         thrust::make_counting_iterator(size_t{0}),
+                         thrust::make_counting_iterator(pages.size()),
+                         [pages           = pages.begin(),
+                          chunks          = chunks.begin(),
+                          d_temp_cost_ptr = d_temp_cost.begin(),
+                          adjustment_ratio,
+                          codec] __device__(size_t i) {
+                           auto const page_codec =
+                             parquet_compression_support(chunks[pages[i].chunk_idx].codec).first;
+                           // Only adjust pages that use the current compression codec
+                           if (page_codec == codec) {
+                             auto const cost = d_temp_cost_ptr[i];
+                             // Scale down the cost and round up to ensure we don't underestimate
+                             auto const adjusted =
+                               static_cast<size_t>(cuda::std::ceil(cost * adjustment_ratio));
+                             d_temp_cost_ptr[i] = adjusted;
+                           }
+                         });
+      }
+    }
+  }
+  return d_temp_cost;
+}
+
+void include_decompression_scratch_size(device_span<size_t const> temp_cost,
+                                        device_span<cumulative_page_info> c_info,
+                                        rmm::cuda_stream_view stream)
+{
   auto iter = thrust::make_counting_iterator(size_t{0});
   thrust::for_each(rmm::exec_policy_nosync(stream),
                    iter,
-                   iter + pages.size(),
+                   iter + c_info.size(),
-                   [temp_cost = d_temp_cost.begin(), c_info = c_info.begin()] __device__(size_t i) {
+                   [temp_cost = temp_cost.begin(), c_info = c_info.begin()] __device__(size_t i) {
                      c_info[i].size_bytes += temp_cost[i];
                    });
-  stream.synchronize();
 }
 
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl_chunking_utils.cuh b/cpp/src/io/parquet/reader_impl_chunking_utils.cuh
index 6a8cd599953..1101c98c86d 100644
--- a/cpp/src/io/parquet/reader_impl_chunking_utils.cuh
+++ b/cpp/src/io/parquet/reader_impl_chunking_utils.cuh
@@ -227,12 +227,19 @@ void detect_malformed_pages(device_span<PageInfo const> pages,
                             std::optional<size_t> expected_row_count,
                             rmm::cuda_stream_view stream);
 
+/**
+ * @brief Computes the per-page scratch space required for decompression.
+ */
+rmm::device_uvector<size_t> compute_decompression_scratch_sizes(
+  device_span<ColumnChunkDesc const> chunks,
+  device_span<PageInfo const> pages,
+  rmm::cuda_stream_view stream);
+
 /**
  * @brief Add the cost of decompression codec scratch space to the per-page cumulative
  * size information
  */
-void include_decompression_scratch_size(device_span<ColumnChunkDesc const> chunks,
-                                        device_span<PageInfo const> pages,
+void include_decompression_scratch_size(device_span<size_t const> temp_cost,
                                         device_span<cumulative_page_info> c_info,
                                         rmm::cuda_stream_view stream);
 
From e431c40686cb32d698736f8beb38c52972b4741d Mon Sep 17 00:00:00 2001
From: Basit Ayantunde
Date: Mon, 18 Aug 2025 20:11:10 +0100
Subject: [PATCH 151/366] [FEA] Add Filter Benchmark (#19678)

This merge request implements a simple benchmark that compares the
performance of the AST path (`compute_column` + `apply_boolean_mask`)
against a UDF-based `cudf::filter`.
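
For reference (illustrative only; simplified from the benchmark source
below, which sets up `input_table`, `tree`, `filter_inputs`, and `udf`),
the two code paths being compared are:

    // AST path: materialize a boolean mask, then gather the passing rows
    auto mask   = cudf::compute_column(input_table, tree.back(), stream, mr);
    auto result = cudf::apply_boolean_mask(input_table, mask->view(), stream, mr);

    // JIT path: a runtime-compiled CUDA UDF evaluated by cudf::filter
    auto filtered = cudf::filter(filter_inputs, udf, false, std::nullopt,
                                 std::vector<bool>{true, false, false}, stream, mr);
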
Authors: - Basit Ayantunde (https://github.com/lamarrr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/19678 --- cpp/benchmarks/CMakeLists.txt | 5 + cpp/benchmarks/filter/minmax_filter.cpp | 169 ++++++++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 cpp/benchmarks/filter/minmax_filter.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index ec1fb42cdab..9aad6a21012 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -173,6 +173,11 @@ ConfigureNVBench(NDSH_Q06_NVBENCH ndsh/q06.cpp ndsh/utilities.cpp) ConfigureNVBench(NDSH_Q09_NVBENCH ndsh/q09.cpp ndsh/utilities.cpp) ConfigureNVBench(NDSH_Q10_NVBENCH ndsh/q10.cpp ndsh/utilities.cpp) +# ################################################################################################## +# * filter benchmark ------------------------------------------------------------------- + +ConfigureNVBench(FILTER_NVBENCH filter/minmax_filter.cpp) + # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( diff --git a/cpp/benchmarks/filter/minmax_filter.cpp b/cpp/benchmarks/filter/minmax_filter.cpp new file mode 100644 index 00000000000..069bfe67107 --- /dev/null +++ b/cpp/benchmarks/filter/minmax_filter.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +template +struct benchmark_data; + +template +struct benchmark_data { + static T dist_min() { return 0; } + + static T dist_max() { return 1; } + + static T filter_min() { return 0.05; } + + static T filter_max() { return 0.07; } +}; + +template +struct benchmark_data { + static T dist_min() { return -128; } + + static T dist_max() { return 127; } + + static T filter_min() { return -64; } + + static T filter_max() { return 64; } +}; + +enum class engine_type : uint8_t { AST = 0, JIT = 1 }; + +engine_type engine_from_string(std::string_view str) +{ + if (str == "ast") { + return engine_type::AST; + } else if (str == "jit") { + return engine_type::JIT; + } else { + CUDF_FAIL("unrecognized engine enum: " + std::string(str)); + } +} + +bool boolean_from_string(std::string_view str) +{ + if (str == "true") { + return true; + } else if (str == "false") { + return false; + } else { + CUDF_FAIL("unrecognized boolean value: " + std::string(str)); + } +} + +template +static void BM_filter_min_max(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const engine_name = state.get_string("engine"); + auto const nullable = boolean_from_string(state.get_string("nullable")); + auto const engine = engine_from_string(engine_name); + + auto profile = data_profile{}; + profile.set_distribution_params(cudf::type_to_id(), + distribution_id::NORMAL, + benchmark_data::dist_min(), + benchmark_data::dist_max()); + profile.set_null_probability(nullable ? std::optional{0.3} : std::nullopt); + + auto const column = + create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + std::string type_name = cudf::type_to_name(cudf::data_type{cudf::type_to_id()}); + + auto udf = std::format( + R"***( + __device__ void transform(bool * out, {0} c0, {0} min, {0} max) {{ + *out = (c0 >= min && c0 <= max); + }} + )***", + type_name); + + auto tree = cudf::ast::tree{}; + auto min_scalar = cudf::numeric_scalar{benchmark_data::filter_min()}; + auto max_scalar = cudf::numeric_scalar{benchmark_data::filter_max()}; + auto min_scalar_column = cudf::make_column_from_scalar(min_scalar, 1); + auto max_scalar_column = cudf::make_column_from_scalar(max_scalar, 1); + + { + auto& column_ref = tree.push(cudf::ast::column_reference{0}); + auto& min_literal = tree.push(cudf::ast::literal{min_scalar}); + auto& max_literal = tree.push(cudf::ast::literal{max_scalar}); + auto& filter_min = tree.push( + cudf::ast::operation{cudf::ast::ast_operator::GREATER_EQUAL, column_ref, min_literal}); + auto& filter_max = + tree.push(cudf::ast::operation{cudf::ast::ast_operator::LESS_EQUAL, column_ref, max_literal}); + tree.push(cudf::ast::operation{cudf::ast::ast_operator::LOGICAL_AND, filter_min, filter_max}); + } + + std::vector filter_inputs; + filter_inputs.push_back(column->view()); + filter_inputs.push_back(min_scalar_column->view()); + filter_inputs.push_back(max_scalar_column->view()); + + // Use the number of bytes read from global memory + state.add_global_memory_reads(static_cast(num_rows)); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto stream = launch.get_stream().get_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + switch (engine) { + case engine_type::AST: { + auto input_table = cudf::table_view{{column->view()}}; + auto const filter_boolean = 
cudf::compute_column(input_table, tree.back(), stream, mr); + auto const result = + cudf::apply_boolean_mask(input_table, filter_boolean->view(), stream, mr); + } break; + case engine_type::JIT: { + auto result = cudf::filter( + filter_inputs, udf, false, std::nullopt, std::vector{true, false, false}, stream, mr); + } break; + default: CUDF_UNREACHABLE("Unrecognised engine type requested"); + } + }); +} + +#define FILTER_BENCHMARK_DEFINE(name, key_type) \ + static void name(::nvbench::state& st) { ::BM_filter_min_max(st); } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_string_axis("engine", {"ast", "jit"}) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_string_axis("nullable", {"true", "false"}) + +FILTER_BENCHMARK_DEFINE(filter_min_max_int32, int32_t); +FILTER_BENCHMARK_DEFINE(filter_min_max_int64, int64_t); +FILTER_BENCHMARK_DEFINE(filter_min_max_float32, float); +FILTER_BENCHMARK_DEFINE(filter_min_max_float64, double); From 36a7519b00ae47eb0222f5f18dca173152082a2f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 18 Aug 2025 13:34:59 -0700 Subject: [PATCH 152/366] Correctly decode boolean lists in chunked parquet reader (#19707) Closes #19702 This PR adds crucial bug fixes to correctly decode boolean lists in the chunked parquet reader. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Bradley Dice (https://github.com/bdice) - Paul Mattione (https://github.com/pmattione-nvidia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/19707 --- cpp/src/io/parquet/decode_fixed.cu | 41 ++++----- cpp/src/io/parquet/reader_impl_preprocess.cu | 6 +- cpp/tests/io/parquet_chunked_reader_test.cu | 95 +++++++++++++++++++- cpp/tests/io/parquet_common.cpp | 6 +- 4 files changed, 119 insertions(+), 29 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 077f9f80715..051dd79b5db 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -894,32 +894,27 @@ __device__ inline bool maybe_has_nulls(page_state_s* s) return run_val != s->col.max_level[lvl]; } -template -inline __device__ void bool_plain_decode(page_state_s* s, state_buf* sb, int t, int to_decode) +template +inline __device__ void bool_plain_decode(page_state_s* s, + state_buf* sb, + int target_pos, + thread_group const& group) { - int pos = s->dict_pos; - int const target_pos = pos + to_decode; - __syncthreads(); // Make sure all threads have read dict_pos before it changes at the end. 
+ int const pos = s->dict_pos; + int const t = group.thread_rank(); + // Ensure all threads have the dict_pos + group.sync(); - while (pos < target_pos) { - int const batch_len = min(target_pos - pos, decode_block_size_t); + for (auto bit_pos = pos + t; bit_pos < target_pos; bit_pos += group.size()) { + int const byte_offset = bit_pos >> 3; + int const bit_in_byte_index = bit_pos & 7; - if (t < batch_len) { - int const bit_pos = pos + t; - int const byte_offset = bit_pos >> 3; - int const bit_in_byte_index = bit_pos & 7; + uint8_t const* const read_from = s->data_start + byte_offset; + bool const read_bit = (*read_from) & (1 << bit_in_byte_index); - uint8_t const* const read_from = s->data_start + byte_offset; - bool const read_bit = (*read_from) & (1 << bit_in_byte_index); - - int const write_to_index = rolling_index(bit_pos); - sb->dict_idx[write_to_index] = read_bit; - } - - pos += batch_len; + int const write_to_index = rolling_index(bit_pos); + sb->dict_idx[write_to_index] = read_bit; } - - if (t == 0) { s->dict_pos = pos; } } template @@ -1235,7 +1230,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) if (bools_are_rle_stream) { bool_stream.decode_next(t, next_valid_count - valid_count); } else { - bool_plain_decode(s, sb, t, next_valid_count - valid_count); + auto const target_pos = next_valid_count + skipped_leaf_values; + bool_plain_decode(s, sb, target_pos, block); + if (t == 0) { s->dict_pos = target_pos; } } block.sync(); } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 87fb1950017..1c5c1f2641c 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -606,9 +606,9 @@ void reader_impl::preprocess_subpass_pages(read_mode mode, size_t chunk_read_lim // subpass since we know that will safely completed. bool const is_list = last_chunk.max_level[level_type::REPETITION] > 0; // corner case: only decode up to the second-to-last row, except if this is the last page in the - // entire pass. this handles the case where we only have 1 chunk, 1 page, and potentially even - // just 1 row. - if (is_list && max_col_row < last_pass_row) { + // entire pass or if we have the page index. this handles the case where we only have 1 chunk, 1 + // page, and potentially even just 1 row. 
+ if (is_list and std::cmp_less(max_col_row, last_pass_row) and not _has_page_index) { // compute min row for this column in the subpass auto const& first_page = subpass.pages[first_page_index]; auto const& first_chunk = pass.chunks[first_page.chunk_idx]; diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index 52d277de869..0dbccc18230 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -1907,6 +1907,97 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceEmptyTable) std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); } +TEST_F(ParquetReaderTest, BooleanList) +{ + std::mt19937 gen(0xcaffe); + + // Parquet file + auto const parquet_filepath = temp_env->get_temp_filepath("BooleanList.parquet"); + { + auto constexpr num_rows = num_ordered_rows; + + // Validity helpers + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto list_valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 100; }); + + // str(non-nullable) + auto col0 = testdata::ascending(); + + // list(nullable) + auto bools_iter = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + auto bools_col = + cudf::test::fixed_width_column_wrapper(bools_iter, bools_iter + num_rows, valids); + auto offsets_iter = thrust::counting_iterator(0); + auto offsets_col = cudf::test::fixed_width_column_wrapper( + offsets_iter, offsets_iter + num_rows + 1); + auto [null_mask, null_count] = + cudf::test::detail::make_null_mask(list_valids, list_valids + num_rows); + auto _col1 = cudf::make_lists_column( + num_rows, offsets_col.release(), bools_col.release(), null_count, std::move(null_mask)); + auto col1 = cudf::purge_nonempty_nulls(*_col1); + + // list(nullable)>(nullable) + auto col2 = make_parquet_list_list_col(0, num_rows, 5, 8, true); + + // Input table + auto constexpr num_concat = 3; + auto table = cudf::table_view{{col0, *col1, *col2}}; + auto expected = cudf::concatenate(std::vector(num_concat, table)); + table = expected->view(); + // Write to parquet buffer + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{parquet_filepath}, table) + // Note: The following writer options have been chosen to work with the chunked reader's + // empirical memory limits. We may need to adjust them in the future. + .row_group_size_rows(num_rows) + .max_page_size_rows(page_size_for_ordered_tests) + .compression(cudf::io::compression_type::SNAPPY) + .dictionary_policy(cudf::io::dictionary_policy::ALWAYS); + + cudf::io::write_parquet(out_opts); + } + + // Parquet reader options + cudf::io::parquet_reader_options const options = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(parquet_filepath)); + + // Read parquet using the chunked parquet reader + auto const read = [&]() { + // Note that this test relies on these empirically chosen memory limits to have the chunked + // reader create subpasses such that we end a subpass at `page.num_rows - 1` and let next + // subpass read that last row from that page. We may need to adjust these limits in the future + // to hit this edge case in chunking. 
+ auto constexpr input_mem_limit = 102400; + auto constexpr output_mem_limit = 0; + auto reader = cudf::io::chunked_parquet_reader(output_mem_limit, input_mem_limit, options); + + auto table_chunks = std::vector>(); + while (reader.has_next()) { + auto chunk = reader.read_chunk(); + table_chunks.push_back(std::move(chunk.tbl)); + } + + // Commented as future changes to chunking logic may change the number of table_chunks + // EXPECT_GT(table_chunks.size(), 1); + + auto out_tviews = std::vector{}; + for (auto const& tbl : table_chunks) { + out_tviews.emplace_back(tbl->view()); + } + + return cudf::concatenate(out_tviews); + }(); + + // Read using the main parquet reader + auto const expected = cudf::io::read_parquet(options).tbl; + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), read->view()); +} + TEST_F(ParquetReaderTest, ManyLargeLists) { auto const stream = cudf::get_default_stream(); @@ -1968,8 +2059,8 @@ TEST_F(ParquetReaderTest, ManyLargeLists) EXPECT_NO_THROW(std::ignore = reader.read_chunk()); num_chunks++; } - // We will end up with exactly two chunks as the total number of leaf rows is just above 2B rows - // per table chunk limit and we haven't set any chunk or pass read limits + // We will end up with exactly two chunks as the total number of leaf rows is just above 2B + // rows per table chunk limit and we haven't set any chunk or pass read limits EXPECT_EQ(num_chunks, 2); }; diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index f0be469e5ed..7fd9f474723 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -139,8 +139,8 @@ std::unique_ptr make_parquet_list_list_col( // child values std::vector child_values(num_rows * lists_per_row * list_size); - T first_child_value_index = skip_rows * lists_per_row * list_size; - int child_value_count = 0; + auto first_child_value_index = skip_rows * lists_per_row * list_size; + int child_value_count = 0; { for (int idx = 0; idx < (num_rows * lists_per_row * list_size); idx++) { int row_index = idx / (lists_per_row * list_size); @@ -178,6 +178,8 @@ std::unique_ptr make_parquet_list_list_col( template std::unique_ptr make_parquet_list_list_col( int skip_rows, int num_rows, int lists_per_row, int list_size, bool include_validity); +template std::unique_ptr make_parquet_list_list_col( + int skip_rows, int num_rows, int lists_per_row, int list_size, bool include_validity); template std::vector random_values(size_t size) From 15031459759977f6a94feecfb37ad22706b0ba0a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Aug 2025 16:09:59 -0700 Subject: [PATCH 153/366] Add streams to reshape (#19728) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19728 --- python/pylibcudf/pylibcudf/libcudf/reshape.pxd | 7 +++++-- python/pylibcudf/pylibcudf/reshape.pxd | 4 ++-- python/pylibcudf/pylibcudf/reshape.pyi | 10 +++++++--- python/pylibcudf/pylibcudf/reshape.pyx | 18 ++++++++++++------ 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/reshape.pxd b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd index 84fcd7cdaa8..f5024648137 100644 --- a/python/pylibcudf/pylibcudf/libcudf/reshape.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd @@ -16,10 +16,13 @@ cdef extern from "cuda/functional" namespace "cuda::std": cdef extern from "cudf/reshape.hpp" namespace "cudf" nogil: cdef unique_ptr[column] 
interleave_columns( - table_view source_table + table_view source_table, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[table] tile( - table_view source_table, size_type count + table_view source_table, + size_type count, + cuda_stream_view stream ) except +libcudf_exception_handler cdef void table_to_array( table_view input_table, diff --git a/python/pylibcudf/pylibcudf/reshape.pxd b/python/pylibcudf/pylibcudf/reshape.pxd index efb7217f0e8..ba61a06a7b8 100644 --- a/python/pylibcudf/pylibcudf/reshape.pxd +++ b/python/pylibcudf/pylibcudf/reshape.pxd @@ -14,8 +14,8 @@ from .table cimport Table from .types cimport DataType -cpdef Column interleave_columns(Table source_table) -cpdef Table tile(Table source_table, size_type count) +cpdef Column interleave_columns(Table source_table, Stream stream=*) +cpdef Table tile(Table source_table, size_type count, Stream stream=*) cpdef void table_to_array( Table input_table, uintptr_t ptr, diff --git a/python/pylibcudf/pylibcudf/reshape.pyi b/python/pylibcudf/pylibcudf/reshape.pyi index 8f2e33903af..4d8d0c16ded 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyi +++ b/python/pylibcudf/pylibcudf/reshape.pyi @@ -1,12 +1,16 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.table import Table -def interleave_columns(source_table: Table) -> Column: ... -def tile(source_table: Table, count: int) -> Table: ... +def interleave_columns( + source_table: Table, stream: Stream | None = None +) -> Column: ... +def tile( + source_table: Table, count: int, stream: Stream | None = None +) -> Table: ... def table_to_array( input_table: Table, ptr: int, diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index 0ebe61af713..a8ffa348eef 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -25,7 +25,7 @@ from .utils cimport _get_stream __all__ = ["interleave_columns", "tile", "table_to_array"] -cpdef Column interleave_columns(Table source_table): +cpdef Column interleave_columns(Table source_table, Stream stream=None): """Interleave columns of a table into a single column. Converts the column major table `input` into a row major column. @@ -40,6 +40,8 @@ cpdef Column interleave_columns(Table source_table): ---------- source_table: Table The input table to interleave + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -47,14 +49,15 @@ cpdef Column interleave_columns(Table source_table): A new column which is the result of interleaving the input columns """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_interleave_columns(source_table.view()) + c_result = cpp_interleave_columns(source_table.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Table tile(Table source_table, size_type count): +cpdef Table tile(Table source_table, size_type count, Stream stream=None): """Repeats the rows from input table count times to form a new table. For details, see :cpp:func:`tile`. @@ -65,6 +68,8 @@ cpdef Table tile(Table source_table, size_type count): The input table containing rows to be repeated count: size_type The number of times to tile "rows". Must be non-negative + stream : Stream | None + CUDA stream on which to perform the operation. 
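+        If `None`, the default CUDA stream is used.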
Returns ------- @@ -72,11 +77,12 @@ cpdef Table tile(Table source_table, size_type count): The table containing the tiled "rows" """ cdef unique_ptr[table] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_tile(source_table.view(), count) + c_result = cpp_tile(source_table.view(), count, stream.view()) - return Table.from_libcudf(move(c_result)) + return Table.from_libcudf(move(c_result), stream) cpdef void table_to_array( From 7fbce137570ff1a66b112f4c96e29e4c286e32c5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Aug 2025 16:10:06 -0700 Subject: [PATCH 154/366] Add streams to null mask APIs (#19727) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19727 --- .../pylibcudf/pylibcudf/libcudf/null_mask.pxd | 16 +++-- python/pylibcudf/pylibcudf/null_mask.pxd | 16 +++-- python/pylibcudf/pylibcudf/null_mask.pyi | 23 +++++-- python/pylibcudf/pylibcudf/null_mask.pyx | 60 ++++++++++++++----- 4 files changed, 83 insertions(+), 32 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 5b49ddc3bbe..bb53d5716d8 100644 --- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libc.stdint cimport int32_t from libcpp.pair cimport pair from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -7,11 +7,13 @@ from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type from rmm.librmm.device_buffer cimport device_buffer +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: cdef device_buffer copy_bitmask "cudf::copy_bitmask" ( - column_view view + column_view view, + cuda_stream_view stream ) except +libcudf_exception_handler cdef size_t bitmask_allocation_size_bytes ( @@ -25,19 +27,23 @@ cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: cdef device_buffer create_null_mask ( size_type size, - mask_state state + mask_state state, + cuda_stream_view stream ) except +libcudf_exception_handler cdef pair[device_buffer, size_type] bitmask_and( - table_view view + table_view view, + cuda_stream_view stream ) cdef pair[device_buffer, size_type] bitmask_or( - table_view view + table_view view, + cuda_stream_view stream ) cdef size_type null_count( const bitmask_type * bitmask, size_type start, size_type stop, + cuda_stream_view stream ) diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd index bd6969cc415..96aa80b4b4f 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -5,18 +5,24 @@ from pylibcudf.libcudf.types cimport mask_state, size_type from pylibcudf.gpumemoryview cimport gpumemoryview from rmm.pylibrmm.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.stream cimport Stream from .column cimport Column -cpdef DeviceBuffer copy_bitmask(Column col) +cpdef DeviceBuffer copy_bitmask(Column col, Stream stream=*) cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits) -cpdef DeviceBuffer create_null_mask(size_type size, mask_state state = *) +cpdef DeviceBuffer create_null_mask(size_type size, mask_state state=*, Stream stream=*) -cpdef tuple 
bitmask_and(list columns) +cpdef tuple bitmask_and(list columns, Stream stream=*) -cpdef tuple bitmask_or(list columns) +cpdef tuple bitmask_or(list columns, Stream stream=*) -cpdef size_type null_count(gpumemoryview bitmask, size_type start, size_type stop) +cpdef size_type null_count( + gpumemoryview bitmask, + size_type start, + size_type stop, + Stream stream=* +) diff --git a/python/pylibcudf/pylibcudf/null_mask.pyi b/python/pylibcudf/pylibcudf/null_mask.pyi index 524b3d432b2..d3e6d9e8642 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyi +++ b/python/pylibcudf/pylibcudf/null_mask.pyi @@ -1,16 +1,27 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from rmm.pylibrmm.device_buffer import DeviceBuffer +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.gpumemoryview import gpumemoryview from pylibcudf.types import MaskState -def copy_bitmask(col: Column) -> DeviceBuffer: ... +def copy_bitmask( + col: Column, stream: Stream | None = None +) -> DeviceBuffer: ... def bitmask_allocation_size_bytes(number_of_bits: int) -> int: ... def create_null_mask( - size: int, state: MaskState = MaskState.UNINITIALIZED + size: int, + state: MaskState = MaskState.UNINITIALIZED, + stream: Stream | None = None, ) -> DeviceBuffer: ... -def bitmask_and(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... -def bitmask_or(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... -def null_count(bitmask: gpumemoryview, start: int, stop: int) -> int: ... +def bitmask_and( + columns: list[Column], stream: Stream | None = None +) -> tuple[DeviceBuffer, int]: ... +def bitmask_or( + columns: list[Column], stream: Stream | None = None +) -> tuple[DeviceBuffer, int]: ... +def null_count( + bitmask: gpumemoryview, start: int, stop: int, stream: Stream | None = None +) -> int: ... diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 8a38563481a..c06ce424f26 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -8,11 +8,13 @@ from pylibcudf.gpumemoryview cimport gpumemoryview from rmm.librmm.device_buffer cimport device_buffer from rmm.pylibrmm.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.stream cimport Stream from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint from .column cimport Column from .table cimport Table +from .utils cimport _get_stream __all__ = [ "bitmask_allocation_size_bytes", @@ -23,11 +25,11 @@ __all__ = [ "null_count", ] -cdef DeviceBuffer buffer_to_python(device_buffer buf): - return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf))) +cdef DeviceBuffer buffer_to_python(device_buffer buf, Stream stream): + return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf)), stream) -cpdef DeviceBuffer copy_bitmask(Column col): +cpdef DeviceBuffer copy_bitmask(Column col, Stream stream=None): """Copies ``col``'s bitmask into a ``DeviceBuffer``. For details, see :cpp:func:`copy_bitmask`. @@ -36,6 +38,8 @@ cpdef DeviceBuffer copy_bitmask(Column col): ---------- col : Column Column whose bitmask needs to be copied + stream : Stream | None + CUDA stream on which to perform the operation. 
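+        If `None`, the default CUDA stream is used.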
Returns ------- @@ -44,11 +48,12 @@ cpdef DeviceBuffer copy_bitmask(Column col): ``DeviceBuffer`` if ``col`` is not nullable """ cdef device_buffer db + stream = _get_stream(stream) with nogil: - db = cpp_null_mask.copy_bitmask(col.view()) + db = cpp_null_mask.copy_bitmask(col.view(), stream.view()) - return buffer_to_python(move(db)) + return buffer_to_python(move(db), stream) cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits): """ @@ -73,7 +78,8 @@ cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits): cpdef DeviceBuffer create_null_mask( size_type size, - mask_state state = mask_state.UNINITIALIZED + mask_state state = mask_state.UNINITIALIZED, + Stream stream=None ): """Creates a ``DeviceBuffer`` for use as a null value indicator bitmask of a ``Column``. @@ -88,6 +94,8 @@ cpdef DeviceBuffer create_null_mask( The desired state of the mask. Can be one of { MaskState.UNALLOCATED, MaskState.UNINITIALIZED, MaskState.ALL_VALID, MaskState.ALL_NULL } (default MaskState.UNINITIALIZED) + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -96,14 +104,15 @@ cpdef DeviceBuffer create_null_mask( state """ cdef device_buffer db + stream = _get_stream(stream) with nogil: - db = cpp_null_mask.create_null_mask(size, state) + db = cpp_null_mask.create_null_mask(size, state, stream.view()) - return buffer_to_python(move(db)) + return buffer_to_python(move(db), stream) -cpdef tuple bitmask_and(list columns): +cpdef tuple bitmask_and(list columns, Stream stream=None): """Performs bitwise AND of the bitmasks of a list of columns. For details, see :cpp:func:`bitmask_and`. @@ -112,6 +121,8 @@ cpdef tuple bitmask_and(list columns): ---------- columns : list The list of columns + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -120,14 +131,15 @@ cpdef tuple bitmask_and(list columns): """ cdef Table c_table = Table(columns) cdef pair[device_buffer, size_type] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_null_mask.bitmask_and(c_table.view()) + c_result = cpp_null_mask.bitmask_and(c_table.view(), stream.view()) - return buffer_to_python(move(c_result.first)), c_result.second + return buffer_to_python(move(c_result.first), stream), c_result.second -cpdef tuple bitmask_or(list columns): +cpdef tuple bitmask_or(list columns, Stream stream=None): """Performs bitwise OR of the bitmasks of a list of columns. For details, see :cpp:func:`bitmask_or`. @@ -136,6 +148,8 @@ cpdef tuple bitmask_or(list columns): ---------- columns : list The list of columns + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -144,14 +158,20 @@ cpdef tuple bitmask_or(list columns): """ cdef Table c_table = Table(columns) cdef pair[device_buffer, size_type] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_null_mask.bitmask_or(c_table.view()) + c_result = cpp_null_mask.bitmask_or(c_table.view(), stream.view()) - return buffer_to_python(move(c_result.first)), c_result.second + return buffer_to_python(move(c_result.first), stream), c_result.second -cpdef size_type null_count(gpumemoryview bitmask, size_type start, size_type stop): +cpdef size_type null_count( + gpumemoryview bitmask, + size_type start, + size_type stop, + Stream stream=None +): """Given a validity bitmask, counts the number of null elements. For details, see :cpp:func:`null_count`. 
@@ -164,11 +184,19 @@ cpdef size_type null_count(gpumemoryview bitmask, size_type start, size_type sto Index of the first bit to count (inclusive). stop : int Index of the last bit to count (exclusive). + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- int The number of null elements in the specified range. """ + stream = _get_stream(stream) with nogil: - return cpp_null_mask.null_count((bitmask.ptr), start, stop) + return cpp_null_mask.null_count( + (bitmask.ptr), + start, + stop, + stream.view() + ) From 79ec4f28e676ce4717f3116dc9997c6278a15c1c Mon Sep 17 00:00:00 2001 From: Basit Ayantunde Date: Tue, 19 Aug 2025 00:15:40 +0100 Subject: [PATCH 155/366] [FEA] Implement null-aware transforms and filters (#19502) This merge request implements null-aware transforms where the user can decide the nullness of the output and predicate on the nullness of the inputs. It also implements null-aware filters to allow UDFs to predicate on the null-ness of the inputs. This enables operators like `IS_NULL` and `NULL_EQUAL`. Follows-up: https://github.com/rapidsai/cudf/issues/18023 Closes: https://github.com/rapidsai/cudf/issues/18820 Authors: - Basit Ayantunde (https://github.com/lamarrr) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19502 --- cpp/benchmarks/ndsh/q09.cpp | 1 + cpp/benchmarks/transform/polynomials.cpp | 1 + cpp/benchmarks/transform/transform.cpp | 1 + .../compute_checksum_jit.cpp | 10 +- .../string_transforms/extract_email_jit.cpp | 10 +- .../string_transforms/format_phone_jit.cpp | 1 + .../string_transforms/localize_phone_jit.cpp | 1 + cpp/include/cudf/stream_compaction.hpp | 2 + cpp/include/cudf/transform.hpp | 2 + cpp/include/cudf/types.hpp | 8 +- cpp/src/jit/accessors.cuh | 50 +++---- cpp/src/jit/helpers.hpp | 2 + cpp/src/jit/span.cuh | 14 ++ cpp/src/stream_compaction/filter/filter.cu | 14 +- .../stream_compaction/filter/jit/kernel.cu | 20 ++- cpp/src/transform/jit/kernel.cu | 61 ++++++--- cpp/src/transform/transform.cpp | 38 +++++- cpp/tests/filter/filter_test.cpp | 60 ++++++--- cpp/tests/streams/transform_test.cpp | 1 + .../integration/unary_transform_test.cpp | 124 +++++++++++++++--- .../pylibcudf/pylibcudf/libcudf/transform.pxd | 3 +- python/pylibcudf/pylibcudf/libcudf/types.pxd | 4 + python/pylibcudf/pylibcudf/transform.pxd | 3 +- python/pylibcudf/pylibcudf/transform.pyi | 3 +- python/pylibcudf/pylibcudf/transform.pyx | 8 +- python/pylibcudf/pylibcudf/types.pxd | 1 + python/pylibcudf/pylibcudf/types.pyi | 4 + python/pylibcudf/pylibcudf/types.pyx | 3 + python/pylibcudf/tests/test_transform.py | 1 + 29 files changed, 349 insertions(+), 102 deletions(-) diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 15dbcbd485e..28063533f16 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -161,6 +161,7 @@ struct q9_data { cudf::data_type{cudf::type_id::FLOAT64}, false, std::nullopt, + cudf::null_aware::NO, stream, mr); } diff --git a/cpp/benchmarks/transform/polynomials.cpp b/cpp/benchmarks/transform/polynomials.cpp index a3e09535fa3..5b03ae1dfbe 100644 --- a/cpp/benchmarks/transform/polynomials.cpp +++ b/cpp/benchmarks/transform/polynomials.cpp @@ -95,6 +95,7 @@ static void BM_transform_polynomials(nvbench::state& state) cudf::data_type{cudf::type_to_id()}, false, std::nullopt, + cudf::null_aware::NO, launch.get_stream().get_stream()); }); } diff --git 
a/cpp/benchmarks/transform/transform.cpp b/cpp/benchmarks/transform/transform.cpp index f487f191d94..a404a9e9582 100644 --- a/cpp/benchmarks/transform/transform.cpp +++ b/cpp/benchmarks/transform/transform.cpp @@ -101,6 +101,7 @@ static void BM_transform(nvbench::state& state) cudf::data_type{cudf::type_to_id()}, false, std::nullopt, + cudf::null_aware::NO, launch.get_stream().get_stream()); }); } diff --git a/cpp/examples/string_transforms/compute_checksum_jit.cpp b/cpp/examples/string_transforms/compute_checksum_jit.cpp index f1fbc289406..e23729a930d 100644 --- a/cpp/examples/string_transforms/compute_checksum_jit.cpp +++ b/cpp/examples/string_transforms/compute_checksum_jit.cpp @@ -49,8 +49,14 @@ std::tuple, std::vector> transform( auto name = table.column(0); auto email = table.column(1); - auto result = cudf::transform( - {name, email}, udf, cudf::data_type{cudf::type_id::UINT16}, false, std::nullopt, stream, mr); + auto result = cudf::transform({name, email}, + udf, + cudf::data_type{cudf::type_id::UINT16}, + false, + std::nullopt, + cudf::null_aware::NO, + stream, + mr); return std::make_tuple(std::move(result), transformed); } diff --git a/cpp/examples/string_transforms/extract_email_jit.cpp b/cpp/examples/string_transforms/extract_email_jit.cpp index 7a20946dcd5..686dc5814b3 100644 --- a/cpp/examples/string_transforms/extract_email_jit.cpp +++ b/cpp/examples/string_transforms/extract_email_jit.cpp @@ -64,8 +64,14 @@ __device__ void email_provider(cudf::string_view* out, auto transformed = std::vector{1}; auto emails = table.column(1); - auto providers = cudf::transform( - {emails, *alt}, udf, cudf::data_type{cudf::type_id::STRING}, false, std::nullopt, stream, mr); + auto providers = cudf::transform({emails, *alt}, + udf, + cudf::data_type{cudf::type_id::STRING}, + false, + std::nullopt, + cudf::null_aware::NO, + stream, + mr); return {std::move(providers), std::move(transformed)}; } diff --git a/cpp/examples/string_transforms/format_phone_jit.cpp b/cpp/examples/string_transforms/format_phone_jit.cpp index 680d82a489b..dfbf106fec4 100644 --- a/cpp/examples/string_transforms/format_phone_jit.cpp +++ b/cpp/examples/string_transforms/format_phone_jit.cpp @@ -132,6 +132,7 @@ __device__ void e164_format(void* scratch, cudf::data_type{cudf::type_id::STRING}, false, scratch.data(), + cudf::null_aware::NO, stream, mr); diff --git a/cpp/examples/string_transforms/localize_phone_jit.cpp b/cpp/examples/string_transforms/localize_phone_jit.cpp index 1c065ca11f0..cd720ec47d0 100644 --- a/cpp/examples/string_transforms/localize_phone_jit.cpp +++ b/cpp/examples/string_transforms/localize_phone_jit.cpp @@ -155,6 +155,7 @@ __device__ void format_phone(void* scratch, cudf::data_type{cudf::type_id::STRING}, false, scratch.data(), + cudf::null_aware::NO, stream, mr); diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index a956905b6d6..8ac3d7bb2e2 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -451,6 +451,7 @@ cudf::size_type distinct_count(table_view const& input, * @param user_data User-defined device data to pass to the UDF. * @param copy_mask Optional vector of booleans indicating which columns to copy from the input * columns to the output. If not provided, all columns are copied. 
+ * @param is_null_aware Signifies the UDF will receive row inputs as optional values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The filtered target columns @@ -461,6 +462,7 @@ std::vector> filter( bool is_ptx, std::optional user_data = std::nullopt, std::optional> copy_mask = std::nullopt, + null_aware is_null_aware = null_aware::NO, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index 49bf3114778..c49392c92aa 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -56,6 +56,7 @@ namespace CUDF_EXPORT cudf { * @param output_type The output type that is compatible with the output type in the UDF * @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code * @param user_data User-defined device data to pass to the UDF. + * @param is_null_aware Signifies the UDF will receive row inputs as optional values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The column resulting from applying the transform function to @@ -67,6 +68,7 @@ std::unique_ptr transform( data_type output_type, bool is_ptx, std::optional user_data = std::nullopt, + null_aware is_null_aware = null_aware::NO, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 9443bd5cb52..408594bf080 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -234,6 +234,12 @@ enum class type_id : int32_t { NUM_TYPE_IDS ///< Total number of type ids }; +/// @brief Indicates whether a function is null-aware or not. +enum class null_aware : bool { + NO = 0, ///< The function is not null-aware + YES = 1 ///< The function is null-aware +}; + /** * @brief Indicator for the logical data type of an element in a column. 
* diff --git a/cpp/src/jit/accessors.cuh b/cpp/src/jit/accessors.cuh index adfb9ea7328..a8e68d49e44 100644 --- a/cpp/src/jit/accessors.cuh +++ b/cpp/src/jit/accessors.cuh @@ -19,6 +19,7 @@ #include #include +#include #include @@ -32,16 +33,10 @@ struct column_accessor { using type = T; static constexpr int32_t index = Index; - static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* outputs, - cudf::size_type row) + template + static __device__ decltype(auto) element(ColumnView const* columns, cudf::size_type row) { - return outputs[index].element(row); - } - - static __device__ decltype(auto) element(cudf::column_device_view_core const* inputs, - cudf::size_type row) - { - return inputs[index].element(row); + return columns[index].template element(row); } static __device__ void assign(cudf::mutable_column_device_view_core const* outputs, @@ -51,15 +46,18 @@ struct column_accessor { outputs[index].assign(row, value); } - static __device__ bool is_null(cudf::column_device_view_core const* inputs, cudf::size_type row) + template + static __device__ bool is_null(ColumnView const* inputs, cudf::size_type row) { return inputs[index].is_null(row); } - static __device__ bool is_null(cudf::mutable_column_device_view_core const* inputs, - cudf::size_type row) + template + static __device__ cuda::std::optional nullable_element(ColumnView const* outputs, + cudf::size_type row) { - return inputs[index].is_null(row); + if (is_null(outputs, row)) { return cuda::std::nullopt; } + return outputs[index].template element(row); } }; @@ -86,6 +84,13 @@ struct span_accessor { { return inputs[index].is_null(row); } + + static __device__ cuda::std::optional nullable_element( + cudf::jit::device_optional_span const* outputs, cudf::size_type row) + { + if (is_null(outputs, row)) { return cuda::std::nullopt; } + return outputs[index].element(row); + } }; template @@ -93,18 +98,12 @@ struct scalar_accessor { using type = typename Accessor::type; static constexpr int32_t index = Accessor::index; - static __device__ decltype(auto) element(cudf::mutable_column_device_view_core const* outputs, - cudf::size_type) + template + static __device__ decltype(auto) element(ColumnView const* outputs, cudf::size_type) { return Accessor::element(outputs, 0); } - static __device__ decltype(auto) element(cudf::column_device_view_core const* inputs, - cudf::size_type) - { - return Accessor::element(inputs, 0); - } - static __device__ void assign(cudf::mutable_column_device_view_core const* outputs, cudf::size_type, type value) @@ -112,15 +111,16 @@ struct scalar_accessor { return Accessor::assign(outputs, 0, value); } - static __device__ bool is_null(cudf::column_device_view_core const* inputs, cudf::size_type) + template + static __device__ bool is_null(ColumnView const* inputs, cudf::size_type) { return Accessor::is_null(inputs, 0); } - static __device__ bool is_null(cudf::mutable_column_device_view_core const* inputs, - cudf::size_type) + template + static __device__ decltype(auto) nullable_element(ColumnView const* outputs, cudf::size_type) { - return Accessor::is_null(inputs, 0); + return Accessor::nullable_element(outputs, 0); } }; diff --git a/cpp/src/jit/helpers.hpp b/cpp/src/jit/helpers.hpp index 65e28e9c39a..e8aa089fa3c 100644 --- a/cpp/src/jit/helpers.hpp +++ b/cpp/src/jit/helpers.hpp @@ -48,6 +48,7 @@ struct input_column_reflection { jitify2::StringVec build_jit_template_params( bool has_user_data, + null_aware is_null_aware, std::vector const& span_outputs, std::vector const& column_outputs, 
  std::vector const& column_inputs)
@@ -55,6 +56,7 @@ jitify2::StringVec build_jit_template_params(
   jitify2::StringVec tparams;
   tparams.emplace_back(jitify2::reflection::reflect(has_user_data));
+  tparams.emplace_back(jitify2::reflection::reflect(is_null_aware == null_aware::YES));
   std::transform(thrust::counting_iterator(0),
                  thrust::counting_iterator(span_outputs.size()),
diff --git a/cpp/src/jit/span.cuh b/cpp/src/jit/span.cuh
index 07f91d8a06a..f26bbf3c4f6 100644
--- a/cpp/src/jit/span.cuh
+++ b/cpp/src/jit/span.cuh
@@ -154,6 +154,20 @@ struct device_optional_span : device_span {
     return !is_valid(element_index);
   }
 
+  CUDF_HOST_DEVICE constexpr T& element(size_t idx) const { return base::operator[](idx); }
+
+  /// @copydoc mutable_column_device_view::set_valid
+  __device__ void set_valid(size_type element_index) const noexcept
+  {
+    return set_bit(_null_mask, element_index);
+  }
+
+  /// @copydoc mutable_column_device_view::set_null
+  __device__ void set_null(size_type element_index) const noexcept
+  {
+    return clear_bit(_null_mask, element_index);
+  }
+
   /// @brief converts the optional span to a regular non-nullable span.
   [[nodiscard]] __device__ base to_span() const noexcept { return static_cast(*this); }
 
diff --git a/cpp/src/stream_compaction/filter/filter.cu b/cpp/src/stream_compaction/filter/filter.cu
index a518b566c97..79f17d7d7c9 100644
--- a/cpp/src/stream_compaction/filter/filter.cu
+++ b/cpp/src/stream_compaction/filter/filter.cu
@@ -218,11 +218,15 @@ jitify2::ConfiguredKernel build_kernel(std::string const& kernel_name,
                                        std::vector const& span_outputs,
                                        std::vector const& input_columns,
                                        bool has_user_data,
+                                       null_aware is_null_aware,
                                        std::string const& udf,
                                        bool is_ptx,
                                        rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
+  CUDF_EXPECTS(!(is_null_aware == null_aware::YES && is_ptx),
+               "Optional types are not supported in PTX UDFs",
+               std::invalid_argument);
   auto const cuda_source = is_ptx ?
cudf::jit::parse_single_function_ptx( udf, @@ -234,6 +238,7 @@ jitify2::ConfiguredKernel build_kernel(std::string const& kernel_name, return get_kernel(jitify2::reflection::Template(kernel_name) .instantiate(cudf::jit::build_jit_template_params( has_user_data, + is_null_aware, span_outputs, {}, cudf::jit::reflect_input_columns(base_column_size, input_columns))), @@ -247,6 +252,7 @@ std::vector> filter_operation(column_view base_column, bool is_ptx, std::optional user_data, std::optional> copy_mask, + null_aware is_null_aware, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -258,6 +264,7 @@ std::vector> filter_operation(column_view base_column, {"cudf::size_type"}, columns, user_data.has_value(), + is_null_aware, predicate_udf, is_ptx, stream, @@ -309,6 +316,7 @@ std::vector> filter(std::vector const& colu bool is_ptx, std::optional user_data, std::optional> copy_mask, + null_aware is_null_aware, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -320,7 +328,7 @@ std::vector> filter(std::vector const& colu perform_checks(*base_column, columns, copy_mask); auto filtered = filter_operation( - *base_column, columns, predicate_udf, is_ptx, user_data, copy_mask, stream, mr); + *base_column, columns, predicate_udf, is_ptx, user_data, copy_mask, is_null_aware, stream, mr); return filtered; } @@ -332,11 +340,13 @@ std::vector> filter(std::vector const& colu bool is_ptx, std::optional user_data, std::optional> copy_mask, + null_aware is_null_aware, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::filter(columns, predicate_udf, is_ptx, user_data, copy_mask, stream, mr); + return detail::filter( + columns, predicate_udf, is_ptx, user_data, copy_mask, is_null_aware, stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/filter/jit/kernel.cu b/cpp/src/stream_compaction/filter/jit/kernel.cu index 07aeebee688..c43c896ef86 100644 --- a/cpp/src/stream_compaction/filter/jit/kernel.cu +++ b/cpp/src/stream_compaction/filter/jit/kernel.cu @@ -36,7 +36,7 @@ namespace cudf { namespace filtering { namespace jit { -template +template CUDF_KERNEL void kernel(cudf::jit::device_optional_span const* outputs, cudf::column_device_view_core const* inputs, void* user_data) @@ -51,15 +51,23 @@ CUDF_KERNEL void kernel(cudf::jit::device_optional_span cons auto const size = output.size(); for (auto i = start; i < size; i += stride) { - auto const any_null = (false || ... || In::is_null(inputs, i)); - bool applies = false; - if (!any_null) { + if constexpr (!is_null_aware) { + auto const any_null = (false || ... 
|| In::is_null(inputs, i)); + + if (!any_null) { + if constexpr (has_user_data) { + GENERIC_FILTER_OP(user_data, i, &applies, In::element(inputs, i)...); + } else { + GENERIC_FILTER_OP(&applies, In::element(inputs, i)...); + } + } + } else { if constexpr (has_user_data) { - GENERIC_FILTER_OP(user_data, i, &applies, In::element(inputs, i)...); + GENERIC_FILTER_OP(user_data, i, &applies, In::nullable_element(inputs, i)...); } else { - GENERIC_FILTER_OP(&applies, In::element(inputs, i)...); + GENERIC_FILTER_OP(&applies, In::nullable_element(inputs, i)...); } } diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu index b3647e374c5..53409e9defd 100644 --- a/cpp/src/transform/jit/kernel.cu +++ b/cpp/src/transform/jit/kernel.cu @@ -36,7 +36,7 @@ namespace cudf { namespace transformation { namespace jit { -template +template CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* outputs, cudf::column_device_view_core const* inputs, void* user_data) @@ -49,17 +49,26 @@ CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* outputs, auto const size = outputs[0].size(); for (auto i = start; i < size; i += stride) { - if (Out::is_null(outputs, i)) { continue; } - - if constexpr (has_user_data) { - GENERIC_TRANSFORM_OP(user_data, i, &Out::element(outputs, i), In::element(inputs, i)...); + if constexpr (!is_null_aware) { + if (Out::is_null(outputs, i)) { continue; } + + if constexpr (has_user_data) { + GENERIC_TRANSFORM_OP(user_data, i, &Out::element(outputs, i), In::element(inputs, i)...); + } else { + GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::element(inputs, i)...); + } } else { - GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::element(inputs, i)...); + if constexpr (has_user_data) { + GENERIC_TRANSFORM_OP( + user_data, i, &Out::element(outputs, i), In::nullable_element(inputs, i)...); + } else { + GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::nullable_element(inputs, i)...); + } } } } -template +template CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const* outputs, cudf::column_device_view_core const* inputs, void* user_data) @@ -72,19 +81,28 @@ CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const* for (auto i = start; i < size; i += stride) { typename Out::type result{numeric::scaled_integer{0, output_scale}}; - if (Out::is_null(outputs, i)) { continue; } + if constexpr (!is_null_aware) { + if (Out::is_null(outputs, i)) { continue; } + + if constexpr (has_user_data) { + GENERIC_TRANSFORM_OP(user_data, i, &result, In::element(inputs, i)...); + } else { + GENERIC_TRANSFORM_OP(&result, In::element(inputs, i)...); + } - if constexpr (has_user_data) { - GENERIC_TRANSFORM_OP(user_data, i, &result, In::element(inputs, i)...); } else { - GENERIC_TRANSFORM_OP(&result, In::element(inputs, i)...); + if constexpr (has_user_data) { + GENERIC_TRANSFORM_OP(user_data, i, &result, In::nullable_element(inputs, i)...); + } else { + GENERIC_TRANSFORM_OP(&result, In::nullable_element(inputs, i)...); + } } Out::assign(outputs, i, result); } } -template +template CUDF_KERNEL void span_kernel(cudf::jit::device_optional_span const* outputs, cudf::column_device_view_core const* inputs, void* user_data) @@ -94,12 +112,21 @@ CUDF_KERNEL void span_kernel(cudf::jit::device_optional_span auto const size = outputs[0].size(); for (auto i = start; i < size; i += stride) { - if (Out::is_null(outputs, i)) { continue; } - - if constexpr (has_user_data) { - GENERIC_TRANSFORM_OP(user_data, i, 
&Out::element(outputs, i), In::element(inputs, i)...); + if constexpr (!is_null_aware) { + if (Out::is_null(outputs, i)) { continue; } + + if constexpr (has_user_data) { + GENERIC_TRANSFORM_OP(user_data, i, &Out::element(outputs, i), In::element(inputs, i)...); + } else { + GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::element(inputs, i)...); + } } else { - GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::element(inputs, i)...); + if constexpr (has_user_data) { + GENERIC_TRANSFORM_OP( + user_data, i, &Out::element(outputs, i), In::nullable_element(inputs, i)...); + } else { + GENERIC_TRANSFORM_OP(&Out::element(outputs, i), In::nullable_element(inputs, i)...); + } } } } diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index ad3c6bf15c6..4d3aa4cb443 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,7 @@ jitify2::ConfiguredKernel build_transform_kernel( std::vector const& output_columns, std::vector const& input_columns, bool has_user_data, + null_aware is_null_aware, std::string const& udf, bool is_ptx, rmm::cuda_stream_view stream, @@ -70,6 +72,7 @@ jitify2::ConfiguredKernel build_transform_kernel( return get_kernel(jitify2::reflection::Template(kernel_name) .instantiate(cudf::jit::build_jit_template_params( has_user_data, + is_null_aware, {}, cudf::jit::column_type_names(output_columns), cudf::jit::reflect_input_columns(base_column_size, input_columns))), @@ -82,6 +85,7 @@ jitify2::ConfiguredKernel build_span_kernel(std::string const& kernel_name, std::vector const& span_outputs, std::vector const& input_columns, bool has_user_data, + null_aware is_null_aware, std::string const& udf, bool is_ptx, rmm::cuda_stream_view stream, @@ -98,6 +102,7 @@ jitify2::ConfiguredKernel build_span_kernel(std::string const& kernel_name, return get_kernel(jitify2::reflection::Template(kernel_name) .instantiate(cudf::jit::build_jit_template_params( has_user_data, + is_null_aware, span_outputs, {}, cudf::jit::reflect_input_columns(base_column_size, input_columns))), @@ -188,10 +193,17 @@ std::unique_ptr transform_operation(column_view base_column, std::string const& udf, bool is_ptx, std::optional user_data, + null_aware is_null_aware, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto [null_mask, null_count] = make_transform_null_mask(base_column, inputs, stream, mr); + rmm::device_buffer null_mask{}; + cudf::size_type null_count{0}; + if (is_null_aware == null_aware::NO) { + std::tie(null_mask, null_count) = make_transform_null_mask(base_column, inputs, stream, mr); + } else { + null_mask = create_null_mask(base_column.size(), mask_state::UNALLOCATED, stream, mr); + } auto output = make_fixed_width_column( output_type, base_column.size(), std::move(null_mask), null_count, stream, mr); @@ -203,13 +215,13 @@ std::unique_ptr transform_operation(column_view base_column, {*output}, inputs, user_data.has_value(), + is_null_aware, udf, is_ptx, stream, mr); launch_column_output_kernel(kernel, {*output}, inputs, user_data, stream, mr); - return output; } @@ -218,16 +230,25 @@ std::unique_ptr string_view_operation(column_view base_column, std::string const& udf, bool is_ptx, std::optional user_data, + null_aware is_null_aware, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto [null_mask, null_count] = make_transform_null_mask(base_column, inputs, stream, mr); + rmm::device_buffer null_mask{}; + 
cudf::size_type null_count{0}; + + if (is_null_aware == null_aware::NO) { + std::tie(null_mask, null_count) = make_transform_null_mask(base_column, inputs, stream, mr); + } else { + null_mask = create_null_mask(base_column.size(), mask_state::UNALLOCATED, stream, mr); + } auto kernel = build_span_kernel("cudf::transformation::jit::span_kernel", base_column.size(), {"cudf::string_view"}, inputs, user_data.has_value(), + is_null_aware, udf, is_ptx, stream, @@ -287,11 +308,15 @@ std::unique_ptr transform(std::vector const& inputs, data_type output_type, bool is_ptx, std::optional user_data, + null_aware is_null_aware, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_EXPECTS( !inputs.empty(), "Transform must have at least 1 input column", std::invalid_argument); + CUDF_EXPECTS(!(is_null_aware == null_aware::YES && is_ptx), + "Optional types are not supported in PTX UDFs", + std::invalid_argument); auto const base_column = std::max_element( inputs.begin(), inputs.end(), [](auto& a, auto& b) { return a.size() < b.size(); }); @@ -300,10 +325,10 @@ std::unique_ptr transform(std::vector const& inputs, if (is_fixed_width(output_type)) { return transformation::jit::transform_operation( - *base_column, output_type, inputs, udf, is_ptx, user_data, stream, mr); + *base_column, output_type, inputs, udf, is_ptx, user_data, is_null_aware, stream, mr); } else if (output_type.id() == type_id::STRING) { return transformation::jit::string_view_operation( - *base_column, inputs, udf, is_ptx, user_data, stream, mr); + *base_column, inputs, udf, is_ptx, user_data, is_null_aware, stream, mr); } else { CUDF_FAIL("Unsupported output type for transform operation"); } @@ -316,11 +341,12 @@ std::unique_ptr transform(std::vector const& inputs, data_type output_type, bool is_ptx, std::optional user_data, + null_aware is_null_aware, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::transform(inputs, udf, output_type, is_ptx, user_data, stream, mr); + return detail::transform(inputs, udf, output_type, is_ptx, user_data, is_null_aware, stream, mr); } } // namespace cudf diff --git a/cpp/tests/filter/filter_test.cpp b/cpp/tests/filter/filter_test.cpp index 1c080b374c7..9c0a577b137 100644 --- a/cpp/tests/filter/filter_test.cpp +++ b/cpp/tests/filter/filter_test.cpp @@ -55,8 +55,9 @@ TYPED_TEST(FilterNumericTest, NoAssertions) std::vector> results; - EXPECT_NO_THROW(results = - cudf::filter({a, b}, this->udf, false, std::nullopt, std::vector{true, false})); + EXPECT_NO_THROW( + results = cudf::filter( + {a, b}, this->udf, false, std::nullopt, std::vector{true, false}, cudf::null_aware::NO)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, results[0]->view()); } @@ -77,8 +78,9 @@ TYPED_TEST(FilterChronoTest, NoAssertions) auto expected = cudf::test::fixed_width_column_wrapper{T{}, T{}, T{}, T{}, T{}, T{}}; std::vector> results; - EXPECT_NO_THROW(results = - cudf::filter({a, b}, this->udf, false, std::nullopt, std::vector{true, false})); + EXPECT_NO_THROW( + results = cudf::filter( + {a, b}, this->udf, false, std::nullopt, std::vector{true, false}, cudf::null_aware::NO)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, results[0]->view()); } @@ -101,8 +103,9 @@ TYPED_TEST(FilterFixedPointTest, NoAssertions) std::vector> results; - EXPECT_NO_THROW(results = - cudf::filter({a, b}, this->udf, false, std::nullopt, std::vector{true, false})); + EXPECT_NO_THROW( + results = cudf::filter( + {a, b}, this->udf, false, std::nullopt, std::vector{true, false}, 
cudf::null_aware::NO)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, results[0]->view()); } @@ -118,8 +121,9 @@ TEST_F(FilterTestFixture, StringNoAssertions) std::vector> results; - EXPECT_NO_THROW(results = - cudf::filter({a, b}, this->udf, false, std::nullopt, std::vector{true, false})); + EXPECT_NO_THROW( + results = cudf::filter( + {a, b}, this->udf, false, std::nullopt, std::vector{true, false}, cudf::null_aware::NO)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, results[0]->view()); } @@ -133,26 +137,42 @@ TEST_F(FilterAssertsTest, CopyMask) __device__ void is_divisible(bool* out, int32_t a, int32_t b) { *out = ((a % b) == 0); } )***"; - EXPECT_NO_THROW(cudf::filter({a, b}, cuda, false, std::nullopt, std::vector{true, true})); - EXPECT_THROW(cudf::filter({a, b}, cuda, false, std::nullopt, std::vector{true}), - std::invalid_argument); - EXPECT_THROW(cudf::filter({a, b}, cuda, false, std::nullopt, std::vector{true, true, true}), - std::invalid_argument); + EXPECT_NO_THROW( + cudf::filter({a, b}, cuda, false, std::nullopt, std::vector{true, true}, cudf::null_aware::NO)); + EXPECT_THROW( + cudf::filter({a, b}, cuda, false, std::nullopt, std::vector{true}, cudf::null_aware::NO), + std::invalid_argument); + EXPECT_THROW( + cudf::filter( + {a, b}, cuda, false, std::nullopt, std::vector{true, true, true}, cudf::null_aware::NO), + std::invalid_argument); } struct FilterTest : public FilterTestFixture {}; TEST_F(FilterTest, Basic) { - auto a = cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto a = cudf::test::fixed_width_column_wrapper({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + {1, 1, 1, 1, 1, 1, 1, 0, 0, 0}); std::string cuda = R"***( __device__ void is_even(bool* out, int32_t a) { *out = (a % 2 == 0); } )***"; - auto result = cudf::filter({a}, cuda, false, std::nullopt, std::vector{true}); - auto expected = cudf::test::fixed_width_column_wrapper{2, 4, 6, 8, 10}; + auto result = + cudf::filter({a}, cuda, false, std::nullopt, std::vector{true}, cudf::null_aware::NO); + auto expected = cudf::test::fixed_width_column_wrapper{2, 4, 6}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result[0]->view()); + + std::string null_cuda = R"***( +__device__ void is_even(bool* out, cuda::std::optional a) { *out = a.has_value() && (*a % 2 == 0); } + )***"; + + auto null_result = + cudf::filter({a}, null_cuda, false, std::nullopt, std::vector{true}, cudf::null_aware::YES); + auto null_expected = cudf::test::fixed_width_column_wrapper{2, 4, 6}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(null_expected, null_result[0]->view()); } TEST_F(FilterTest, ScalarBroadcast) @@ -163,7 +183,7 @@ TEST_F(FilterTest, ScalarBroadcast) __device__ void is_divisible(bool* out, int32_t a, int32_t b) { *out = ((a % b) == 0); } )***"; - auto result = cudf::filter({a, b}, cuda, false, std::nullopt); + auto result = cudf::filter({a, b}, cuda, false, std::nullopt, std::nullopt, cudf::null_aware::NO); auto expected_a = cudf::test::fixed_width_column_wrapper{2, 4, 6, 8, 10}; auto expected_b = cudf::test::fixed_width_column_wrapper{2, 2, 2, 2, 2}; @@ -220,7 +240,8 @@ __device__ void filter(bool* out, cuda, false, std::nullopt, - std::vector{true, true, false, false, false, false, false, false, false, false}); + std::vector{true, true, false, false, false, false, false, false, false, false}, + cudf::null_aware::NO); EXPECT_EQ(result.size(), 2); @@ -282,7 +303,8 @@ __device__ void filter(bool* out, cuda, false, std::nullopt, - std::vector{true, true, false, false, false, false, false, false, false, false}); + std::vector{true, true, false, 
false, false, false, false, false, false, false}, + cudf::null_aware::NO); auto expected_countries = cudf::test::strings_column_wrapper{"Germany", "Spain"}; diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp index 993ddc90819..91f6c197857 100644 --- a/cpp/tests/streams/transform_test.cpp +++ b/cpp/tests/streams/transform_test.cpp @@ -38,6 +38,7 @@ void test_udf(char const* udf, Data data_init, cudf::size_type size, bool is_ptx cudf::data_type(cudf::type_to_id()), is_ptx, std::nullopt, + cudf::null_aware::NO, cudf::test::get_default_stream()); } diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp b/cpp/tests/transform/integration/unary_transform_test.cpp index fdf16c34aab..ab82c94c348 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -56,31 +56,49 @@ struct AssertsTest : public RuntimeSupportTest {}; TEST_F(AssertsTest, TypeSupport) { - EXPECT_NO_THROW( - cudf::transform({a, b, t}, udf, cudf::data_type{cudf::type_id::FLOAT32}, false, std::nullopt)); - - EXPECT_THROW( - cudf::transform({a, b, t}, udf, cudf::data_type{cudf::type_id::STRUCT}, false, std::nullopt), - std::invalid_argument); - - EXPECT_THROW( - cudf::transform( - {struct_col, t}, udf, cudf::data_type{cudf::type_id::FLOAT32}, false, std::nullopt), - std::invalid_argument); + EXPECT_NO_THROW(cudf::transform({a, b, t}, + udf, + cudf::data_type{cudf::type_id::FLOAT32}, + false, + std::nullopt, + cudf::null_aware::NO)); + + EXPECT_THROW(cudf::transform({a, b, t}, + udf, + cudf::data_type{cudf::type_id::STRUCT}, + false, + std::nullopt, + cudf::null_aware::NO), + std::invalid_argument); + + EXPECT_THROW(cudf::transform({struct_col, t}, + udf, + cudf::data_type{cudf::type_id::FLOAT32}, + false, + std::nullopt, + cudf::null_aware::NO), + std::invalid_argument); } TEST_F(AssertsTest, UnequalRowCount) { - EXPECT_THROW( - cudf::transform( - {a, b, bad_col}, udf, cudf::data_type{cudf::type_id::FLOAT32}, false, std::nullopt), - std::invalid_argument); + EXPECT_THROW(cudf::transform({a, b, bad_col}, + udf, + cudf::data_type{cudf::type_id::FLOAT32}, + false, + std::nullopt, + cudf::null_aware::NO), + std::invalid_argument); } TEST_F(AssertsTest, NullSupport) { - EXPECT_NO_THROW(cudf::transform( - {a, b_nulls, t}, udf, cudf::data_type{cudf::type_id::FLOAT32}, false, std::nullopt)); + EXPECT_NO_THROW(cudf::transform({a, b_nulls, t}, + udf, + cudf::data_type{cudf::type_id::FLOAT32}, + false, + std::nullopt, + cudf::null_aware::NO)); } struct UnaryOperationIntegrationTest : public cudf::test::BaseFixture {}; @@ -590,7 +608,8 @@ __device__ void transform(void* user_data, cudf::size_type row, cuda, cudf::data_type(cudf::type_id::STRING), false, - scratch.data()); + scratch.data(), + cudf::null_aware::NO); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } @@ -781,4 +800,73 @@ TEST_F(NullTest, ColumnNulls_And_ScalarNull) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ptx_result, *expected); } +TEST_F(NullTest, IsNull) +{ + auto udf = R"***( + __device__ inline void is_null(bool * output, cuda::std::optional input) + { + *output = !input.has_value(); + } + )***"; + + auto value = cudf::test::fixed_width_column_wrapper({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}, + {false, false, true, false, true}) + .release(); + + auto expected = cudf::test::fixed_width_column_wrapper({true, true, false, true, false}); + + auto result = cudf::transform({*value}, + udf, + cudf::data_type(cudf::type_id::BOOL8), + false, + std::nullopt, + 
cudf::null_aware::YES); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); +} + +TEST_F(NullTest, NullProject) +{ + auto udf = R"***( +__device__ inline void null_lerp( + float* output, + cuda::std::optional low, + cuda::std::optional high, + cuda::std::optional t +) +{ +auto lerp = [] (auto l, auto h, auto t) { +return l - t * l + t * h; +}; + *output = low.has_value() && high.has_value() && t.has_value() + ? lerp(*low, *high, *t) + : 0.0F; +} +)***"; + + auto low = + cudf::test::fixed_width_column_wrapper(low_host.begin(), low_host.end(), fourth()) + .release(); + auto high = + cudf::test::fixed_width_column_wrapper(high_host.begin(), high_host.end(), fifth()) + .release(); + auto t = cudf::test::fixed_width_column_wrapper(t_host.begin(), t_host.end()).release(); + + auto expected_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](auto i) { return ((i % 5) == 0) && ((i % 4) == 0) ? expected_host[i] : 0.0F; }); + + auto expected = + cudf::test::fixed_width_column_wrapper(expected_iter, expected_iter + low_host.size()) + .release(); + + auto cuda_result = cudf::transform({*low, *high, *t}, + udf, + cudf::data_type(cudf::type_id::FLOAT32), + false, + std::nullopt, + cudf::null_aware::YES); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*cuda_result, *expected); +} + } // namespace transformation diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index ce55a4a841a..19d49e143a6 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.expressions cimport expression from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type +from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type, null_aware from rmm.librmm.device_buffer cimport device_buffer from rmm.librmm.cuda_stream_view cimport cuda_stream_view @@ -45,6 +45,7 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: data_type output_type, bool is_ptx, optional[void *] user_data, + null_aware is_null_aware, cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/types.pxd b/python/pylibcudf/pylibcudf/libcudf/types.pxd index 627ebe96ff8..727bc4e926c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/types.pxd @@ -49,6 +49,10 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: EQUAL UNEQUAL + cpdef enum class null_aware(bool): + NO + YES + cpdef enum class nan_equality(bool): ALL_EQUAL UNEQUAL diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index 09fc4cf72bd..8785dd64f40 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. 
 from libcpp cimport bool
-from pylibcudf.libcudf.types cimport bitmask_type, data_type
+from pylibcudf.libcudf.types cimport bitmask_type, data_type, null_aware
 
 from rmm.pylibrmm.stream cimport Stream
 
 from .column cimport Column
@@ -27,6 +27,7 @@ cpdef Column transform(list[Column] inputs,
                        str transform_udf,
                        DataType output_type,
                        bool is_ptx,
+                       null_aware is_null_aware,
                        Stream stream = *)
 
 cpdef tuple[Table, Column] encode(Table input, Stream stream = *)
diff --git a/python/pylibcudf/pylibcudf/transform.pyi b/python/pylibcudf/pylibcudf/transform.pyi
index b38d19d732a..3d277b4f016 100644
--- a/python/pylibcudf/pylibcudf/transform.pyi
+++ b/python/pylibcudf/pylibcudf/transform.pyi
@@ -5,7 +5,7 @@ from pylibcudf.column import Column
 from pylibcudf.expressions import Expression
 from pylibcudf.gpumemoryview import gpumemoryview
 from pylibcudf.table import Table
-from pylibcudf.types import DataType
+from pylibcudf.types import DataType, NullAware
 
 def nans_to_nulls(
     input: Column, stream: Stream | None = None
@@ -24,6 +24,7 @@ def transform(
     transform_udf: str,
     output_type: DataType,
     is_ptx: bool,
+    is_null_aware: NullAware = NullAware.NO,
     stream: Stream | None = None,
 ) -> Column: ...
 def encode(
diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx
index 0581bef2b19..13049782219 100644
--- a/python/pylibcudf/pylibcudf/transform.pyx
+++ b/python/pylibcudf/pylibcudf/transform.pyx
@@ -21,7 +21,7 @@ from rmm.pylibrmm.stream cimport Stream
 
 from .column cimport Column
 from .gpumemoryview cimport gpumemoryview
-from .types cimport DataType
+from .types cimport DataType, null_aware
 from .utils cimport _get_stream
 
 __all__ = [
@@ -163,6 +163,7 @@ cpdef Column transform(list[Column] inputs,
                        str transform_udf,
                        DataType output_type,
                        bool is_ptx,
+                       null_aware is_null_aware,
                        Stream stream=None):
     """Create a new column by applying a transform function against multiple
     input columns.
@@ -178,6 +179,9 @@ cpdef Column transform(list[Column] inputs,
     is_ptx : bool
         If `True`, the UDF is treated as PTX code.
         If `False`, the UDF is treated as CUDA code.
+    is_null_aware : NullAware
+        If `NO`, the UDF receives non-nullable parameters.
+        If `YES`, the UDF receives its inputs as nullable (optional) values.
 
     Returns
     -------
@@ -188,6 +192,7 @@ cpdef Column transform(list[Column] inputs,
     cdef unique_ptr[column] c_result
     cdef string c_transform_udf = transform_udf.encode()
     cdef bool c_is_ptx = is_ptx
+    cdef null_aware c_is_null_aware = is_null_aware
     cdef optional[void *] user_data
     stream = _get_stream(stream)
@@ -202,6 +207,7 @@ cpdef Column transform(list[Column] inputs,
             output_type.c_obj,
             c_is_ptx,
             user_data,
+            c_is_null_aware,
             stream.view(),
         )
diff --git a/python/pylibcudf/pylibcudf/types.pxd b/python/pylibcudf/pylibcudf/types.pxd
index e9de74b878c..2a263862b59 100644
--- a/python/pylibcudf/pylibcudf/types.pxd
+++ b/python/pylibcudf/pylibcudf/types.pxd
@@ -12,6 +12,7 @@ from pylibcudf.libcudf.types cimport (
     null_equality,
     null_order,
     null_policy,
+    null_aware,
     order,
     size_type,
     sorted,
diff --git a/python/pylibcudf/pylibcudf/types.pyi b/python/pylibcudf/pylibcudf/types.pyi
index 377a389dafa..e3c0951da52 100644
--- a/python/pylibcudf/pylibcudf/types.pyi
+++ b/python/pylibcudf/pylibcudf/types.pyi
@@ -29,6 +29,10 @@ class NullEquality(IntEnum):
     EQUAL = ...
     UNEQUAL = ...
 
+class NullAware(IntEnum):
+    NO = ...
+    YES = ...
+
 class NullOrder(IntEnum):
     AFTER = ...
     BEFORE = ...
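A minimal end-to-end sketch of the new flag from Python, mirroring the C++ `IsNull` test above. The column construction and UDF body here are illustrative only, not part of this diff:

```python
import pyarrow as pa
import pylibcudf as plc

# A float32 column with a null; with NullAware.YES the UDF sees each input
# as a cuda::std::optional rather than a raw value.
col = plc.Column.from_arrow(pa.array([1.0, None, 3.0], type=pa.float32()))

# Null-aware UDFs must be CUDA source: is_ptx=True with NullAware.YES is
# rejected by the CUDF_EXPECTS added in transform.cpp.
udf = """
__device__ void is_null(bool* out, cuda::std::optional<float> x) {
    *out = !x.has_value();
}
"""

result = plc.transform.transform(
    [col],
    transform_udf=udf,
    output_type=plc.DataType(plc.TypeId.BOOL8),
    is_ptx=False,
    is_null_aware=plc.types.NullAware.YES,
)
# result is a BOOL8 column: [False, True, False]
```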
diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx index 6e3eb19be1a..a84171f991c 100644 --- a/python/pylibcudf/pylibcudf/types.pyx +++ b/python/pylibcudf/pylibcudf/types.pyx @@ -17,6 +17,7 @@ from pylibcudf.libcudf.types import interpolation as Interpolation # no-cython- from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint, isort:skip from pylibcudf.libcudf.types import nan_equality as NanEquality # no-cython-lint, isort:skip from pylibcudf.libcudf.types import null_equality as NullEquality # no-cython-lint, isort:skip +from pylibcudf.libcudf.types import null_aware as NullAware # no-cython-lint, isort:skip from pylibcudf.libcudf.types import null_order as NullOrder # no-cython-lint, isort:skip from pylibcudf.libcudf.types import order as Order # no-cython-lint, isort:skip from pylibcudf.libcudf.types import sorted as Sorted # no-cython-lint, isort:skip @@ -79,6 +80,7 @@ __all__ = [ "NanPolicy", "NullEquality", "NullOrder", + "NullAware", "NullPolicy", "Order", "SIZE_TYPE", @@ -313,6 +315,7 @@ Interpolation.__str__ = Interpolation.__repr__ MaskState.__str__ = MaskState.__repr__ NanEquality.__str__ = NanEquality.__repr__ NullEquality.__str__ = NullEquality.__repr__ +NullAware.__str__ = NullAware.__repr__ NullOrder.__str__ = NullOrder.__repr__ Order.__str__ = Order.__repr__ Sorted.__str__ = Sorted.__repr__ diff --git a/python/pylibcudf/tests/test_transform.py b/python/pylibcudf/tests/test_transform.py index 62b9b6f636f..2d0a9e188fd 100644 --- a/python/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/tests/test_transform.py @@ -102,5 +102,6 @@ def op(a, b, c): transform_udf=ptx, output_type=plc.DataType(plc.TypeId.FLOAT64), is_ptx=True, + is_null_aware=plc.types.NullAware.NO, ) assert_column_eq(expect, got) From e16e76291b5cbc4f8f265367451c49d9dfd133a8 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 19 Aug 2025 07:59:05 -0400 Subject: [PATCH 156/366] Pin polars version to <1.33 (#19582) Updates the upper pinning for polars to `1.32.1`. Also bumped the Narwhals version to `2.0.1` to avoid having to xfail ~10 more tests (so this PR contributes to https://github.com/rapidsai/cudf/issues/19594 too). 
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) - Tom Augspurger (https://github.com/TomAugspurger) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/19582 --- ci/test_narwhals.sh | 37 +---- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/recipe.yaml | 2 +- dependencies.yaml | 4 +- .../cudf_polars/dsl/expressions/boolean.py | 7 + .../cudf_polars/dsl/expressions/datetime.py | 1 + .../cudf_polars/dsl/expressions/string.py | 44 ++++-- .../cudf_polars/dsl/expressions/struct.py | 7 +- python/cudf_polars/cudf_polars/dsl/ir.py | 128 +++++++++++++----- .../cudf_polars/cudf_polars/dsl/translate.py | 28 +++- .../cudf_polars/dsl/utils/aggregations.py | 5 +- .../cudf_polars/experimental/expressions.py | 2 +- .../cudf_polars/experimental/io.py | 9 +- .../cudf_polars/experimental/select.py | 2 +- .../cudf_polars/testing/asserts.py | 29 ++-- .../cudf_polars/cudf_polars/testing/plugin.py | 18 +-- .../cudf_polars/cudf_polars/utils/versions.py | 4 +- python/cudf_polars/pyproject.toml | 2 +- .../tests/dsl/test_serialization.py | 33 ++++- .../tests/expressions/test_booleanfunction.py | 14 +- .../tests/expressions/test_stringfunction.py | 23 +++- python/cudf_polars/tests/test_cache.py | 11 +- python/cudf_polars/tests/test_groupby.py | 9 +- python/cudf_polars/tests/test_join.py | 15 ++ python/cudf_polars/tests/test_scan.py | 15 +- 26 files changed, 317 insertions(+), 136 deletions(-) diff --git a/ci/test_narwhals.sh b/ci/test_narwhals.sh index a6ade73e5f3..d30d0d92c05 100755 --- a/ci/test_narwhals.sh +++ b/ci/test_narwhals.sh @@ -27,53 +27,20 @@ rapids-pip-retry install -U -e . 
rapids-logger "Check narwhals versions" python -c "import narwhals; print(narwhals.show_versions())" -# test_horizontal_slice_with_series: xpassing in Narwhals, fixed in cuDF https://github.com/rapidsai/cudf/pull/18558 -# test_rolling_mean_expr_lazy_grouped: xpassing in Narwhals -# test_rolling_std_expr_lazy_grouped: xpassing in Narwhals -# test_rolling_sum_expr_lazy_grouped: xpassing in Narwhals -# test_rolling_var_expr_lazy_grouped: xpassing in Narwhals -# test_offset_by_tz: xpassing in Narwhals -# test_double_same_aggregation: xpassing in Narwhals -# test_all_kind_of_aggs: xpassing in Narwhals -TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF="not test_rolling_mean_expr_lazy_grouped[cudf-expected_a4-3-1-True] \ -and not test_rolling_mean_expr_lazy_grouped[cudf-expected_a5-4-1-True] \ -and not test_rolling_mean_expr_lazy_grouped[cudf-expected_a6-5-1-True] \ -and not test_rolling_std_expr_lazy_grouped[cudf-expected_a4-3-1-True-1] \ -and not test_rolling_std_expr_lazy_grouped[cudf-expected_a5-4-1-True-1] \ -and not test_rolling_std_expr_lazy_grouped[cudf-expected_a6-5-1-True-0] \ -and not test_rolling_sum_expr_lazy_grouped[cudf-expected_a4-3-1-True] \ -and not test_rolling_sum_expr_lazy_grouped[cudf-expected_a5-4-1-True] \ -and not test_rolling_sum_expr_lazy_grouped[cudf-expected_a6-5-1-True] \ -and not test_rolling_var_expr_lazy_grouped[cudf-expected_a4-3-1-True-1] \ -and not test_rolling_var_expr_lazy_grouped[cudf-expected_a5-4-1-True-1] \ -and not test_rolling_var_expr_lazy_grouped[cudf-expected_a6-5-1-True-0] \ -and not test_horizontal_slice_with_series \ -and not test_offset_by_tz \ -and not test_double_same_aggregation \ -and not test_all_kind_of_aggs" - rapids-logger "Run narwhals tests for cuDF" PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 python -m pytest \ --cache-clear \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-narwhals.xml" \ -p xdist \ -p env \ -p no:pytest_benchmark \ -p cudf.testing.narwhals_test_plugin \ - -k "$TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF" \ --numprocesses=8 \ --dist=worksteal \ --constructors=cudf -# test_dtypes: With cudf.pandas loaded, to_pandas() preserves Arrow dtypes like list and struct, so pandas -# columns aren't object anymore. The test expects object, causing a mismatch. -# test_nan: Narwhals expect this test to fail, but as of polars 1.30 we raise a RuntimeError, -# not polars ComputeError. So the test is looking for the wrong error and fails. 
-# test_floordiv_int_by_zero: This bug is fixed as of 25.08, narwhals should remove the xfail +# test_datetime[polars[lazy]]: Fixed in the next narwhals release >2.0.1 TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF_POLARS=" \ -test_dtypes or \ -test_nan or \ -test_floordiv_int_by_zero \ +test_datetime[polars[lazy]] \ " rapids-logger "Run narwhals tests for cuDF Polars" diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index ce67a5abbca..bd9e288cb43 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.4.0dev0 - pandoc -- polars>=1.28,<1.32 +- polars>=1.28,<1.33 - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 04178f88b83..7d3b41c97e5 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pandas - pandas>=2.0,<2.4.0dev0 - pandoc -- polars>=1.28,<1.32 +- polars>=1.28,<1.33 - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/recipe.yaml b/conda/recipes/cudf-polars/recipe.yaml index d3c080249f1..4c398235a1a 100644 --- a/conda/recipes/cudf-polars/recipe.yaml +++ b/conda/recipes/cudf-polars/recipe.yaml @@ -50,7 +50,7 @@ requirements: - nvidia-ml-py - python - pylibcudf =${{ version }} - - polars >=1.28,<1.32 + - polars >=1.28,<1.33 - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - if: python == "3.10" then: typing_extensions diff --git a/dependencies.yaml b/dependencies.yaml index de83eae1276..d28f2b5e5fb 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -727,7 +727,7 @@ dependencies: packages: - nvidia-ml-py - packaging - - polars>=1.28,<1.32 + - polars>=1.28,<1.33 specific: - output_types: [requirements, pyproject] matrices: @@ -1142,4 +1142,4 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - narwhals==1.47 + - narwhals==2.0.1 diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 81b28be8086..ce8c4fc3276 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -38,6 +38,7 @@ class Name(IntEnum): Any = auto() AnyHorizontal = auto() IsBetween = auto() + IsClose = auto() IsDuplicated = auto() IsFinite = auto() IsFirstDistinct = auto() @@ -85,6 +86,12 @@ def __init__( BooleanFunction.Name.IsLastDistinct, BooleanFunction.Name.IsUnique, ) + if self.name in { + BooleanFunction.Name.IsClose, + }: + raise NotImplementedError( + f"Boolean function {self.name}" + ) # pragma: no cover @staticmethod def _distinct( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index 46eade5c507..5c663f15697 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -38,6 +38,7 @@ class Name(IntEnum): Datetime = auto() DatetimeFunction = auto() Day = auto() + DaysInMonth = auto() Duration = auto() Hour = auto() IsLeapYear = auto() diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 
d47828b4175..b7cd24972ad 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -19,6 +19,7 @@ from cudf_polars.dsl.expressions.base import ExecutionContext, Expr from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn from cudf_polars.dsl.utils.reshape import broadcast +from cudf_polars.utils.versions import POLARS_VERSION_LT_132 if TYPE_CHECKING: from typing_extensions import Self @@ -270,7 +271,11 @@ def _validate_input(self) -> None: if isinstance(self.children[1], Literal): _, width = self.children assert isinstance(width, Literal) - if width.value is not None and width.value < 0: + if ( + POLARS_VERSION_LT_132 + and width.value is not None + and width.value < 0 + ): # pragma: no cover dtypestr = dtype_str_repr(width.dtype.polars) raise InvalidOperationError( f"conversion from `{dtypestr}` to `u64` " @@ -283,6 +288,8 @@ def _create_regex_program( pattern: str, flags: plc.strings.regex_flags.RegexFlags = plc.strings.regex_flags.RegexFlags.DEFAULT, ) -> plc.strings.regex_program.RegexProgram: + if pattern == "": + raise NotImplementedError("Empty regex pattern is not yet supported") try: return plc.strings.regex_program.RegexProgram.create( pattern, @@ -380,11 +387,14 @@ def do_evaluate( plc.DataType(plc.TypeId.BOOL8), ) - if not plc.reduce.reduce( - all_gt_0, - plc.aggregation.all(), - plc.DataType(plc.TypeId.BOOL8), - ).to_py(): + if ( + POLARS_VERSION_LT_132 + and not plc.reduce.reduce( + all_gt_0, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ).to_py() + ): # pragma: no cover raise InvalidOperationError("fill conversion failed.") return Column( @@ -792,8 +802,15 @@ def do_evaluate( dtype=self.dtype, ) elif self.name is StringFunction.Name.PadStart: - (column,) = columns - width, char = self.options + if POLARS_VERSION_LT_132: # pragma: no cover + (column,) = columns + width, char = self.options + else: + (column, width_col) = columns + (char,) = self.options + # TODO: Maybe accept a string scalar in + # cudf::strings::pad to avoid DtoH transfer + width = width_col.obj.to_scalar().to_py() return Column( plc.strings.padding.pad( column.obj, width, plc.strings.SideType.LEFT, char @@ -801,8 +818,15 @@ def do_evaluate( dtype=self.dtype, ) elif self.name is StringFunction.Name.PadEnd: - (column,) = columns - width, char = self.options + if POLARS_VERSION_LT_132: # pragma: no cover + (column,) = columns + width, char = self.options + else: + (column, width_col) = columns + (char,) = self.options + # TODO: Maybe accept a string scalar in + # cudf::strings::pad to avoid DtoH transfer + width = width_col.obj.to_scalar().to_py() return Column( plc.strings.padding.pad( column.obj, width, plc.strings.SideType.RIGHT, char diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/struct.py b/python/cudf_polars/cudf_polars/dsl/expressions/struct.py index 034e1253981..f19f074254b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/struct.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/struct.py @@ -29,7 +29,6 @@ class StructFunction(Expr): class Name(IntEnum): """Internal and picklable representation of polars' `StructFunction`.""" - FieldByIndex = auto() FieldByName = auto() RenameFields = auto() PrefixFields = auto() @@ -37,6 +36,7 @@ class Name(IntEnum): JsonEncode = auto() WithFields = auto() # TODO: https://github.com/rapidsai/cudf/issues/19284 MapFieldNames = auto() # TODO: https://github.com/rapidsai/cudf/issues/19285 + FieldByIndex = auto() MultipleFields = ( 
auto() ) # https://github.com/pola-rs/polars/pull/23022#issuecomment-2933910958 @@ -56,8 +56,7 @@ def from_polars(cls, obj: pl_expr.StructFunction) -> Self: __slots__ = ("name", "options") _non_child = ("dtype", "name", "options") - _valid_ops: ClassVar[set[Name]] = { - Name.FieldByIndex, + _supported_ops: ClassVar[set[Name]] = { Name.FieldByName, Name.RenameFields, Name.PrefixFields, @@ -77,7 +76,7 @@ def __init__( self.name = name self.children = children self.is_pointwise = True - if self.name not in self._valid_ops: + if self.name not in self._supported_ops: raise NotImplementedError( f"Struct function {self.name}" ) # pragma: no cover diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 28b20c835a0..161c8f4a576 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -47,6 +47,7 @@ from polars.polars import _expr_nodes as pl_expr + from cudf_polars.containers.dataframe import NamedColumn from cudf_polars.typing import CSECache, ClosedInterval, Schema, Slice as Zlice from cudf_polars.utils.config import ParquetOptions from cudf_polars.utils.timer import Timer @@ -324,8 +325,17 @@ def __init__( raise NotImplementedError( "Read from cloud storage" ) # pragma: no cover; no test yet - if any(str(p).startswith("https:/") for p in self.paths): + if any( + str(p).startswith("https:/" if POLARS_VERSION_LT_131 else "https://") + for p in self.paths + ): raise NotImplementedError("Read from https") + if any( + str(p).startswith("file:/" if POLARS_VERSION_LT_131 else "file://") + for p in self.paths + ): + # TODO: removing the file:// may work + raise NotImplementedError("Read from file URI") if self.typ == "csv": if self.reader_options["skip_rows_after_header"] != 0: raise NotImplementedError("Skipping rows after header in CSV reader") @@ -954,10 +964,10 @@ class Cache(IR): _non_child = ("schema", "key", "refcount") key: int """The cache key.""" - refcount: int + refcount: int | None """The number of cache hits.""" - def __init__(self, schema: Schema, key: int, refcount: int, value: IR): + def __init__(self, schema: Schema, key: int, refcount: int | None, value: IR): self.schema = schema self.key = key self.refcount = refcount @@ -979,7 +989,7 @@ def is_equal(self, other: Self) -> bool: # noqa: D102 @classmethod @nvtx_annotate_cudf_polars(message="Cache") def do_evaluate( - cls, key: int, refcount: int, df: DataFrame + cls, key: int, refcount: int | None, df: DataFrame ) -> DataFrame: # pragma: no cover; basic evaluation never calls this """Evaluate and return a dataframe.""" # Our value has already been computed for us, so let's just @@ -998,12 +1008,15 @@ def evaluate(self, *, cache: CSECache, timer: Timer | None) -> DataFrame: cache[self.key] = (result, 0) return result else: - hits += 1 - if hits == self.refcount: + if self.refcount is None: + return result + + hits += 1 # pragma: no cover + if hits == self.refcount: # pragma: no cover del cache[self.key] - else: + else: # pragma: no cover cache[self.key] = (result, hits) - return result + return result # pragma: no cover class DataFrameScan(IR): @@ -1758,6 +1771,38 @@ def _reorder_maps( [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], ).columns() + @staticmethod + def _build_columns( + columns: Iterable[plc.Column], + template: Iterable[NamedColumn], + *, + left: bool = True, + empty: bool = False, + rename: Callable[[str], str] = lambda name: name, + ) -> list[Column]: + if empty: + return [ + Column( + 
plc.column_factories.make_empty_column(col.dtype.plc), + col.dtype, + name=rename(col.name), + ) + for col in template + ] + + columns = [ + Column(new, col.dtype, name=rename(col.name)) + for new, col in zip(columns, template, strict=True) + ] + + if left: + columns = [ + col.sorted_like(orig) + for col, orig in zip(columns, template, strict=True) + ] + + return columns + @classmethod @nvtx_annotate_cudf_polars(message="Join") def do_evaluate( @@ -1780,28 +1825,32 @@ def do_evaluate( if how == "Cross": # Separate implementation, since cross_join returns the # result, not the gather maps - columns = plc.join.cross_join(left.table, right.table).columns() - left_cols = [ - Column(new, name=old.name, dtype=old.dtype).sorted_like(old) - for new, old in zip( - columns[: left.num_columns], left.columns, strict=True - ) - ] - right_cols = [ - Column( - new, - name=name + if right.num_rows == 0: + left_cols = Join._build_columns([], left.columns, empty=True) + right_cols = Join._build_columns( + [], + right.columns, + left=False, + empty=True, + rename=lambda name: name if name not in left.column_names_set else f"{name}{suffix}", - dtype=old.dtype, - ) - for new, name, old in zip( - columns[left.num_columns :], - right.column_names, - right.columns, - strict=True, ) - ] + return DataFrame([*left_cols, *right_cols]) + + columns = plc.join.cross_join(left.table, right.table).columns() + left_cols = Join._build_columns( + columns[: left.num_columns], + left.columns, + ) + right_cols = Join._build_columns( + columns[left.num_columns :], + right.columns, + rename=lambda name: name + if name not in left.column_names_set + else f"{name}{suffix}", + left=False, + ) return DataFrame([*left_cols, *right_cols]).slice(zlice) # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 left_on = DataFrame(broadcast(*(e.evaluate(left) for e in left_on_exprs))) @@ -2485,18 +2534,27 @@ def do_evaluate( class Empty(IR): - """Represents an empty DataFrame.""" + """Represents an empty DataFrame with a known schema.""" - __slots__ = () - _non_child = () + __slots__ = ("schema",) + _non_child = ("schema",) - def __init__(self) -> None: - self.schema = {} - self._non_child_args = () + def __init__(self, schema: Schema): + self.schema = schema + self._non_child_args = (schema,) self.children = () @classmethod @nvtx_annotate_cudf_polars(message="Empty") - def do_evaluate(cls) -> DataFrame: # pragma: no cover + def do_evaluate(cls, schema: Schema) -> DataFrame: # pragma: no cover """Evaluate and return a dataframe.""" - return DataFrame([]) + return DataFrame( + [ + Column( + plc.column_factories.make_empty_column(dtype.plc), + dtype=dtype, + name=name, + ) + for name, dtype in schema.items() + ] + ) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index aa82883a51d..27aa6bfbf4a 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -30,7 +30,11 @@ from cudf_polars.dsl.utils.rolling import rewrite_rolling from cudf_polars.typing import Schema from cudf_polars.utils import config, sorting -from cudf_polars.utils.versions import POLARS_VERSION_LT_131 +from cudf_polars.utils.versions import ( + POLARS_VERSION_LT_131, + POLARS_VERSION_LT_132, + POLARS_VERSION_LT_1323, +) if TYPE_CHECKING: from polars import GPUEngine @@ -91,7 +95,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes 
(e.g. adding new nodes), major is bumped for
        # incompatible changes (e.g. renaming nodes).
-        if (version := self.visitor.version()) >= (8, 1):
+        if (version := self.visitor.version()) >= (10, 1):
             e = NotImplementedError(
                 f"No support for polars IR {version=}"
             )  # pragma: no cover; no such version for now.
@@ -218,6 +222,13 @@ def _(node: pl_ir.PythonScan, translator: Translator, schema: Schema) -> ir.IR:
 @_translate_ir.register
 def _(node: pl_ir.Scan, translator: Translator, schema: Schema) -> ir.IR:
     typ, *options = node.scan_type
+    paths = node.paths
+    # Polars can produce a Scan with an empty ``node.paths`` (e.g. the native
+    # Iceberg reader on a table with no data files yet). In this case, polars returns an
+    # empty DataFrame with the declared schema. Mirror that here by
+    # replacing the Scan with an Empty IR node.
+    if not paths:  # pragma: no cover
+        return ir.Empty(schema)
     if typ == "ndjson":
         (reader_options,) = map(json.loads, options)
         cloud_options = None
@@ -248,7 +259,7 @@ def _(node: pl_ir.Scan, translator: Translator, schema: Schema) -> ir.IR:
             typ,
             reader_options,
             cloud_options,
-            node.paths,
+            paths,
             with_columns,
             skip_rows,
             n_rows,
@@ -263,8 +274,15 @@ def _(node: pl_ir.Scan, translator: Translator, schema: Schema) -> ir.IR:

 @_translate_ir.register
 def _(node: pl_ir.Cache, translator: Translator, schema: Schema) -> ir.IR:
+    if POLARS_VERSION_LT_1323:  # pragma: no cover
+        refcount = node.cache_hits
+    else:
+        refcount = None
     return ir.Cache(
-        schema, node.id_, node.cache_hits, translator.translate_ir(n=node.input)
+        schema,
+        node.id_,
+        refcount,
+        translator.translate_ir(n=node.input),
     )

@@ -523,7 +541,7 @@ def _(node: pl_ir.Sink, translator: Translator, schema: Schema) -> ir.IR:
     return ir.Sink(
         schema=schema,
         kind=sink_kind,
-        path=file["target"],
+        path=file["target"] if POLARS_VERSION_LT_132 else file["target"]["Local"],
         parquet_options=translator.config_options.parquet_options,
         options=options,
         cloud_options=cloud_options,
diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py
index e1ed7a0d3a2..29fd7155799 100644
--- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py
+++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py
@@ -16,6 +16,7 @@
 from cudf_polars.containers import DataType
 from cudf_polars.dsl import expr, ir
 from cudf_polars.dsl.expressions.base import ExecutionContext
+from cudf_polars.utils.versions import POLARS_VERSION_LT_1323

 if TYPE_CHECKING:
     from collections.abc import Callable, Generator, Iterable, Sequence
@@ -158,12 +159,12 @@ def decompose_single_agg(
     # - ROLLING: sum(all-null window) => null; sum(empty window) => 0 (fill only if empty)
     #
     # Must post-process because libcudf returns null for both empty and all-null windows/groups
-    if context == ExecutionContext.GROUPBY:
+    if not POLARS_VERSION_LT_1323 or context == ExecutionContext.GROUPBY:
         # GROUPBY: always fill top-level nulls with 0
         return [(named_expr, True)], expr.NamedExpr(
             name, replace_nulls(col, 0, is_top=is_top)
         )
-    else:
+    else:  # pragma: no cover
         # ROLLING:
         # Add a second rolling agg to compute the window size, then only
         # replace nulls with 0 when the window size is 0 (ie. empty window).
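For context on the GROUPBY branch above, the polars semantics being reproduced can be
checked directly on the CPU engine. A small, self-contained illustration:

```python
import polars as pl

df = pl.DataFrame({"k": [1, 1, 2], "v": [None, None, 3]})
# Polars semantics: the sum of an all-null group is 0, not null, while
# libcudf returns null for both empty and all-null groups. Hence the
# replace_nulls(..., 0) post-processing in decompose_single_agg.
print(df.group_by("k").agg(pl.col("v").sum()).sort("k"))
# k=1 -> 0, k=2 -> 3
```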
diff --git a/python/cudf_polars/cudf_polars/experimental/expressions.py b/python/cudf_polars/cudf_polars/experimental/expressions.py index 68dee9b9974..3c4ff10fc57 100644 --- a/python/cudf_polars/cudf_polars/experimental/expressions.py +++ b/python/cudf_polars/cudf_polars/experimental/expressions.py @@ -419,7 +419,7 @@ def _decompose_expr_node( # For Literal nodes, we don't actually want an # input IR with real columns, because it will # mess up the result of ``HConcat``. - input_ir = Empty() + input_ir = Empty({}) partition_info[input_ir] = PartitionInfo(count=1) partition_count = partition_info[input_ir].count diff --git a/python/cudf_polars/cudf_polars/experimental/io.py b/python/cudf_polars/cudf_polars/experimental/io.py index e45b6aae470..57f314ed571 100644 --- a/python/cudf_polars/cudf_polars/experimental/io.py +++ b/python/cudf_polars/cudf_polars/experimental/io.py @@ -17,7 +17,7 @@ import pylibcudf as plc -from cudf_polars.dsl.ir import IR, DataFrameScan, Scan, Sink, Union +from cudf_polars.dsl.ir import IR, DataFrameScan, Empty, Scan, Sink, Union from cudf_polars.experimental.base import ( ColumnStat, ColumnStats, @@ -277,6 +277,13 @@ def do_evaluate( ) +@lower_ir_node.register(Empty) +def _( + ir: Empty, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + return ir, {ir: PartitionInfo(count=1)} # pragma: no cover + + @lower_ir_node.register(Scan) def _( ir: Scan, rec: LowerIRTransformer diff --git a/python/cudf_polars/cudf_polars/experimental/select.py b/python/cudf_polars/cudf_polars/experimental/select.py index 0a3b5820b3d..1ca199a8af6 100644 --- a/python/cudf_polars/cudf_polars/experimental/select.py +++ b/python/cudf_polars/cudf_polars/experimental/select.py @@ -138,7 +138,7 @@ def _( isinstance(expr, (Col, Len)) for expr in traversal([e.value for e in ir.exprs]) ): # Special Case: Selection does not depend on any columns. 
- new_node = ir.reconstruct([input_ir := Empty()]) + new_node = ir.reconstruct([input_ir := Empty({})]) partition_info[input_ir] = partition_info[new_node] = PartitionInfo(count=1) return new_node, partition_info diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 256e0a6267f..80e5eed574b 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -14,6 +14,7 @@ from cudf_polars.dsl.translate import Translator from cudf_polars.utils.config import ConfigOptions, StreamingFallbackMode +from cudf_polars.utils.versions import POLARS_VERSION_LT_1323 if TYPE_CHECKING: from cudf_polars.typing import OptimizationArgs @@ -112,16 +113,26 @@ def assert_gpu_result_equal( # the 'misc' is for 'error: Keywords must be strings' expect = lazydf.collect(**final_polars_collect_kwargs) # type: ignore[call-overload,misc] got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) # type: ignore[call-overload,misc] + + assert_kwargs_bool: dict[str, bool] = { + "check_row_order": check_row_order, + "check_column_order": check_column_order, + "check_dtypes": check_dtypes, + "check_exact": check_exact, + "categorical_as_str": categorical_as_str, + } + + tol_kwargs: dict[str, float] + if POLARS_VERSION_LT_1323: # pragma: no cover + tol_kwargs = {"rtol": rtol, "atol": atol} + else: + tol_kwargs = {"rel_tol": rtol, "abs_tol": atol} + assert_frame_equal( expect, got, - check_row_order=check_row_order, - check_column_order=check_column_order, - check_dtypes=check_dtypes, - check_exact=check_exact, - rtol=rtol, - atol=atol, - categorical_as_str=categorical_as_str, + **assert_kwargs_bool, + **tol_kwargs, ) @@ -246,10 +257,10 @@ def assert_collect_raises( Useful for controlling optimization settings. polars_except Exception or exceptions polars CPU is expected to raise. If - None, CPU is not expected to raise an exception. + an empty tuple ``()``, CPU is expected to succeed without raising. cudf_except Exception or exceptions polars GPU is expected to raise. If - None, GPU is not expected to raise an exception. + an empty tuple ``()``, GPU is expected to succeed without raising. collect_kwargs Common keyword arguments to pass to collect for both polars CPU and cudf-polars. 
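With the docstring change above, an empty tuple now means "this engine must succeed".
A minimal sketch mirroring `test_string_zfill_column` below (valid for polars >= 1.32;
the frame contents here are illustrative):

```python
import polars as pl

from cudf_polars.testing.asserts import assert_collect_raises

# A negative fill width raises on the CPU engine, while the GPU engine is
# expected to succeed for the column-valued fill case.
ldf = pl.DataFrame({"input_strings": ["1"], "fill": [-1]}).lazy()
q = ldf.select(pl.col("input_strings").str.zfill(pl.col("fill")))

assert_collect_raises(
    q,
    polars_except=pl.exceptions.InvalidOperationError,  # CPU must raise
    cudf_except=(),  # empty tuple: GPU must succeed without raising
)
```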
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 8491b2fa400..14956d2cfbc 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -54,6 +54,9 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning", "tests/unit/io/test_delta.py::test_scan_delta_relative": "Need to expose hive partitioning", "tests/unit/io/test_delta.py::test_read_delta_version": "Need to expose hive partitioning", + "tests/unit/io/test_delta.py::test_scan_delta_schema_evolution_nested_struct_field_19915": "Need to expose hive partitioning", + "tests/unit/io/test_delta.py::test_scan_delta_nanosecond_timestamp": "polars generates the wrong schema: https://github.com/pola-rs/polars/issues/23949", + "tests/unit/io/test_delta.py::test_scan_delta_nanosecond_timestamp_nested": "polars generates the wrong schema: https://github.com/pola-rs/polars/issues/23949", "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed", "tests/unit/io/test_lazy_count_star.py::test_count_parquet[small.parquet-4]": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_count_star.py::test_count_parquet[foods*.parquet-54]": "Debug output on stderr doesn't match", @@ -64,7 +67,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_partition.py::test_partition_to_memory[io_type1]": "partition sinks not yet supported in standard engine.", "tests/unit/io/test_partition.py::test_partition_to_memory[io_type2]": "partition sinks not yet supported in standard engine.", "tests/unit/io/test_partition.py::test_partition_to_memory[io_type3]": "partition sinks not yet supported in standard engine.", - "tests/unit/io/test_partition.py::test_partition_key_order_22645": "partition sinks not yet supported in standard engine.", "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type1]": "partition sinks not yet supported in standard engine.", "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type2]": "partition sinks not yet supported in standard engine.", "tests/unit/io/test_partition.py::test_partition_to_memory_finish_callback[io_type3]": "partition sinks not yet supported in standard engine.", @@ -115,11 +117,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype0]": "cannot serialize in-memory sink target.", "tests/unit/io/test_parquet_field_overwrites.py::test_required_list[dtype1]": "cannot serialize in-memory sink target.", "tests/unit/io/test_parquet_field_overwrites.py::test_required_struct": "cannot serialize in-memory sink target.", - "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR", - "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR", - "tests/unit/io/test_write.py::test_write_async[read_parquet-]": "Need to add include_file_path to IR", - "tests/unit/io/test_write.py::test_write_async[-0]": "Need to add include_file_path to IR", - "tests/unit/io/test_write.py::test_write_async[-2]": "Need to add include_file_path to IR", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", 
"tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", @@ -143,9 +140,11 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[False]": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_group_by_series_lit_22103[True]": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", + "tests/unit/operations/test_join.py::test_join_filter_pushdown_iejoin": "Row order differs due to multiple matches per left row index; join results are correct but unsorted", "tests/unit/operations/namespaces/string/test_pad.py::test_str_zfill_unicode_not_respected": "polars doesn't add zeros for unicode characters.", - "tests/unit/operations/test_rolling.py::test_rolling_group_by_empty_groups_by_take_6330": "Ordering difference, might be polars bug", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", @@ -156,10 +155,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/test_cse.py::test_nested_cache_no_panic_16553": "Needs https://github.com/rapidsai/cudf/issues/18630", "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", "tests/unit/test_predicates.py::test_predicate_pushdown_split_pushable": "Casting that raises not supported on GPU", - "tests/unit/io/test_scan.py::test_async_read_21945[scan_type0]": "Debug output on stderr doesn't match", - "tests/unit/io/test_scan.py::test_async_read_21945[scan_type1]": "Debug output on stderr doesn't match", - "tests/unit/io/test_scan.py::test_async_read_21945[scan_type2]": "Debug output on stderr doesn't match", - "tests/unit/io/test_scan.py::test_async_read_21945[scan_type3]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan_row_deletion.py::test_scan_row_deletion_skips_file_with_all_rows_deleted": "The test intentionally corrupts the parquet file, so we cannot read the row count from the header.", "tests/unit/io/test_multiscan.py::test_multiscan_row_index[scan_csv-write_csv-csv]": "Debug output on stderr doesn't match", "tests/unit/functions/range/test_linear_space.py::test_linear_space_date": "Needs https://github.com/pola-rs/polars/issues/23020", @@ -196,6 +191,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink 
names", # Tests performance difference of CPU engine "tests/unit/operations/test_join.py::test_join_where_eager_perf_21145": "Tests performance bug in CPU engine", + "tests/unit/operations/namespaces/list/test_list.py::test_list_struct_field_perf": "Tests CPU Engine perf", # The test may segfault with the legacy streaming engine. We should # remove this skip when all polars tests use the new streaming engine. "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine", diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index 82ba779b234..d8695cdb851 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -11,11 +11,13 @@ from polars import __version__ POLARS_VERSION = parse(__version__) - POLARS_LOWER_BOUND = parse("1.28") POLARS_VERSION_LT_129 = POLARS_VERSION < parse("1.29") POLARS_VERSION_LT_130 = POLARS_VERSION < parse("1.30") POLARS_VERSION_LT_131 = POLARS_VERSION < parse("1.31") +POLARS_VERSION_LT_132 = POLARS_VERSION < parse("1.32") +POLARS_VERSION_LT_1321 = POLARS_VERSION < parse("1.32.1") +POLARS_VERSION_LT_1323 = POLARS_VERSION < parse("1.32.3") def _ensure_polars_version() -> None: diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 5fadd6b8656..d50c2d08b24 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -21,7 +21,7 @@ requires-python = ">=3.10" dependencies = [ "nvidia-ml-py", "packaging", - "polars>=1.28,<1.32", + "polars>=1.28,<1.33", "pylibcudf==25.10.*,>=0.0.0a0", "typing-extensions; python_version < '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/cudf_polars/tests/dsl/test_serialization.py b/python/cudf_polars/tests/dsl/test_serialization.py index 3e6a75e45ba..b4b89992e67 100644 --- a/python/cudf_polars/tests/dsl/test_serialization.py +++ b/python/cudf_polars/tests/dsl/test_serialization.py @@ -12,7 +12,11 @@ from cudf_polars.dsl.expressions.boolean import BooleanFunction from cudf_polars.dsl.expressions.datetime import TemporalFunction from cudf_polars.dsl.expressions.string import StringFunction -from cudf_polars.utils.versions import POLARS_VERSION_LT_131 +from cudf_polars.utils.versions import ( + POLARS_VERSION_LT_131, + POLARS_VERSION_LT_132, + POLARS_VERSION_LT_1321, +) if not POLARS_VERSION_LT_131: from cudf_polars.dsl.expressions.struct import StructFunction @@ -46,8 +50,31 @@ def test_from_polars_all_names(function): polars_function = getattr(pl_expr, function.__name__) polars_names = [name for name in dir(polars_function) if not name.startswith("_")] # Check names advertised by polars are the same as we advertise - assert set(polars_names) == set(function.Name.__members__) - for name in function.Name: + polars_names_set = set(polars_names) + cudf_polars_names_set = set(function.Name.__members__) + if not POLARS_VERSION_LT_132 and function == StructFunction: + cudf_polars_names_set = cudf_polars_names_set - { + "FieldByIndex", + "MultipleFields", + } + if POLARS_VERSION_LT_1321 and function == TemporalFunction: + cudf_polars_names_set = cudf_polars_names_set - { + "DaysInMonth", + } + if POLARS_VERSION_LT_132 and function == BooleanFunction: + cudf_polars_names_set = cudf_polars_names_set - {"IsClose"} + assert polars_names_set == cudf_polars_names_set + names = function.Name + if not POLARS_VERSION_LT_132 and function == StructFunction: + names = set(names) - { + StructFunction.Name.FieldByIndex, + StructFunction.Name.MultipleFields, + } + if POLARS_VERSION_LT_1321 and function == TemporalFunction: + names = set(names) - {TemporalFunction.Name.DaysInMonth} + if POLARS_VERSION_LT_132 and function == BooleanFunction: + names = set(names) - {BooleanFunction.Name.IsClose} + for name in names: attr = getattr(polars_function, name.name) assert function.Name.from_polars(attr) == name diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 200b9571c7f..236a35935b8 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -12,7 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils.versions import POLARS_VERSION_LT_130 +from cudf_polars.utils.versions import POLARS_VERSION_LT_130, POLARS_VERSION_LT_132 if TYPE_CHECKING: from collections.abc import Callable @@ -242,3 +242,15 @@ def test_expr_is_in_empty_list(): ldf = pl.LazyFrame({"a": [1, 2, 3, 4]}) q = ldf.select(pl.col("a").is_in([])) assert_gpu_result_equal(q) + + +def test_boolean_is_close(request): + request.applymarker( + pytest.mark.xfail( + condition=POLARS_VERSION_LT_132, reason="Not supported until polars 1.32" + ) + ) + ldf = pl.LazyFrame({"a": [1.0, 1.2, 1.4, 1.45, 1.6]}) + q = ldf.select(pl.col("a").is_close(1.4, abs_tol=0.1)) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 3f1874df702..21fa5779757 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ 
b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -18,6 +18,7 @@ POLARS_VERSION_LT_129, POLARS_VERSION_LT_130, POLARS_VERSION_LT_131, + POLARS_VERSION_LT_132, ) @@ -440,7 +441,7 @@ def test_invalid_regex_raises(): ) -@pytest.mark.parametrize("pattern", ["a{1000}", "a(?i:B)"]) +@pytest.mark.parametrize("pattern", ["a{1000}", "a(?i:B)", ""]) def test_unsupported_regex_raises(pattern): df = pl.LazyFrame({"a": ["abc"]}) @@ -530,10 +531,15 @@ def test_string_zfill(fill, input_strings): q = ldf.select(pl.col("a").str.zfill(fill)) if fill is not None and fill < 0: + cudf_except = ( + pl.exceptions.InvalidOperationError + if not POLARS_VERSION_LT_132 + else pl.exceptions.ComputeError + ) assert_collect_raises( q, polars_except=pl.exceptions.InvalidOperationError, - cudf_except=pl.exceptions.ComputeError, + cudf_except=cudf_except, ) else: assert_gpu_result_equal(q) @@ -581,12 +587,19 @@ def test_string_zfill_column(fill): ).lazy() q = ldf.select(pl.col("input_strings").str.zfill(pl.col("fill"))) if fill is not None and fill < 0: + cudf_except = ( + ( + pl.exceptions.InvalidOperationError + if not POLARS_VERSION_LT_130 + else pl.exceptions.ComputeError + ) + if POLARS_VERSION_LT_132 + else () + ) assert_collect_raises( q, polars_except=pl.exceptions.InvalidOperationError, - cudf_except=pl.exceptions.InvalidOperationError - if not POLARS_VERSION_LT_130 - else pl.exceptions.ComputeError, + cudf_except=cudf_except, ) else: assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_cache.py b/python/cudf_polars/tests/test_cache.py index 90c14911287..e242cd52cb2 100644 --- a/python/cudf_polars/tests/test_cache.py +++ b/python/cudf_polars/tests/test_cache.py @@ -2,15 +2,24 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import pytest + import polars as pl from cudf_polars import Translator from cudf_polars.dsl import ir from cudf_polars.dsl.traversal import traversal from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.utils.versions import POLARS_VERSION_LT_1323 -def test_cache(): +def test_cache(request): + request.applymarker( + pytest.mark.xfail( + condition=not POLARS_VERSION_LT_1323, + reason="python no longer manages cache hits", + ) + ) df1 = pl.LazyFrame( { "a": [1, 2, 3, 4, 5, 6, 7], diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index b47cc2962db..38a1cee6ab7 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -14,6 +14,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils.versions import POLARS_VERSION_LT_1321 @pytest.fixture @@ -213,7 +214,13 @@ def test_groupby_nan_minmax_raises(op): pl.lit([[4, 5, 6]]).alias("value"), marks=pytest.mark.xfail(reason="Need to expose OtherScalar in rust IR"), ), - pl.Series("value", [[4, 5, 6]], dtype=pl.List(pl.Int32)), + pytest.param( + pl.Series("value", [[4, 5, 6]], dtype=pl.List(pl.Int32)), + marks=pytest.mark.xfail( + condition=not POLARS_VERSION_LT_1321, + reason="https://github.com/rapidsai/cudf/issues/19610", + ), + ), pl.col("float") * (1 - pl.col("int")), [pl.lit(2).alias("value"), pl.col("float") * 2], ], diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index f7d67341bcb..0670e6c5643 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -11,6 +11,7 @@ assert_ir_translation_raises, get_default_engine, ) +from cudf_polars.utils.versions import 
POLARS_VERSION_LT_132


@@ -157,3 +158,17 @@ def test_join_where(left, right, conditions, zlice):
     # therefore we only check the length
     assert_gpu_result_equal(q_len)
+
+
+def test_cross_join_empty_right_table(request):
+    request.applymarker(
+        pytest.mark.xfail(condition=POLARS_VERSION_LT_132, reason="nested loop join")
+    )
+    a = pl.LazyFrame({"a": [1, 2, 3], "x": [7, 2, 1]})
+    b = pl.LazyFrame({"b": [2, 2, 2], "x": [7, 1, 3]})
+
+    q = a.join(b, how="cross").filter(
+        (pl.col("a") == pl.col("a")) & (pl.col("b") < pl.col("b"))
+    )
+
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
index 84641be497f..8481105baad 100644
--- a/python/cudf_polars/tests/test_scan.py
+++ b/python/cudf_polars/tests/test_scan.py
@@ -467,7 +467,14 @@ def test_scan_with_row_index(tmp_path: Path) -> None:
     df.write_csv(tmp_path / "test-0.csv")
     df.write_csv(tmp_path / "test-1.csv")

-    result = pl.scan_csv(
-        tmp_path / "test-*.csv", row_index_name="index", row_index_offset=0
-    )
-    assert_gpu_result_equal(result)
+    q = pl.scan_csv(tmp_path / "test-*.csv", row_index_name="index", row_index_offset=0)
+    assert_gpu_result_equal(q)
+
+
+def test_scan_from_file_uri(tmp_path: Path) -> None:
+    tmp_path.mkdir(exist_ok=True)
+    path = tmp_path / "out.parquet"
+    df = pl.DataFrame({"a": 1})
+    df.write_parquet(path)
+    q = pl.scan_parquet(f"file://{path}")
+    assert_ir_translation_raises(q, NotImplementedError)

From 3c36493bbdb39a62c3075fd343450cac526fc55e Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Tue, 19 Aug 2025 07:11:28 -0700
Subject: [PATCH 157/366] Fix filter call in benchmark (#19732)

https://github.com/rapidsai/cudf/pull/19502/ modified the API for
`cudf::filter` but did not update the `minmax_filter` benchmark accordingly.
I'm not sure how CI passed on that PR, since I thought we were building
benchmarks in CI; probably some race condition I'm not thinking of at the
moment.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Basit Ayantunde (https://github.com/lamarrr)

URL: https://github.com/rapidsai/cudf/pull/19732
---
 cpp/benchmarks/filter/minmax_filter.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/filter/minmax_filter.cpp b/cpp/benchmarks/filter/minmax_filter.cpp
index 069bfe67107..af3e02b7eb0 100644
--- a/cpp/benchmarks/filter/minmax_filter.cpp
+++ b/cpp/benchmarks/filter/minmax_filter.cpp
@@ -147,8 +147,14 @@ static void BM_filter_min_max(nvbench::state& state)
       cudf::apply_boolean_mask(input_table, filter_boolean->view(), stream, mr);
     } break;
     case engine_type::JIT: {
-      auto result = cudf::filter(
-        filter_inputs, udf, false, std::nullopt, std::vector<bool>{true, false, false}, stream, mr);
+      auto result = cudf::filter(filter_inputs,
+                                 udf,
+                                 false,
+                                 std::nullopt,
+                                 std::vector<bool>{true, false, false},
+                                 cudf::null_aware::NO,
+                                 stream,
+                                 mr);
     } break;
     default: CUDF_UNREACHABLE("Unrecognised engine type requested");
   }

From d3f2d4021e948723216df0266ac8e1a456cc3842 Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Tue, 19 Aug 2025 09:22:57 -0700
Subject: [PATCH 158/366] Add reduction with overflow detection (#19641)

Contributes to #19243

This PR adds overflow detection to `reduce`, enabling the `SUM_WITH_OVERFLOW`
aggregation kind.
It returns a struct scalar containing the sum result and a boolean flag indicating whether an overflow occurred. Currently, only `INT64` is supported. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/19641 --- cpp/CMakeLists.txt | 1 + .../cudf/detail/aggregation/aggregation.hpp | 4 +- cpp/include/cudf/reduction.hpp | 22 +- .../reduction/detail/reduction_functions.hpp | 21 ++ cpp/src/aggregation/aggregation.cpp | 4 + cpp/src/reductions/reductions.cpp | 27 +- cpp/src/reductions/sum_with_overflow.cu | 188 +++++++++++++ cpp/tests/reductions/reduction_tests.cpp | 249 ++++++++++++++++++ 8 files changed, 502 insertions(+), 14 deletions(-) create mode 100644 cpp/src/reductions/sum_with_overflow.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6b9601856c0..1598d7fb51b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -662,6 +662,7 @@ add_library( src/reductions/std.cu src/reductions/sum.cu src/reductions/sum_of_squares.cu + src/reductions/sum_with_overflow.cu src/reductions/var.cu src/replace/clamp.cu src/replace/nans.cu diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 81084f8bdfb..32f4d8c572e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -185,7 +185,9 @@ class sum_aggregation final : public rolling_aggregation, * @brief Derived class for specifying a sum_with_overflow aggregation */ class sum_with_overflow_aggregation final : public groupby_aggregation, - public groupby_scan_aggregation { + public groupby_scan_aggregation, + public reduce_aggregation, + public segmented_reduce_aggregation { public: sum_with_overflow_aggregation() : aggregation(SUM_WITH_OVERFLOW) {} diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp index e4150f4153a..216bf5b9734 100644 --- a/cpp/include/cudf/reduction.hpp +++ b/cpp/include/cudf/reduction.hpp @@ -39,10 +39,13 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE }; /** * @brief Computes the reduction of the values in all rows of a column. * - * This function does not detect overflows in reductions. When `output_type` - * does not match the `col.type()`, their values may be promoted to - * `int64_t` or `double` for computing aggregations and then cast to - * `output_type` before returning. + * This function does not detect overflows in reductions except for the `SUM_WITH_OVERFLOW` + * aggregation. When `output_type` does not match the `col.type()`, their values may be promoted to + * `int64_t` or `double` for computing aggregations and then cast to `output_type` before returning. + * + * The `SUM_WITH_OVERFLOW` aggregation is a special case that detects integer + * overflow during summation of `int64_t` values and returns a struct containing + * both the sum result and an overflow flag. * * Only `min` and `max` ops are supported for reduction of non-arithmetic * types (e.g. timestamp or string). 
@@ -61,6 +64,7 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE };
  * | Aggregation | Output Type | Init Value | Empty Input | Comments |
  * | :---------: | ----------- | :--------: | ----------- | -------- |
  * | SUM/PRODUCT | output_type | yes | NA | Input accumulated into output_type variable |
+ * | SUM_WITH_OVERFLOW | STRUCT{INT64,BOOL8} | yes | {null,false} | {sum, overflow_flag}, input must be INT64 |
  * | SUM_OF_SQUARES | output_type | no | NA | Input accumulated into output_type variable |
  * | MIN/MAX | col.type | yes | NA | Supports arithmetic, timestamp, duration, string types only |
  * | ANY/ALL | BOOL8 | yes | True for ALL only | Checks for non-zero elements |
@@ -84,6 +88,8 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE };
  * @throw std::invalid_argument if `any` or `all` reduction is called and the output type is not BOOL8.
  * @throw std::invalid_argument if `mean`, `var`, or `std` reduction is called and
  * the `output_type` is not floating point.
+ * @throw std::invalid_argument if `sum_with_overflow` reduction is called and the
+ * input column type is not `INT64` or the `output_dtype` is not `STRUCT`.
  *
  * @param col Input column view
  * @param agg Aggregation operator applied by the reduction
@@ -103,13 +109,15 @@ std::unique_ptr<scalar> reduce(
 /**
  * @brief Computes the reduction of the values in all rows of a column with an initial value
  *
- * Only `sum`, `product`, `min`, `max`, `any`, and `all` reductions are supported.
+ * Only `sum`, `product`, `min`, `max`, `any`, `all`, and `sum_with_overflow` reductions are
+ * supported. For `sum_with_overflow`, the initial value is added to the sum and overflow
+ * detection is performed throughout the entire computation.
  *
  * @see cudf::reduce(column_view const&,reduce_aggregation
 * const&,data_type,rmm::cuda_stream_view,rmm::device_async_resource_ref) for more details
  *
- * @throw std::invalid_argument if reduction is not `sum`, `product`, `min`, `max`, `any`, or `all`
- * and `init` is specified.
+ * @throw std::invalid_argument if reduction is not `sum`, `product`, `min`, `max`, `any`, `all`,
+ * or `sum_with_overflow` and `init` is specified.
  *
  * @param col Input column view
  * @param agg Aggregation operator applied by the reduction
diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
index 910e3b9c2e3..d7c49acca32 100644
--- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
@@ -51,6 +51,27 @@ std::unique_ptr<scalar> sum(column_view const& col,
                             rmm::cuda_stream_view stream,
                             rmm::device_async_resource_ref mr);

+/**
+ * @brief Computes the sum of int64_t elements in the input column, with overflow detection
+ *
+ * Returns a struct scalar with {sum: int64_t, overflow: bool} fields.
+ * Only supports int64_t input columns.
+ *
+ * @throw std::invalid_argument if input column type is not int64_t
+ *
+ * @param col input column to compute sum with overflow detection (must be int64_t)
+ * @param output_type data type of return type (must be struct)
+ * @param init initial value of the sum
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned scalar's device memory
+ * @return Struct scalar with sum and overflow flag
+ */
+std::unique_ptr<scalar> sum_with_overflow(column_view const& col,
+                                          data_type const output_type,
+                                          std::optional<std::reference_wrapper<scalar const>> init,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::device_async_resource_ref mr);
+
 /**
  * @brief Computes minimum of elements in input column
  *
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index cb3a2b80ae4..39355983ed4 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -480,6 +480,10 @@
 template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
 make_sum_with_overflow_aggregation<groupby_aggregation>();
 template CUDF_EXPORT std::unique_ptr<groupby_scan_aggregation>
 make_sum_with_overflow_aggregation<groupby_scan_aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_sum_with_overflow_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
+make_sum_with_overflow_aggregation<segmented_reduce_aggregation>();

 /// Factory to create a PRODUCT aggregation
 template
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index 4c75cc312ba..2c1c582091b 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -16,6 +16,8 @@
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
@@ -49,6 +51,13 @@ std::unique_ptr<scalar> reduce_aggregate_impl(
 {
   switch (agg.kind) {
     case aggregation::SUM: return sum(col, output_dtype, init, stream, mr);
+    case aggregation::SUM_WITH_OVERFLOW: {
+      // Validate that input column is int64_t (unified validation for SUM_WITH_OVERFLOW)
+      CUDF_EXPECTS(col.type().id() == cudf::type_id::INT64,
+                   "SUM_WITH_OVERFLOW aggregation only supports int64_t input types",
+                   std::invalid_argument);
+      return sum_with_overflow(col, output_dtype, init, stream, mr);
+    }
     case aggregation::PRODUCT: return product(col, output_dtype, init, stream, mr);
     case aggregation::MIN: return min(col, output_dtype, init, stream, mr);
     case aggregation::MAX: return max(col, output_dtype, init, stream, mr);
@@ -173,6 +182,10 @@ std::unique_ptr<scalar> reduce_no_data_impl(reduce_aggregation const& agg,
       auto valid = !col.is_empty() && (nunique_agg._null_handling == cudf::null_policy::INCLUDE);
       return std::make_unique<numeric_scalar<size_type>>(!col.is_empty(), valid, stream, mr);
     }
+    case aggregation::SUM_WITH_OVERFLOW: {
+      // For empty input, return {null, false} struct
+      return sum_with_overflow(col, output_dtype, std::nullopt, stream, mr);
+    }
     default: {
       return cudf::is_nested(output_dtype) ?
make_empty_scalar_like(col, stream, mr)
                                           : make_default_constructed_scalar(output_dtype, stream, mr);
    }
  }
}
@@ -192,13 +205,14 @@ std::unique_ptr<scalar> reduce(column_view const& col,
   CUDF_EXPECTS(!init.has_value() || cudf::have_same_types(col, init.value().get()),
                "column and initial value must be the same type",
                cudf::data_type_error);
-  if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
-                            agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
-                            agg.kind == aggregation::ANY || agg.kind == aggregation::ALL ||
-                            agg.kind == aggregation::HOST_UDF)) {
+  if (init.has_value() &&
+      !(agg.kind == aggregation::SUM || agg.kind == aggregation::SUM_WITH_OVERFLOW ||
+        agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN ||
+        agg.kind == aggregation::MAX || agg.kind == aggregation::ANY ||
+        agg.kind == aggregation::ALL || agg.kind == aggregation::HOST_UDF)) {
     CUDF_FAIL(
-      "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF "
-      "aggregation types",
+      "Initial value is only supported for SUM, SUM_WITH_OVERFLOW, PRODUCT, MIN, MAX, ANY, ALL, "
+      "and HOST_UDF aggregation types",
       std::invalid_argument);
   }
@@ -230,4 +244,5 @@
   CUDF_FUNC_RANGE();
   return reduction::detail::reduce(col, agg, output_dtype, init, stream, mr);
 }
+
 } // namespace cudf
diff --git a/cpp/src/reductions/sum_with_overflow.cu b/cpp/src/reductions/sum_with_overflow.cu
new file mode 100644
index 00000000000..5f98fc28443
--- /dev/null
+++ b/cpp/src/reductions/sum_with_overflow.cu
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace cudf::reduction::detail {
+
+// Simple pair to hold sum and overflow flag
+struct sum_overflow_result {
+  int64_t sum;
+  bool overflow;
+
+  CUDF_HOST_DEVICE sum_overflow_result() : sum(0), overflow(false) {}
+  CUDF_HOST_DEVICE sum_overflow_result(int64_t s, bool o) : sum(s), overflow(o) {}
+};
+
+// Binary operator for combining sum_overflow_result values
+struct overflow_sum_op {
+  __device__ sum_overflow_result operator()(sum_overflow_result const& lhs,
+                                            sum_overflow_result const& rhs) const
+  {
+    // If either operand already has overflow, result has overflow
+    if (lhs.overflow || rhs.overflow) {
+      // Still compute the sum for consistency, but mark as overflow.
+      // Unsigned arithmetic wraps without invoking undefined behavior.
+      return sum_overflow_result{
+        static_cast<int64_t>(static_cast<uint64_t>(lhs.sum) + static_cast<uint64_t>(rhs.sum)),
+        true};
+    }
+
+    // Check for overflow BEFORE performing the addition to avoid UB
+    bool overflow_detected = false;
+
+    // Check for positive overflow: would the addition exceed INT64_MAX?
+    if (rhs.sum > 0 && lhs.sum > cuda::std::numeric_limits<int64_t>::max() - rhs.sum) {
+      overflow_detected = true;
+    }
+    // Check for negative overflow: would the addition go below INT64_MIN?
+    else if (rhs.sum < 0 && lhs.sum < cuda::std::numeric_limits<int64_t>::min() - rhs.sum) {
+      overflow_detected = true;
+    }
+
+    // Perform the addition (safe if no overflow detected)
+    int64_t const result_sum = lhs.sum + rhs.sum;
+
+    return sum_overflow_result{result_sum, overflow_detected};
+  }
+};
+
+// Transform function to convert int64_t values to sum_overflow_result
+struct to_sum_overflow {
+  __device__ sum_overflow_result operator()(int64_t value) const
+  {
+    return sum_overflow_result{value, false};
+  }
+};
+
+// Transform functor for null-aware conversion using index
+struct null_aware_to_sum_overflow {
+  cudf::column_device_view const* dcol_ptr;
+
+  CUDF_HOST_DEVICE null_aware_to_sum_overflow(cudf::column_device_view const* dcol) : dcol_ptr(dcol)
+  {
+  }
+
+  __device__ sum_overflow_result operator()(cudf::size_type idx) const
+  {
+    return dcol_ptr->is_valid(idx) ? sum_overflow_result{dcol_ptr->element<int64_t>(idx), false}
+                                   : sum_overflow_result{0, false};
+  }
+};
+
+std::unique_ptr<scalar> sum_with_overflow(
+  column_view const& col,
+  cudf::data_type const output_dtype,
+  std::optional<std::reference_wrapper<scalar const>> init,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+
+  // SUM_WITH_OVERFLOW only supports int64_t input
+  CUDF_EXPECTS(col.type().id() == cudf::type_id::INT64,
+               "SUM_WITH_OVERFLOW only supports int64_t input types",
+               std::invalid_argument);
+
+  // Handle empty column
+  if (col.size() == 0 || col.size() == col.null_count()) {
+    // Create struct with {null sum, false overflow}
+    auto sum_scalar =
+      cudf::make_default_constructed_scalar(cudf::data_type{cudf::type_id::INT64}, stream, mr);
+    sum_scalar->set_valid_async(false, stream);
+    auto overflow_scalar = cudf::make_fixed_width_scalar(false, stream, mr);
+
+    std::vector<std::unique_ptr<cudf::column>> children;
+    children.push_back(cudf::make_column_from_scalar(*sum_scalar, 1, stream, mr));
+    children.push_back(cudf::make_column_from_scalar(*overflow_scalar, 1, stream, mr));
+
+    // Use host_span of column_views instead of table_view to avoid double wrapping
+    std::vector<cudf::column_view> child_views;
+    child_views.push_back(children[0]->view());
+    child_views.push_back(children[1]->view());
+
+    return cudf::make_struct_scalar(
+      cudf::host_span<cudf::column_view const>{child_views}, stream, mr);
+  }
+
+  // Create device view
+  auto dcol = cudf::column_device_view::create(col, stream);
+
+  // Set up initial value
+  sum_overflow_result initial_value{0, false};
+  if (init.has_value() && init.value().get().is_valid(stream)) {
+    auto const& init_scalar = static_cast<cudf::numeric_scalar<int64_t> const&>(init.value().get());
+    initial_value.sum       = init_scalar.value(stream);
+  }
+
+  // Perform the reduction using thrust::transform_reduce
+  auto counting_iter = thrust::make_counting_iterator(0);
+  auto dcol_ptr      = dcol.get();
+  sum_overflow_result result;
+
+  if (col.has_nulls()) {
+    // Use null-aware transform functor
+    result = thrust::transform_reduce(rmm::exec_policy_nosync(stream),
+                                      counting_iter,
+                                      counting_iter + col.size(),
+                                      null_aware_to_sum_overflow{dcol_ptr},
+                                      initial_value,
+                                      overflow_sum_op{});
+  } else {
+    // Use direct iterator for non-null case
+    auto input_iter = dcol->begin<int64_t>();
+    result          = thrust::transform_reduce(rmm::exec_policy_nosync(stream),
+                                      input_iter,
+                                      input_iter + col.size(),
+                                      to_sum_overflow{},
+                                      initial_value,
+                                      overflow_sum_op{});
+  }
+
+  // Create result struct scalar with {sum: int64_t, overflow: bool}
+  auto sum_scalar      = cudf::make_fixed_width_scalar(result.sum, stream, mr);
+  auto overflow_scalar = cudf::make_fixed_width_scalar(result.overflow, stream, mr);
+
+  // Create struct scalar using
cudf::make_struct_scalar with host_span of column_views
+  std::vector<std::unique_ptr<cudf::column>> children;
+  children.push_back(cudf::make_column_from_scalar(*sum_scalar, 1, stream, mr));
+  children.push_back(cudf::make_column_from_scalar(*overflow_scalar, 1, stream, mr));
+
+  // Use host_span of column_views instead of table_view to avoid double wrapping
+  std::vector<cudf::column_view> child_views;
+  child_views.push_back(children[0]->view());
+  child_views.push_back(children[1]->view());
+
+  return cudf::make_struct_scalar(
+    cudf::host_span<cudf::column_view const>{child_views}, stream, mr);
+}
+
+}  // namespace cudf::reduction::detail
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index ff0806d6dfa..ec22328e84a 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -3173,4 +3173,253 @@ TEST_F(StructReductionTest, StructReductionMinMaxWithNulls)
   }
 }
 
+// Test for SUM_WITH_OVERFLOW aggregation using regular reduce() function
+struct ReduceWithOverflowTest : public cudf::test::BaseFixture {
+  // Helper function to extract sum and overflow from struct scalar returned by reduce()
+  std::pair<std::unique_ptr<cudf::scalar>, std::unique_ptr<cudf::scalar>> extract_sum_overflow(
+    std::unique_ptr<cudf::scalar> const& result)
+  {
+    EXPECT_TRUE(result->is_valid());
+    EXPECT_EQ(result->type().id(), cudf::type_id::STRUCT);
+
+    auto struct_scalar_ptr = static_cast<cudf::struct_scalar const*>(result.get());
+    auto table_view        = struct_scalar_ptr->view();
+
+    EXPECT_EQ(table_view.num_columns(), 2);
+    EXPECT_EQ(table_view.column(0).size(), 1);
+    EXPECT_EQ(table_view.column(1).size(), 1);
+
+    auto sum_result    = cudf::get_element(table_view.column(0), 0);
+    auto overflow_flag = cudf::get_element(table_view.column(1), 0);
+    return std::make_pair(std::move(sum_result), std::move(overflow_flag));
+  }
+};
+
+TEST_F(ReduceWithOverflowTest, SumWithoutOverflow)
+{
+  std::vector<int64_t> values{1, 2, 3, 4, 5};
+  cudf::test::fixed_width_column_wrapper<int64_t> col(values.begin(), values.end());
+
+  auto result = cudf::reduce(col,
+                             *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                             cudf::data_type{cudf::type_id::STRUCT});
+
+  auto [sum_result, overflow_flag] = extract_sum_overflow(result);
+
+  EXPECT_TRUE(sum_result->is_valid());
+  EXPECT_TRUE(overflow_flag->is_valid());
+
+  auto sum_value = static_cast<cudf::numeric_scalar<int64_t> const*>(sum_result.get())->value();
+  auto overflow_value =
+    static_cast<cudf::numeric_scalar<bool> const*>(overflow_flag.get())->value();
+
+  EXPECT_EQ(sum_value, 15);      // 1+2+3+4+5 = 15
+  EXPECT_FALSE(overflow_value);  // No overflow expected
+}
+
+TEST_F(ReduceWithOverflowTest, PositiveOverflow)
+{
+  std::vector<int64_t> positive_overflow_values{std::numeric_limits<int64_t>::max(),
+                                                1};  // max + 1 should overflow
+  cudf::test::fixed_width_column_wrapper<int64_t> col(positive_overflow_values.begin(),
+                                                      positive_overflow_values.end());
+
+  auto result = cudf::reduce(col,
+                             *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                             cudf::data_type{cudf::type_id::STRUCT});
+
+  auto [sum_result, overflow_flag] = extract_sum_overflow(result);
+
+  EXPECT_TRUE(sum_result->is_valid());
+  EXPECT_TRUE(overflow_flag->is_valid());
+
+  auto overflow_value =
+    static_cast<cudf::numeric_scalar<bool> const*>(overflow_flag.get())->value();
+
+  EXPECT_TRUE(overflow_value);  // Should detect positive overflow
+}
+
+TEST_F(ReduceWithOverflowTest, NegativeOverflow)
+{
+  std::vector<int64_t> negative_overflow_values{std::numeric_limits<int64_t>::min(),
+                                                -1};  // min - 1 should overflow
+  cudf::test::fixed_width_column_wrapper<int64_t> col(negative_overflow_values.begin(),
+                                                      negative_overflow_values.end());
+
+  auto result = cudf::reduce(col,
+                             *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                             cudf::data_type{cudf::type_id::STRUCT});
+
+  auto [sum_result, overflow_flag] = extract_sum_overflow(result);
+
+  EXPECT_TRUE(sum_result->is_valid());
+  EXPECT_TRUE(overflow_flag->is_valid());
+
+  auto overflow_value =
+    static_cast<cudf::numeric_scalar<bool> const*>(overflow_flag.get())->value();
+
+  EXPECT_TRUE(overflow_value);  // Should detect negative overflow
+}
+
+TEST_F(ReduceWithOverflowTest, AccumulatingOverflow)
+{
+  // Use large values that when accumulated could cause overflow
+  std::vector<int64_t> accumulating_overflow{
+    std::numeric_limits<int64_t>::max() / 3,
+    std::numeric_limits<int64_t>::max() / 3,
+    std::numeric_limits<int64_t>::max() / 3,
+    std::numeric_limits<int64_t>::max() / 3};  // This should overflow
+  cudf::test::fixed_width_column_wrapper<int64_t> col(accumulating_overflow.begin(),
+                                                      accumulating_overflow.end());
+
+  auto result = cudf::reduce(col,
+                             *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                             cudf::data_type{cudf::type_id::STRUCT});
+
+  auto [sum_result, overflow_flag] = extract_sum_overflow(result);
+
+  EXPECT_TRUE(sum_result->is_valid());
+  EXPECT_TRUE(overflow_flag->is_valid());
+
+  auto overflow_value =
+    static_cast<cudf::numeric_scalar<bool> const*>(overflow_flag.get())->value();
+
+  // Should detect overflow since we're adding 4 * (max/3) which > max
+  EXPECT_TRUE(overflow_value);  // Should detect accumulating overflow
+}
+
+TEST_F(ReduceWithOverflowTest, EmptyColumn)
+{
+  cudf::test::fixed_width_column_wrapper<int64_t> empty_col{};
+
+  auto result = cudf::reduce(empty_col,
+                             *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                             cudf::data_type{cudf::type_id::STRUCT});
+
+  auto [sum_result, overflow_flag] = extract_sum_overflow(result);
+
+  EXPECT_FALSE(sum_result->is_valid());  // Should be null for empty input
+  EXPECT_TRUE(overflow_flag->is_valid());
+
+  auto overflow_value =
+    static_cast<cudf::numeric_scalar<bool> const*>(overflow_flag.get())->value();
+  EXPECT_FALSE(overflow_value);  // No overflow for empty input
+}
+
+TEST_F(ReduceWithOverflowTest, AllNullColumn)
+{
+  std::vector<int64_t> values{1, 2, 3};
+  std::vector<bool> validity{false, false, false};
+  cudf::test::fixed_width_column_wrapper<int64_t> null_col(
+    values.begin(), values.end(), validity.begin());
+
+  auto result = cudf::reduce(null_col,
+                             *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                             cudf::data_type{cudf::type_id::STRUCT});
+
+  auto [sum_result, overflow_flag] = extract_sum_overflow(result);
+
+  EXPECT_FALSE(sum_result->is_valid());  // Should be null for all-null input
+  EXPECT_TRUE(overflow_flag->is_valid());
+
+  auto overflow_value =
+    static_cast<cudf::numeric_scalar<bool> const*>(overflow_flag.get())->value();
+  EXPECT_FALSE(overflow_value);  // No overflow for all-null input
+}
+
+TEST_F(ReduceWithOverflowTest, WithInitialValue)
+{
+  std::vector<int64_t> values{1, 2, 3};
+  cudf::test::fixed_width_column_wrapper<int64_t> col(values.begin(), values.end());
+  auto init_scalar = cudf::make_fixed_width_scalar<int64_t>(10);
+
+  auto result = cudf::reduce(col,
+                             *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                             cudf::data_type{cudf::type_id::STRUCT},
+                             *init_scalar);
+
+  auto [sum_result, overflow_flag] = extract_sum_overflow(result);
+
+  EXPECT_TRUE(sum_result->is_valid());
+  EXPECT_TRUE(overflow_flag->is_valid());
+
+  auto sum_value = static_cast<cudf::numeric_scalar<int64_t> const*>(sum_result.get())->value();
+  auto overflow_value =
+    static_cast<cudf::numeric_scalar<bool> const*>(overflow_flag.get())->value();
+
+  EXPECT_EQ(sum_value, 16);      // 10 + 1 + 2 + 3 = 16
+  EXPECT_FALSE(overflow_value);  // No overflow expected
+}
+
+TEST_F(ReduceWithOverflowTest, InitialValuePositiveOverflow)
+{
+  std::vector<int64_t> values{1, 2, 3};
+  cudf::test::fixed_width_column_wrapper<int64_t> col(values.begin(), values.end());
+  auto init_scalar = cudf::make_fixed_width_scalar(std::numeric_limits<int64_t>::max() -
+                                                   3);  // max - 3 + 6 = max + 3 (overflow)
+
+  auto result = cudf::reduce(col,
+                             *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                             cudf::data_type{cudf::type_id::STRUCT},
+                             *init_scalar);
+
+  auto [sum_result, overflow_flag] = extract_sum_overflow(result);
+
+  EXPECT_TRUE(sum_result->is_valid());
+  EXPECT_TRUE(overflow_flag->is_valid());
+
+  auto overflow_value =
+    static_cast<cudf::numeric_scalar<bool> const*>(overflow_flag.get())->value();
+
+  // (max - 3) + 1 + 2 + 3 = max + 3, which should overflow
+  EXPECT_TRUE(overflow_value);  // Should detect overflow with initial value
+}
+
+TEST_F(ReduceWithOverflowTest, InitialValueNegativeOverflow)
+{
+  std::vector<int64_t> values{-1, -2, -3};
+  cudf::test::fixed_width_column_wrapper<int64_t> col(values.begin(), values.end());
+  auto init_scalar = cudf::make_fixed_width_scalar(std::numeric_limits<int64_t>::min() +
+                                                   3);  // min + 3 - 6 = min - 3 (overflow)
+
+  auto result = cudf::reduce(col,
+                             *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                             cudf::data_type{cudf::type_id::STRUCT},
+                             *init_scalar);
+
+  auto [sum_result, overflow_flag] = extract_sum_overflow(result);
+
+  EXPECT_TRUE(sum_result->is_valid());
+  EXPECT_TRUE(overflow_flag->is_valid());
+
+  auto overflow_value =
+    static_cast<cudf::numeric_scalar<bool> const*>(overflow_flag.get())->value();
+
+  // (min + 3) + (-1) + (-2) + (-3) = min - 3, which should overflow
+  EXPECT_TRUE(overflow_value);  // Should detect negative overflow with initial value
+}
+
+TEST_F(ReduceWithOverflowTest, ErrorHandlingNonInt64)
+{
+  std::vector<int32_t> int32_values{1, 2, 3};
+  cudf::test::fixed_width_column_wrapper<int32_t> int32_col(int32_values.begin(),
+                                                            int32_values.end());
+
+  EXPECT_THROW(cudf::reduce(int32_col,
+                            *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                            cudf::data_type{cudf::type_id::STRUCT}),
+               std::invalid_argument);
+}
+
+TEST_F(ReduceWithOverflowTest, ErrorHandlingNonArithmetic)
+{
+  std::vector<std::string> string_values{"a", "b", "c"};
+  cudf::test::strings_column_wrapper string_col(string_values.begin(), string_values.end());
+
+  EXPECT_THROW(cudf::reduce(string_col,
+                            *cudf::make_sum_with_overflow_aggregation<cudf::reduce_aggregation>(),
+                            cudf::data_type{cudf::type_id::STRUCT}),
+               std::invalid_argument);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From c5b93f18fe07851a023d4573cf5b9b2bb26b73fb Mon Sep 17 00:00:00 2001
From: Avinash Raj
Date: Tue, 19 Aug 2025 22:23:33 +0530
Subject: [PATCH 159/366] Updated libcudf-example conda package to preserve
 directories structure (#19440)

Currently, the `libcudf-example` conda package installs all example files
into a single directory, making it hard to identify which files belong to
each example. This PR resolves the issue by organizing the files into
respective subfolders.
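For illustration, the resulting install layout should look roughly like the
sketch below (directory names are taken from the install rules in the diffs
that follow; the per-example file lists are abridged):

    bin/examples/libcudf/
        basic/                basic_example, 4stock_5day.csv
        billion_rows/         brc, brc_chunks, brc_pipeline
        nested_types/         deduplication, example.json
        parquet_io/           parquet_io, parquet_io_multithreaded, example.parquet
        string_transformers/  *_jit and *_precompiled transforms, info.csv
        strings/              libcudf_apis, custom_* samples, names.csv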
closes #19360 Authors: - Avinash Raj (https://github.com/Avinash-Raj) - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Nghia Truong (https://github.com/ttnghia) - Muhammad Haseeb (https://github.com/mhaseeb123) - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19440 --- ci/run_cudf_examples.sh | 10 ++++++++++ cpp/examples/basic/CMakeLists.txt | 4 ++-- cpp/examples/billion_rows/CMakeLists.txt | 6 +++--- cpp/examples/nested_types/CMakeLists.txt | 4 ++-- cpp/examples/parquet_io/CMakeLists.txt | 6 +++--- cpp/examples/string_transforms/CMakeLists.txt | 18 ++++++++++-------- cpp/examples/strings/CMakeLists.txt | 10 +++++----- 7 files changed, 35 insertions(+), 23 deletions(-) diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh index 24de63ef45f..cc202fa9d56 100755 --- a/ci/run_cudf_examples.sh +++ b/ci/run_cudf_examples.sh @@ -9,14 +9,21 @@ trap "EXITCODE=1" ERR # Support customizing the examples' install location cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/examples/libcudf/" || exit +cd basic || exit compute-sanitizer --tool memcheck basic_example +cd .. +cd nested_types || exit compute-sanitizer --tool memcheck deduplication +cd .. +cd strings || exit compute-sanitizer --tool memcheck custom_optimized names.csv compute-sanitizer --tool memcheck custom_prealloc names.csv compute-sanitizer --tool memcheck custom_with_malloc names.csv +cd .. +cd string_transformers || exit compute-sanitizer --tool memcheck compute_checksum_jit info.csv output.csv compute-sanitizer --tool memcheck extract_email_jit info.csv output.csv compute-sanitizer --tool memcheck extract_email_precompiled info.csv output.csv @@ -24,11 +31,14 @@ compute-sanitizer --tool memcheck format_phone_jit info.csv output.csv compute-sanitizer --tool memcheck format_phone_precompiled info.csv output.csv compute-sanitizer --tool memcheck localize_phone_jit info.csv output.csv compute-sanitizer --tool memcheck localize_phone_precompiled info.csv output.csv +cd .. +cd parquet_io || exit compute-sanitizer --tool memcheck parquet_io example.parquet compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 DEVICE_BUFFER 2 2 +cd .. 
exit ${EXITCODE}
diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt
index 447c8709297..6e9604ff15e 100644
--- a/cpp/examples/basic/CMakeLists.txt
+++ b/cpp/examples/basic/CMakeLists.txt
@@ -28,5 +28,5 @@ add_executable(basic_example src/process_csv.cpp)
 target_link_libraries(basic_example PRIVATE cudf::cudf)
 target_compile_features(basic_example PRIVATE cxx_std_20)
 
-install(TARGETS basic_example DESTINATION bin/examples/libcudf)
-install(FILES ${CMAKE_CURRENT_LIST_DIR}/4stock_5day.csv DESTINATION bin/examples/libcudf)
+install(TARGETS basic_example DESTINATION bin/examples/libcudf/basic)
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/4stock_5day.csv DESTINATION bin/examples/libcudf/basic)
diff --git a/cpp/examples/billion_rows/CMakeLists.txt b/cpp/examples/billion_rows/CMakeLists.txt
index 962ff79c537..ed83be6216a 100644
--- a/cpp/examples/billion_rows/CMakeLists.txt
+++ b/cpp/examples/billion_rows/CMakeLists.txt
@@ -35,7 +35,7 @@ target_link_libraries(
   $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
 target_compile_features(brc PRIVATE cxx_std_20)
-install(TARGETS brc DESTINATION bin/examples/libcudf)
+install(TARGETS brc DESTINATION bin/examples/libcudf/billion_rows)
 
 add_executable(brc_chunks brc_chunks.cpp)
 target_link_libraries(
@@ -43,7 +43,7 @@ target_link_libraries(
   $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
 target_compile_features(brc_chunks PRIVATE cxx_std_20)
-install(TARGETS brc_chunks DESTINATION bin/examples/libcudf)
+install(TARGETS brc_chunks DESTINATION bin/examples/libcudf/billion_rows)
 
 add_executable(brc_pipeline brc_pipeline.cpp)
 target_link_libraries(
@@ -51,4 +51,4 @@ target_link_libraries(
   $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
 target_compile_features(brc_pipeline PRIVATE cxx_std_20)
-install(TARGETS brc_pipeline DESTINATION bin/examples/libcudf)
+install(TARGETS brc_pipeline DESTINATION bin/examples/libcudf/billion_rows)
diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt
index e91a85d4ee0..6532585f496 100644
--- a/cpp/examples/nested_types/CMakeLists.txt
+++ b/cpp/examples/nested_types/CMakeLists.txt
@@ -28,5 +28,5 @@ add_executable(deduplication deduplication.cpp)
 target_link_libraries(deduplication PRIVATE cudf::cudf)
 target_compile_features(deduplication PRIVATE cxx_std_20)
 
-install(TARGETS deduplication DESTINATION bin/examples/libcudf)
-install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.json DESTINATION bin/examples/libcudf)
+install(TARGETS deduplication DESTINATION bin/examples/libcudf/nested_types)
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.json DESTINATION bin/examples/libcudf/nested_types)
diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt
index 4f520b2bdad..3c381d1b4c0 100644
--- a/cpp/examples/parquet_io/CMakeLists.txt
+++ b/cpp/examples/parquet_io/CMakeLists.txt
@@ -34,7 +34,7 @@ target_link_libraries(
   $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
 target_compile_features(parquet_io PRIVATE cxx_std_20)
-install(TARGETS parquet_io DESTINATION bin/examples/libcudf)
+install(TARGETS parquet_io DESTINATION bin/examples/libcudf/parquet_io)
 
 # Build and install parquet_io_multithreaded
 add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp)
 target_link_libraries(
@@ -43,7 +43,7 @@ target_link_libraries(
   $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
 target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_20)
-install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf)
+install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf/parquet_io)
 
 # Install the example.parquet file
-install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf)
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet
+        DESTINATION bin/examples/libcudf/parquet_io)
diff --git a/cpp/examples/string_transforms/CMakeLists.txt b/cpp/examples/string_transforms/CMakeLists.txt
index 90830eb2820..fb31c93ba21 100644
--- a/cpp/examples/string_transforms/CMakeLists.txt
+++ b/cpp/examples/string_transforms/CMakeLists.txt
@@ -33,7 +33,7 @@ target_compile_options(
 target_link_libraries(
   compute_checksum_jit PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
-install(TARGETS compute_checksum_jit DESTINATION bin/examples/libcudf)
+install(TARGETS compute_checksum_jit DESTINATION bin/examples/libcudf/string_transformers)
 
 add_executable(extract_email_jit extract_email_jit.cpp)
 target_compile_features(extract_email_jit PRIVATE cxx_std_20)
@@ -41,7 +41,7 @@ target_compile_options(extract_email_jit PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(
   extract_email_jit PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
-install(TARGETS extract_email_jit DESTINATION bin/examples/libcudf)
+install(TARGETS extract_email_jit DESTINATION bin/examples/libcudf/string_transformers)
 
 add_executable(extract_email_precompiled extract_email_precompiled.cpp)
 target_compile_features(extract_email_precompiled PRIVATE cxx_std_20)
@@ -51,13 +51,13 @@ target_compile_options(
 target_link_libraries(
   extract_email_precompiled PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
-install(TARGETS extract_email_precompiled DESTINATION bin/examples/libcudf)
+install(TARGETS extract_email_precompiled DESTINATION bin/examples/libcudf/string_transformers)
 
 add_executable(format_phone_jit format_phone_jit.cpp)
 target_compile_features(format_phone_jit PRIVATE cxx_std_20)
 target_compile_options(format_phone_jit PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(format_phone_jit PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>)
-install(TARGETS format_phone_jit DESTINATION bin/examples/libcudf)
+install(TARGETS format_phone_jit DESTINATION bin/examples/libcudf/string_transformers)
 
 add_executable(format_phone_precompiled format_phone_precompiled.cpp)
 target_compile_features(format_phone_precompiled PRIVATE cxx_std_20)
@@ -67,7 +67,7 @@ target_compile_options(
 target_link_libraries(
   format_phone_precompiled PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
-install(TARGETS format_phone_precompiled DESTINATION bin/examples/libcudf)
+install(TARGETS format_phone_precompiled DESTINATION bin/examples/libcudf/string_transformers)
 
 add_executable(localize_phone_jit localize_phone_jit.cpp)
 target_compile_features(localize_phone_jit PRIVATE cxx_std_20)
@@ -75,7 +75,7 @@ target_compile_options(localize_phone_jit PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(
   localize_phone_jit PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
-install(TARGETS localize_phone_jit DESTINATION bin/examples/libcudf)
+install(TARGETS localize_phone_jit DESTINATION bin/examples/libcudf/string_transformers)
 
 add_executable(localize_phone_precompiled localize_phone_precompiled.cpp)
 target_compile_features(localize_phone_precompiled PRIVATE cxx_std_20)
@@ -85,6 +85,8 @@ target_compile_options(
 target_link_libraries(
   localize_phone_precompiled PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
-install(TARGETS localize_phone_precompiled DESTINATION bin/examples/libcudf)
+install(TARGETS localize_phone_precompiled DESTINATION bin/examples/libcudf/string_transformers)
 
-install(FILES ${CMAKE_CURRENT_LIST_DIR}/info.csv DESTINATION bin/examples/libcudf)
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/info.csv
+        DESTINATION bin/examples/libcudf/string_transformers
+)
diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt
index 4e890118dcb..1a7eb60d571 100644
--- a/cpp/examples/strings/CMakeLists.txt
+++ b/cpp/examples/strings/CMakeLists.txt
@@ -28,7 +28,7 @@ list(APPEND CUDF_CUDA_FLAGS
 --expt-extended-lambda --expt-relaxed-constexpr)
 
 add_executable(libcudf_apis libcudf_apis.cpp)
 target_compile_features(libcudf_apis PRIVATE cxx_std_20)
 target_link_libraries(libcudf_apis PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>)
-install(TARGETS libcudf_apis DESTINATION bin/examples/libcudf)
+install(TARGETS libcudf_apis DESTINATION bin/examples/libcudf/strings)
 
 add_executable(custom_with_malloc custom_with_malloc.cu)
 target_compile_features(custom_with_malloc PRIVATE cxx_std_20)
@@ -36,18 +36,18 @@ target_compile_options(custom_with_malloc PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(
   custom_with_malloc PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>
 )
-install(TARGETS custom_with_malloc DESTINATION bin/examples/libcudf)
+install(TARGETS custom_with_malloc DESTINATION bin/examples/libcudf/strings)
 
 add_executable(custom_prealloc custom_prealloc.cu)
 target_compile_features(custom_prealloc PRIVATE cxx_std_20)
 target_compile_options(custom_prealloc PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(custom_prealloc PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>)
-install(TARGETS custom_prealloc DESTINATION bin/examples/libcudf)
+install(TARGETS custom_prealloc DESTINATION bin/examples/libcudf/strings)
 
 add_executable(custom_optimized custom_optimized.cu)
 target_compile_features(custom_optimized PRIVATE cxx_std_20)
 target_compile_options(custom_optimized PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(custom_optimized PRIVATE cudf::cudf $<BUILD_LOCAL_INTERFACE:nvtx3-cpp>)
-install(TARGETS custom_optimized DESTINATION bin/examples/libcudf)
+install(TARGETS custom_optimized DESTINATION bin/examples/libcudf/strings)
 
-install(FILES ${CMAKE_CURRENT_LIST_DIR}/names.csv DESTINATION bin/examples/libcudf)
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/names.csv DESTINATION bin/examples/libcudf/strings)

From 0deee7b8a2d40a3b3f40dad218de27fb1726006f Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Tue, 19 Aug 2025 11:16:06 -0700
Subject: [PATCH 160/366] Use build cluster in devcontainers (#19652)

RAPIDS has deployed an autoscaling cloud build cluster that can be used to
accelerate building large RAPIDS projects. This contributes to
https://github.com/rapidsai/build-planning/issues/209.
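As a minimal sketch of the resulting developer workflow (the commands are the
same ones the updated CI job below runs; `build-all` is assumed to come from
the rapids-build-utils devcontainer feature):

    # reset counters so the report reflects only this build
    sccache --zero-stats
    # compile; eligible compilations are shipped to the distributed build cluster
    build-all -DBUILD_BENCHMARKS=ON --verbose
    # inspect cache hits and distributed-compilation counts
    sccache --show-adv-stats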
Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19652 --- .devcontainer/Dockerfile | 36 +++++++++++++++++-- .../cuda12.9-conda/devcontainer.json | 4 ++- .devcontainer/cuda12.9-pip/devcontainer.json | 4 ++- .github/workflows/pr.yaml | 16 ++++++--- 4 files changed, 50 insertions(+), 10 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 315a389339a..b0f367e1f87 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -18,6 +18,8 @@ ENV DEFAULT_CONDA_ENV=rapids FROM ${PYTHON_PACKAGE_MANAGER}-base +ARG TARGETARCH + ARG CUDA ENV CUDAARCHS="RAPIDS" ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" @@ -29,8 +31,36 @@ ENV PYTHONSAFEPATH="1" ENV PYTHONUNBUFFERED="1" ENV PYTHONDONTWRITEBYTECODE="1" -ENV SCCACHE_REGION="us-east-2" -ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache" + +### +# sccache configuration +### +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" +ENV SCCACHE_REGION="us-east-2" +ENV SCCACHE_BUCKET="rapids-sccache-devs" +# 2hr (1 minute longer than sccache-dist request timeout) +ENV SCCACHE_IDLE_TIMEOUT=7200 + +### +# sccache-dist configuration +### +# Enable sccache-dist by default +ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1 +# Compile locally if max retries exceeded +ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true +# Retry transient errors 4 times (for a total of 5 attempts) +ENV SCCACHE_DIST_MAX_RETRIES=4 +ENV SCCACHE_DIST_CONNECT_TIMEOUT=30 +ENV SCCACHE_DIST_CONNECTION_POOL=false +# 1hr 59min (to accommodate debug builds) +ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140 +ENV SCCACHE_DIST_KEEPALIVE_ENABLED=true +ENV SCCACHE_DIST_KEEPALIVE_INTERVAL=20 +ENV SCCACHE_DIST_KEEPALIVE_TIMEOUT=600 +ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com" + +# Build as much in parallel as possible +ENV INFER_NUM_DEVICE_ARCHITECTURES=1 +ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20 diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 1ed542f11f3..9e5bc0306a3 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": { "gpu": "optional" diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 3b35d4398c5..ea7d5a19515 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": { "gpu": "optional" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 2592a50a05e..061a24e226b 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -316,13 +316,19 @@ jobs: needs: telemetry-setup uses: 
rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: - node_type: "cpu32" - arch: '["amd64"]' + arch: '["amd64", "arm64"]' cuda: '["12.9"]' + node_type: "cpu8" + rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN + env: | + SCCACHE_DIST_MAX_RETRIES=inf + SCCACHE_SERVER_LOG=sccache=debug + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false + SCCACHE_DIST_AUTH_TOKEN_VAR=RAPIDS_AUX_SECRET_1 build_command: | - sccache -z; - build-all -DBUILD_BENCHMARKS=ON --verbose; - sccache -s; + sccache --zero-stats; + build-all -j0 -DBUILD_BENCHMARKS=ON --verbose 2>&1 | tee telemetry-artifacts/build.log; + sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit From 0223c7a39b3decf5e21a7fd81f50ef7e31bd7842 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 19 Aug 2025 15:09:21 -0400 Subject: [PATCH 161/366] Fix `group_by().agg()` on non-aggregatable dtypes (#19669) Closes #19664. The bug was caused by not checking if the dtype was actually summable. Now we do. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19669 --- .../cudf_polars/dsl/utils/aggregations.py | 15 +++++++++++++++ python/cudf_polars/tests/test_groupby.py | 16 ++++++++++++++++ python/pylibcudf/pylibcudf/aggregation.pyx | 1 + 3 files changed, 32 insertions(+) diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py index 29fd7155799..ebebc9bc361 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py @@ -136,6 +136,21 @@ def decompose_single_agg( ) if any(has_agg for _, has_agg in aggs): raise NotImplementedError("Nested aggs in groupby not supported") + + child_dtype = child.dtype.plc + req = agg.agg_request + is_median = agg.name == "median" + is_quantile = agg.name == "quantile" + + is_group_quantile_supported = plc.traits.is_integral( + child_dtype + ) or plc.traits.is_floating_point(child_dtype) + + unsupported = ( + (is_median or is_quantile) and not is_group_quantile_supported + ) or (not plc.aggregation.is_valid_aggregation(child_dtype, req)) + if unsupported: + return [], named_expr.reconstruct(expr.Literal(child.dtype, None)) if needs_masking: child = expr.UnaryFunction(child.dtype, "mask_nans", (), child) # The aggregation is just reconstructed with the new diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 38a1cee6ab7..36196522f34 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -330,3 +330,19 @@ def test_groupby_sum_all_null_group_returns_null(): q = df.group_by("key").agg(out=pl.col("null_groups").sum()) assert_gpu_result_equal(q, check_row_order=False) + + +@pytest.mark.parametrize( + "agg_expr", + [ + pl.all().sum(), + pl.all().mean(), + pl.all().median(), + pl.all().quantile(0.5), + ], + ids=["sum", "mean", "median", "quantile-0.5"], +) +def test_groupby_aggs_keep_unsupported_as_null(df: pl.LazyFrame, agg_expr) -> None: + lf = df.filter(pl.col("datetime") == date(2004, 12, 1)) + q = lf.group_by("datetime").agg(agg_expr) + assert_gpu_result_equal(q) diff --git a/python/pylibcudf/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx index 9bef36e5c06..87a39193d38 100644 --- 
a/python/pylibcudf/pylibcudf/aggregation.pyx +++ b/python/pylibcudf/pylibcudf/aggregation.pyx @@ -102,6 +102,7 @@ __all__ = [ "covariance", "ewma", "histogram", + "is_valid_aggregation", "lag", "lead", "m2", From 38223a57e4cf0b49c36e25f75d811b9a5f6c7cbd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:05:54 -0700 Subject: [PATCH 162/366] Move test_{io}.py files to new cudf classic test directory (#19709) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19709 --- python/cudf/cudf/io/orc.py | 7 + .../cudf/cudf/tests/input_output/test_csv.py | 38 + .../cudf/cudf/tests/input_output/test_hdf5.py | 141 +- .../tests/{ => input_output}/test_hdfs.py | 9 +- .../cudf/cudf/tests/input_output/test_orc.py | 2023 +++++++- .../cudf/tests/input_output/test_parquet.py | 4564 +++++++++++++++- .../cudf/tests/{ => input_output}/test_s3.py | 0 python/cudf/cudf/tests/test_gcs.py | 69 - python/cudf/cudf/tests/test_hdf.py | 142 - python/cudf/cudf/tests/test_orc.py | 2054 -------- python/cudf/cudf/tests/test_parquet.py | 4601 ----------------- 11 files changed, 6773 insertions(+), 6875 deletions(-) rename python/cudf/cudf/tests/{ => input_output}/test_hdfs.py (98%) rename python/cudf/cudf/tests/{ => input_output}/test_s3.py (100%) delete mode 100644 python/cudf/cudf/tests/test_gcs.py delete mode 100644 python/cudf/cudf/tests/test_hdf.py delete mode 100644 python/cudf/cudf/tests/test_orc.py delete mode 100644 python/cudf/cudf/tests/test_parquet.py diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index cbab80a9ee7..4b334587815 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -544,6 +544,13 @@ def __init__( self.stripe_size_rows = stripe_size_rows self.row_index_stride = row_index_stride self.initialized = False + self.writer = None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() def write_table(self, table): """Writes a single table to the file""" diff --git a/python/cudf/cudf/tests/input_output/test_csv.py b/python/cudf/cudf/tests/input_output/test_csv.py index 9ab62b11c7b..24ea277ccbb 100644 --- a/python/cudf/cudf/tests/input_output/test_csv.py +++ b/python/cudf/cudf/tests/input_output/test_csv.py @@ -2251,3 +2251,41 @@ def test_empty_file_pandas_compat_raises(tmp_path): cudf.read_csv(empty_file) with pytest.raises(pd.errors.EmptyDataError): cudf.read_csv(str(empty_file)) + + +def test_read_csv_gcs(monkeypatch): + gcsfs = pytest.importorskip("gcsfs") + pdf = pd.DataFrame( + { + "Integer": np.array([2345, 11987, 9027, 9027]), + "Float": np.array([9.001, 8.343, 6, 2.781]), + "Integer2": np.array([2345, 106, 2088, 789277]), + "String": np.array(["Alpha", "Beta", "Gamma", "Delta"]), + "Boolean": np.array([True, False, True, False]), + } + ) + + # Write to buffer + fpath = "cudf-gcs-test-bucket/test_csv_reader.csv" + buffer = pdf.to_csv(index=False) + + def mock_open(*args, **kwargs): + return BytesIO(buffer.encode()) + + def mock_size(*args): + return len(buffer.encode()) + + monkeypatch.setattr(gcsfs.GCSFileSystem, "open", mock_open) + monkeypatch.setattr(gcsfs.GCSFileSystem, "size", mock_size) + + # Test read from explicit path. 
+ got = cudf.read_csv(f"gcs://{fpath}") + assert_eq(pdf, got) + + # AbstractBufferedFile -> PythonFile conversion + # will work fine with the monkey-patched FS if we + # pass in an fsspec file object + fs = gcsfs.GCSFileSystem() + with fs.open(f"gcs://{fpath}") as f: + got = cudf.read_csv(f) + assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/input_output/test_hdf5.py b/python/cudf/cudf/tests/input_output/test_hdf5.py index 06777c8e6af..dc48a9bdc4f 100644 --- a/python/cudf/cudf/tests/input_output/test_hdf5.py +++ b/python/cudf/cudf/tests/input_output/test_hdf5.py @@ -1 +1,140 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. + +import os +from string import ascii_letters + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import NUMERIC_TYPES, UNSIGNED_TYPES + +pytest.importorskip("tables") + + +@pytest.fixture(params=[0, 10]) +def pdf(request): + types = set([*NUMERIC_TYPES, "datetime64[ns]", "bool"]) - set( + UNSIGNED_TYPES + ) + typer = {"col_" + val: val for val in types} + ncols = len(types) + nrows = request.param + + rng = np.random.default_rng(1) + # Create a pandas dataframe with random data of mixed types + test_pdf = pd.DataFrame( + rng.integers(0, 50, size=(nrows, ncols)), + columns=pd.Index([f"col_{typ}" for typ in types]), + index=pd.RangeIndex(nrows, name="test_index"), + ) + # Cast all the column dtypes to objects, rename them, and then cast to + # appropriate types + test_pdf = test_pdf.astype(typer).rename( + {"col_datetime64[ns]": "col_datetime64"}, axis=1 + ) + + # Create non-numeric categorical data otherwise may be typecasted + data = rng.choice(list(ascii_letters), size=nrows) + test_pdf["col_category"] = pd.Series(data, dtype="category") + + return (test_pdf, nrows) + + +@pytest.fixture +def gdf(pdf): + pdf, nrows = pdf + return (cudf.DataFrame.from_pandas(pdf), nrows) + + +@pytest.fixture(params=["fixed", "table"]) +def hdf_files(request, tmp_path, pdf): + pdf, nrows = pdf + if request.param == "fixed": + pdf = pdf.drop("col_category", axis=1) + + fname_df = tmp_path / "test_df.hdf" + pdf.to_hdf(fname_df, key="hdf_df_tests", format=request.param) + + fname_series = {} + for column in pdf.columns: + fname_series[column] = tmp_path / "test_series.hdf" + pdf[column].to_hdf( + fname_series[column], key="hdf_series_tests", format=request.param + ) + return (fname_df, fname_series, request.param, nrows) + + +@pytest.mark.filterwarnings("ignore:Using CPU") +@pytest.mark.filterwarnings("ignore:Strings are not yet supported") +@pytest.mark.parametrize( + "columns", + [["col_int8"], ["col_category"], ["col_int32", "col_float32"], None], +) +def test_hdf_reader(hdf_files, columns): + hdf_df_file, hdf_series, format, nrows = hdf_files + if format == "fixed" and columns is not None: + pytest.skip("Can't use columns with format 'fixed'") + if format == "table" and nrows == 0: + pytest.skip("Can't read 0 row table with format 'table'") + expect_df = pd.read_hdf(hdf_df_file, columns=columns) + got_df = cudf.read_hdf(hdf_df_file, columns=columns) + + assert_eq( + expect_df, got_df, check_categorical=False, check_index_type=False + ) + + for column in hdf_series.keys(): + expect_series = pd.read_hdf(hdf_series[column]) + got_series = cudf.read_hdf(hdf_series[column]) + + assert_eq(expect_series, got_series, check_index_type=False) + + +@pytest.mark.parametrize("format", ["fixed", "table"]) +@pytest.mark.parametrize("complib", ["zlib", "bzip2", "lzo", 
"blosc"]) +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_hdf_writer(tmp_path, pdf, gdf, complib, format): + pdf, nrows = pdf + if format == "table" and nrows == 0: + pytest.skip("Can't read 0 row table with format 'table'") + gdf, _ = gdf + + if format == "fixed": + pdf = pdf.drop("col_category", axis=1) + gdf = gdf.drop("col_category", axis=1) + + pdf_df_fname = tmp_path / "pdf_df.hdf" + gdf_df_fname = tmp_path / "gdf_df.hdf" + + pdf.to_hdf(pdf_df_fname, key="hdf_tests", format=format, complib=complib) + gdf.to_hdf(gdf_df_fname, key="hdf_tests", format=format, complib=complib) + + assert os.path.exists(pdf_df_fname) + assert os.path.exists(gdf_df_fname) + + expect = pd.read_hdf(pdf_df_fname) + got = pd.read_hdf(gdf_df_fname) + + assert_eq(expect, got, check_index_type=False) + + for column in pdf.columns: + pdf_series_fname = tmp_path / (column + "_" + "pdf_series.hdf") + gdf_series_fname = tmp_path / (column + "_" + "gdf_series.hdf") + + pdf[column].to_hdf( + pdf_series_fname, key="hdf_tests", format=format, complib=complib + ) + gdf[column].to_hdf( + gdf_series_fname, key="hdf_tests", format=format, complib=complib + ) + + assert os.path.exists(pdf_series_fname) + assert os.path.exists(gdf_series_fname) + + expect_series = pd.read_hdf(pdf_series_fname) + got_series = pd.read_hdf(gdf_series_fname) + + assert_eq(expect_series, got_series, check_index_type=False) diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/input_output/test_hdfs.py similarity index 98% rename from python/cudf/cudf/tests/test_hdfs.py rename to python/cudf/cudf/tests/input_output/test_hdfs.py index 098b5192d4a..f7845e7cef0 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/input_output/test_hdfs.py @@ -1,6 +1,5 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. -import os from io import BytesIO import fastavro @@ -12,8 +11,10 @@ import cudf from cudf.testing import assert_eq -if not os.environ.get("RUN_HDFS_TESTS"): - pytestmark = pytest.mark.skip("Env not configured to run HDFS tests") +pytest.skip( + reason="https://github.com/rapidsai/cudf/issues/19633", + allow_module_level=True, +) basedir = "/tmp/test-hdfs" diff --git a/python/cudf/cudf/tests/input_output/test_orc.py b/python/cudf/cudf/tests/input_output/test_orc.py index 06777c8e6af..2590c41a315 100644 --- a/python/cudf/cudf/tests/input_output/test_orc.py +++ b/python/cudf/cudf/tests/input_output/test_orc.py @@ -1 +1,2022 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. + +import datetime +import decimal +import os +import random +from io import BytesIO +from string import ascii_letters, ascii_lowercase + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest +from pyarrow import orc + +import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.io.orc import ORCWriter +from cudf.testing import assert_eq, assert_frame_equal +from cudf.testing._utils import ( + expect_warning_if, + gen_rand_series, + supported_numpy_dtypes, +) + +# Removal of these deprecated features is no longer imminent. They will not be +# removed until a suitable alternative has been implemented. As a result, we +# also do not want to stop testing them yet. +# https://github.com/rapidsai/cudf/issues/11519 +pytestmark = pytest.mark.filterwarnings( + "ignore:(num_rows|skiprows) is deprecated and will be removed." 
+)
+
+
+@pytest.fixture(scope="module")
+def datadir(datadir):
+    return datadir / "orc"
+
+
+@pytest.fixture
+def path_or_buf(datadir):
+    fname = datadir / "TestOrcFile.test1.orc"
+    try:
+        with open(fname, "rb") as f:
+            buffer = BytesIO(f.read())
+    except Exception as excpr:
+        if type(excpr).__name__ == "FileNotFoundError":
+            pytest.skip(".orc file is not found")
+        raise excpr
+
+    def _make_path_or_buf(src):
+        if src == "filepath":
+            return str(fname)
+        if src == "pathobj":
+            return fname
+        if src == "bytes_io":
+            return buffer
+        if src == "bytes":
+            return buffer.getvalue()
+        if src == "url":
+            return fname.as_uri()
+
+        raise ValueError("Invalid source type")
+
+    yield _make_path_or_buf
+
+
+@pytest.fixture(params=[True, False])
+def use_index(request):
+    return request.param
+
+
+@pytest.fixture(params=["pyarrow", "cudf"])
+def engine(request):
+    return request.param
+
+
+@pytest.mark.filterwarnings("ignore:Using CPU")
+@pytest.mark.parametrize(
+    "inputfile, columns",
+    [
+        ("TestOrcFile.emptyFile.orc", ["boolean1"]),
+        (
+            "TestOrcFile.test1.orc",
+            [
+                "boolean1",
+                "byte1",
+                "short1",
+                "int1",
+                "long1",
+                "float1",
+                "double1",
+            ],
+        ),
+        ("TestOrcFile.RLEv2.orc", ["x", "y"]),
+        ("TestOrcFile.testSnappy.orc", None),
+        ("TestOrcFile.demo-12-zlib.orc", ["_col2", "_col3", "_col4", "_col5"]),
+    ],
+)
+def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
+    path = datadir / inputfile
+
+    expect = pd.read_orc(path, columns=columns)
+    got = cudf.read_orc(
+        path, engine=engine, columns=columns, use_index=use_index
+    )
+
+    assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False)
+
+
+def test_orc_reader_filenotfound(tmpdir):
+    with pytest.raises(FileNotFoundError):
+        cudf.read_orc("TestMissingFile.orc")
+
+    with pytest.raises(FileNotFoundError):
+        cudf.read_orc(tmpdir.mkdir("cudf_orc"))
+
+
+def test_orc_reader_local_filepath():
+    path = "~/TestLocalFile.orc"
+    if not os.path.isfile(path):
+        pytest.skip("Local .orc file is not found")
+
+    cudf.read_orc(path)
+
+
+@pytest.mark.parametrize(
+    "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]
+)
+def test_orc_reader_filepath_or_buffer(path_or_buf, src):
+    cols = ["int1", "long1", "float1", "double1"]
+
+    expect = pd.read_orc(path_or_buf("filepath"), columns=cols)
+    got = cudf.read_orc(path_or_buf(src), columns=cols)
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Bug in older version of pandas",
+)
+def test_orc_reader_trailing_nulls(datadir):
+    path = datadir / "TestOrcFile.nulls-at-end-snappy.orc"
+    expect = pd.read_orc(path)
+    got = cudf.read_orc(path)
+
+    assert_eq(expect, got, check_categorical=True)
+
+
+@pytest.mark.parametrize(
+    "inputfile",
+    ["TestOrcFile.testDate1900.orc", "TestOrcFile.testDate2038.orc"],
+)
+def test_orc_reader_datetimestamp(datadir, inputfile, use_index):
+    path = datadir / inputfile
+    try:
+        orcfile = orc.ORCFile(path)
+    except pa.ArrowIOError as e:
+        pytest.skip(".orc file is not found: %s" % e)
+
+    pdf = orcfile.read().to_pandas(date_as_object=False)
+    gdf = cudf.read_orc(path, use_index=use_index)
+
+    assert_eq(pdf, gdf, check_categorical=False, check_exact=False)
+
+
+def test_orc_reader_strings(datadir):
+    path = datadir / "TestOrcFile.testStringAndBinaryStatistics.orc"
+
+    expect = pd.read_orc(path, columns=["string1"])
+    got = cudf.read_orc(path, columns=["string1"])
+
+    assert_eq(expect, got, check_categorical=False)
+
+
+def test_orc_read_statistics(datadir):
+    # Read in file
containing 2 columns ("int1" and "string1") and 3 stripes + # (sizes 5000, 5000 and 1000 respectively). Each stripe has the same value + # in every one of its rows. The values the stripes have are 1, 2, and 3 in + # "int1" and "one", "two", and "three" in "string1". + path = datadir / "TestOrcFile.testStripeLevelStats.orc" + try: + ( + file_statistics, + stripes_statistics, + ) = cudf.io.orc.read_orc_statistics([path, path]) + except pa.ArrowIOError as e: + pytest.skip(".orc file is not found: %s" % e) + + # Check numberOfValues + assert_eq(file_statistics[0]["int1"].number_of_values, 11_000) + assert_eq( + file_statistics[0]["int1"].number_of_values, + sum( + [ + stripes_statistics[0]["int1"].number_of_values, + stripes_statistics[1]["int1"].number_of_values, + stripes_statistics[2]["int1"].number_of_values, + ] + ), + ) + assert_eq( + stripes_statistics[1]["int1"].number_of_values, + stripes_statistics[1]["string1"].number_of_values, + ) + assert_eq(stripes_statistics[2]["string1"].number_of_values, 1_000) + + # Check other statistics + assert_eq(stripes_statistics[2]["string1"].has_null, False) + assert_eq( + file_statistics[0]["int1"]["minimum"], + min( + stripes_statistics[0]["int1"]["minimum"], + stripes_statistics[1]["int1"]["minimum"], + stripes_statistics[2]["int1"]["minimum"], + ), + ) + assert_eq(file_statistics[0]["int1"]["minimum"], 1) + assert_eq(file_statistics[0]["string1"]["minimum"], "one") + + +@pytest.mark.filterwarnings("ignore:Using CPU") +@pytest.mark.parametrize( + "predicate,expected_len", + [ + (("int1", "==", 1), 5000), + (("int1", "<=", 2), 10000), + (("int1", "==", -1), 0), + (("int1", "in", range(3)), 10000), + (("int1", "in", {1, 3}), 6000), + (("int1", "not in", {1, 3}), 5000), + ], +) +def test_orc_read_filtered(datadir, engine, predicate, expected_len): + predicate = [[predicate]] + path = datadir / "TestOrcFile.testStripeLevelStats.orc" + try: + df_filtered = cudf.read_orc(path, engine=engine, filters=predicate) + except pa.ArrowIOError as e: + pytest.skip(".orc file is not found: %s" % e) + + # Assert # of rows after filtering + assert len(df_filtered) == expected_len + + +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_orc_read_stripes(datadir, engine): + path = datadir / "TestOrcFile.testDate1900.orc" + try: + pdf = cudf.read_orc(path, engine=engine) + except pa.ArrowIOError as e: + pytest.skip(".orc file is not found: %s" % e) + + num_rows, stripes, col_names = cudf.io.read_orc_metadata(path) + + # Read stripes one at a time + gdf = [ + cudf.read_orc(path, engine=engine, stripes=[[i]]) + for i in range(stripes) + ] + gdf = cudf.concat(gdf).reset_index(drop=True) + assert_eq(pdf, gdf, check_categorical=False, check_index_type=True) + + # Read stripes all at once + gdf = cudf.read_orc( + path, engine=engine, stripes=[[int(x) for x in range(stripes)]] + ) + assert_eq(pdf, gdf, check_categorical=False) + + # Read only some stripes + gdf = cudf.read_orc(path, engine=engine, stripes=[[0, 1]]) + assert_eq(gdf, pdf.head(25000)) + gdf = cudf.read_orc(path, engine=engine, stripes=[[0, stripes - 1]]) + assert_eq( + gdf, + cudf.concat([pdf.head(15000), pdf.tail(10000)], ignore_index=True), + check_index_type=True, + ) + + +@pytest.mark.parametrize("num_rows", [1, 100, 3000]) +@pytest.mark.parametrize("skiprows", [0, 1, 3000]) +def test_orc_read_rows(datadir, skiprows, num_rows): + path = datadir / "TestOrcFile.decimal.orc" + + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path, skiprows=skiprows, num_rows=num_rows) + + # Slice rows out of the whole 
dataframe for comparison as PyArrow doesn't + # have an API to read a subsection of rows from the file + pdf = pdf[skiprows : skiprows + num_rows] + pdf = pdf.reset_index(drop=True) + + assert_eq(pdf, gdf) + + +def test_orc_read_skiprows(): + buff = BytesIO() + df = pd.DataFrame( + { + "a": [ + True, + False, + True, + False, + None, + True, + True, + True, + False, + None, + False, + False, + True, + True, + True, + True, + ] + } + ) + df.to_orc(buff) + # testing 10 skiprows due to a boolean specific bug fix that didn't + # repro for other sizes of data + skiprows = 10 + + expected = ( + pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") + ) + got = cudf.read_orc(buff, skiprows=skiprows) + assert_eq(expected, got) + + +def test_orc_reader_uncompressed_block(datadir): + path = datadir / "uncompressed_snappy.orc" + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_eq(expect, got, check_categorical=False) + + +def test_orc_reader_nodata_block(datadir): + path = datadir / "nodata.orc" + + expect = pd.read_orc(path) + got = cudf.read_orc(path, num_rows=1) + + assert_eq(expect, got, check_categorical=False) + + +@pytest.mark.parametrize("compression", [None, "snappy"]) +@pytest.mark.parametrize( + "reference_file, columns", + [ + ( + "TestOrcFile.test1.orc", + [ + "boolean1", + "byte1", + "short1", + "int1", + "long1", + "float1", + "double1", + ], + ), + ("TestOrcFile.demo-12-zlib.orc", ["_col1", "_col3", "_col5"]), + ], +) +def test_orc_writer(datadir, tmp_path, reference_file, columns, compression): + pdf_fname = datadir / reference_file + gdf_fname = tmp_path / "gdf.orc" + + expect = cudf.from_pandas(pd.read_orc(pdf_fname, columns=columns)) + expect.to_orc(gdf_fname, compression=compression) + got = cudf.from_pandas(pd.read_orc(gdf_fname, columns=columns)) + + assert_frame_equal(expect, got) + + +@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) +def test_orc_writer_statistics_frequency(datadir, tmp_path, stats_freq): + reference_file = "TestOrcFile.demo-12-zlib.orc" + pdf_fname = datadir / reference_file + gdf_fname = tmp_path / "gdf.orc" + + expect = cudf.from_pandas(pd.read_orc(pdf_fname)) + expect.to_orc(gdf_fname, statistics=stats_freq) + got = cudf.from_pandas(pd.read_orc(gdf_fname)) + + assert_frame_equal(expect, got) + + +@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) +def test_chunked_orc_writer_statistics_frequency( + datadir, tmp_path, stats_freq +): + reference_file = "TestOrcFile.test1.orc" + pdf_fname = datadir / reference_file + gdf_fname = tmp_path / "chunked_gdf.orc" + + columns = [ + "boolean1", + "byte1", + "short1", + "int1", + "long1", + "float1", + "double1", + ] + pdf = pd.read_orc(pdf_fname, columns=columns) + gdf = cudf.from_pandas(pdf) + expect = pd.concat([pdf, pdf]).reset_index(drop=True) + + with ORCWriter(gdf_fname, statistics=stats_freq) as writer: + writer.write_table(gdf) + writer.write_table(gdf) + + got = pd.read_orc(gdf_fname) + + assert_eq(expect, got) + + +@pytest.mark.parametrize("compression", [None, "snappy"]) +@pytest.mark.parametrize( + "reference_file, columns", + [ + ( + "TestOrcFile.test1.orc", + [ + "boolean1", + "byte1", + "short1", + "int1", + "long1", + "float1", + "double1", + ], + ), + ("TestOrcFile.demo-12-zlib.orc", ["_col1", "_col3", "_col5"]), + ], +) +def test_chunked_orc_writer( + datadir, tmp_path, reference_file, columns, compression +): + pdf_fname = datadir / reference_file + gdf_fname = tmp_path / "chunked_gdf.orc" + + pdf = pd.read_orc(pdf_fname, 
columns=columns) + gdf = cudf.from_pandas(pdf) + expect = pd.concat([pdf, pdf]).reset_index(drop=True) + + with ORCWriter(gdf_fname, compression=compression) as writer: + writer.write_table(gdf) + writer.write_table(gdf) + + got = pd.read_orc(gdf_fname, columns=columns) + assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) + + +@pytest.mark.parametrize( + "dtypes", + [ + {"c": str, "a": int}, + {"c": int, "a": str}, + {"c": int, "a": str, "b": float}, + {"c": str, "a": object}, + ], +) +def test_orc_writer_strings(tmp_path, dtypes): + gdf_fname = tmp_path / "gdf_strings.orc" + + expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) + expect.to_orc(gdf_fname) + got = pd.read_orc(gdf_fname) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "dtypes", + [ + {"c": str, "a": int}, + {"c": int, "a": str}, + {"c": int, "a": str, "b": float}, + {"c": str, "a": object}, + ], +) +def test_chunked_orc_writer_strings(tmp_path, dtypes): + gdf_fname = tmp_path / "chunked_gdf_strings.orc" + + gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) + pdf = gdf.to_pandas() + expect = pd.concat([pdf, pdf]).reset_index(drop=True) + with ORCWriter(gdf_fname) as writer: + writer.write_table(gdf) + writer.write_table(gdf) + + got = pd.read_orc(gdf_fname) + + assert_eq(expect, got) + + +def test_orc_writer_sliced(tmp_path): + cudf_path = tmp_path / "cudf.orc" + + df = pd.DataFrame() + df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) + df = cudf.from_pandas(df) + + df_select = df.iloc[1:3] + + df_select.to_orc(cudf_path) + assert_eq(cudf.read_orc(cudf_path), df_select) + + +@pytest.mark.parametrize( + "orc_file", + [ + "TestOrcFile.decimal.orc", + "TestOrcFile.decimal.same.values.orc", + "TestOrcFile.decimal.multiple.values.orc", + # For additional information take look at PR 7034 + "TestOrcFile.decimal.runpos.issue.orc", + ], +) +def test_orc_reader_decimal_type(datadir, orc_file): + file_path = datadir / orc_file + + pdf = pd.read_orc(file_path) + df = cudf.read_orc(file_path) + + assert_eq(pdf, df) + + +def test_orc_decimal_precision_fail(datadir): + file_path = datadir / "TestOrcFile.int_decimal.precision_19.orc" + + # Shouldn't cause failure if decimal column is not chosen to be read. 
+    pdf = pd.read_orc(file_path, columns=["int"])
+    gdf = cudf.read_orc(file_path, columns=["int"])
+
+    assert_eq(pdf, gdf)
+
+
+# For additional information take a look at PR 6636 and 6702
+@pytest.mark.parametrize(
+    "orc_file",
+    [
+        "TestOrcFile.boolean_corruption_PR_6636.orc",
+        "TestOrcFile.boolean_corruption_PR_6702.orc",
+    ],
+)
+def test_orc_reader_boolean_type(datadir, orc_file):
+    file_path = datadir / orc_file
+
+    pdf = pd.read_orc(file_path)
+    df = cudf.read_orc(file_path).to_pandas()
+
+    assert_eq(pdf, df)
+
+
+def test_orc_reader_tzif_timestamps(datadir):
+    # Contains timestamps in the range covered by the TZif file
+    # Other datetime tests only cover "future" times
+    path = datadir / "TestOrcFile.lima_timezone.orc"
+
+    pdf = pd.read_orc(path)
+    gdf = cudf.read_orc(path)
+
+    assert_eq(pdf, gdf)
+
+
+def test_int_overflow(tmp_path):
+    file_path = tmp_path / "gdf_overflow.orc"
+
+    # The number of rows and the large element trigger delta encoding
+    num_rows = 513
+    df = cudf.DataFrame({"a": [None] * num_rows}, dtype="int64")
+    df["a"][0] = 1024 * 1024 * 1024
+    df["a"][num_rows - 1] = 1
+    df.to_orc(file_path)
+
+    assert_eq(cudf.read_orc(file_path), df)
+
+
+def normalized_equals(value1, value2):
+    # need naive time object for numpy to convert to datetime64
+    if isinstance(value1, datetime.datetime):
+        value1 = value1.replace(tzinfo=None)
+    if isinstance(value2, datetime.datetime):
+        value2 = value2.replace(tzinfo=None)
+
+    if isinstance(value1, (datetime.datetime, np.datetime64)):
+        value1 = np.datetime64(value1, "ms")
+    if isinstance(value2, (datetime.datetime, np.datetime64)):
+        value2 = np.datetime64(value2, "ms")
+
+    # Compare integers with floats now
+    if isinstance(value1, float) or isinstance(value2, float):
+        return np.isclose(value1, value2)
+
+    return value1 == value2
+
+
+@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"])
+@pytest.mark.parametrize("nrows", [1, 100, 100000])
+def test_orc_write_statistics(tmp_path, datadir, nrows, stats_freq):
+    supported_stat_types = [*supported_numpy_dtypes, "str"]
+    # Writing bool columns to multiple row groups is disabled
+    # until #6763 is fixed
+    if nrows == 100000:
+        supported_stat_types.remove("bool")
+
+    # Make a dataframe
+    gdf = cudf.DataFrame(
+        {
+            "col_" + str(dtype): gen_rand_series(dtype, nrows, has_nulls=True)
+            for dtype in supported_stat_types
+        }
+    )
+    fname = tmp_path / "gdf.orc"
+
+    # Write said dataframe to ORC with cuDF
+    gdf.to_orc(fname, statistics=stats_freq, stripe_size_rows=30000)
+
+    # Read back written ORC's statistics
+    orc_file = orc.ORCFile(fname)
+    (
+        file_stats,
+        stripes_stats,
+    ) = cudf.io.orc.read_orc_statistics([fname])
+
+    # check file stats
+    for col in gdf:
+        if "minimum" in file_stats[0][col]:
+            stats_min = file_stats[0][col]["minimum"]
+            if stats_min is not None:
+                actual_min = gdf[col].min()
+                assert normalized_equals(actual_min, stats_min)
+        if "maximum" in file_stats[0][col]:
+            stats_max = file_stats[0][col]["maximum"]
+            if stats_max is not None:
+                actual_max = gdf[col].max()
+                assert normalized_equals(actual_max, stats_max)
+        if "number_of_values" in file_stats[0][col]:
+            stats_num_vals = file_stats[0][col]["number_of_values"]
+            if stats_num_vals is not None:
+                actual_num_vals = gdf[col].count()
+                assert stats_num_vals == actual_num_vals
+
+    # compare stripe statistics with actual min/max
+    for stripe_idx in range(0, orc_file.nstripes):
+        stripe = orc_file.read_stripe(stripe_idx)
+        # pandas is unable to handle min/max of string col with nulls
+        stripe_df = cudf.DataFrame(stripe.to_pandas())
cudf.DataFrame(stripe.to_pandas()) + for col in stripe_df: + if "minimum" in stripes_stats[stripe_idx][col]: + stats_min = stripes_stats[stripe_idx][col]["minimum"] + if stats_min is not None: + actual_min = stripe_df[col].min() + assert normalized_equals(actual_min, stats_min) + + if "maximum" in stripes_stats[stripe_idx][col]: + stats_max = stripes_stats[stripe_idx][col]["maximum"] + if stats_max is not None: + actual_max = stripe_df[col].max() + assert normalized_equals(actual_max, stats_max) + + if "number_of_values" in stripes_stats[stripe_idx][col]: + stats_num_vals = stripes_stats[stripe_idx][col][ + "number_of_values" + ] + if stats_num_vals is not None: + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals + + +@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) +@pytest.mark.parametrize("nrows", [2, 100, 1024]) +def test_orc_chunked_write_statistics(tmp_path, datadir, nrows, stats_freq): + supported_stat_types = [*supported_numpy_dtypes, "str"] + # Writing bool columns to multiple row groups is disabled + # until #6763 is fixed + if nrows == 1024: + supported_stat_types.remove("bool") + + gdf_fname = tmp_path / "chunked_stats.orc" + with ORCWriter( + gdf_fname, statistics=stats_freq, stripe_size_rows=512 + ) as writer: + max_char_length = 100 if nrows < 1000 else 10 + + # Make a dataframe + gdf = cudf.DataFrame( + { + "col_" + str(dtype): gen_rand_series( + dtype, + nrows // 2, + has_nulls=True, + low=0, + high=max_char_length, + seed=0, + ) + for dtype in supported_stat_types + } + ) + + pdf1 = gdf.to_pandas() + writer.write_table(gdf) + # gdf is deliberately rebound here so that the original table is + # destroyed before the next write_table call; this verifies that the + # data is persisted inside write and that no pointers into the + # original table are kept + gdf = cudf.DataFrame( + { + "col_" + str(dtype): gen_rand_series( + dtype, + nrows // 2, + has_nulls=True, + low=0, + high=max_char_length, + ) + for dtype in supported_stat_types + } + ) + pdf2 = gdf.to_pandas() + writer.write_table(gdf) + + # pandas is unable to handle min/max of string col with nulls + expect = cudf.DataFrame(pd.concat([pdf1, pdf2]).reset_index(drop=True)) + + # Read back written ORC's statistics + orc_file = orc.ORCFile(gdf_fname) + ( + file_stats, + stripes_stats, + ) = cudf.io.orc.read_orc_statistics([gdf_fname]) + + # check file stats + for col in expect: + if "minimum" in file_stats[0][col]: + stats_min = file_stats[0][col]["minimum"] + if stats_min is not None: + actual_min = expect[col].min() + assert normalized_equals(actual_min, stats_min) + if "maximum" in file_stats[0][col]: + stats_max = file_stats[0][col]["maximum"] + if stats_max is not None: + actual_max = expect[col].max() + assert normalized_equals(actual_max, stats_max) + if "number_of_values" in file_stats[0][col]: + stats_num_vals = file_stats[0][col]["number_of_values"] + if stats_num_vals is not None: + actual_num_vals = expect[col].count() + assert stats_num_vals == actual_num_vals + + # compare stripe statistics with actual min/max + for stripe_idx in range(0, orc_file.nstripes): + stripe = orc_file.read_stripe(stripe_idx) + # pandas is unable to handle min/max of string col with nulls + stripe_df = cudf.DataFrame(stripe.to_pandas()) + for col in stripe_df: + if "minimum" in stripes_stats[stripe_idx][col]: + stats_min = stripes_stats[stripe_idx][col]["minimum"] + if stats_min is not None: + actual_min = stripe_df[col].min() + assert normalized_equals(actual_min, stats_min) + + if "maximum" in 
stripes_stats[stripe_idx][col]: + stats_max = stripes_stats[stripe_idx][col]["maximum"] + if stats_max is not None: + actual_max = stripe_df[col].max() + assert normalized_equals(actual_max, stats_max) + + if "number_of_values" in stripes_stats[stripe_idx][col]: + stats_num_vals = stripes_stats[stripe_idx][col][ + "number_of_values" + ] + if stats_num_vals is not None: + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals + + +@pytest.mark.parametrize("nrows", [1, 100, 100000]) +def test_orc_write_bool_statistics(tmp_path, datadir, nrows): + # Make a dataframe + gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)}) + fname = tmp_path / "gdf.orc" + + # Write said dataframe to ORC with cuDF + gdf.to_orc(fname, stripe_size_rows=30000) + + # Read back written ORC's statistics + orc_file = orc.ORCFile(fname) + ( + file_stats, + stripes_stats, + ) = cudf.io.orc.read_orc_statistics([fname]) + + # check file stats + col = "col_bool" + if "true_count" in file_stats[0][col]: + stats_true_count = file_stats[0][col]["true_count"] + actual_true_count = gdf[col].sum() + assert normalized_equals(actual_true_count, stats_true_count) + + if "number_of_values" in file_stats[0][col]: + stats_valid_count = file_stats[0][col]["number_of_values"] + actual_valid_count = len(gdf[col]) - gdf[col].null_count + assert normalized_equals(actual_valid_count, stats_valid_count) + + # compare stripe statistics with the actual true/valid counts + for stripe_idx in range(0, orc_file.nstripes): + stripe = orc_file.read_stripe(stripe_idx) + # read the stripe back as a cudf DataFrame for comparison + stripe_df = cudf.DataFrame(stripe.to_pandas()) + + if "true_count" in stripes_stats[stripe_idx][col]: + actual_true_count = stripe_df[col].sum() + stats_true_count = stripes_stats[stripe_idx][col]["true_count"] + assert normalized_equals(actual_true_count, stats_true_count) + + if "number_of_values" in stripes_stats[stripe_idx][col]: + actual_valid_count = ( + len(stripe_df[col]) - stripe_df[col].null_count + ) + stats_valid_count = stripes_stats[stripe_idx][col][ + "number_of_values" + ] + assert normalized_equals(actual_valid_count, stats_valid_count) + + +def test_orc_reader_gmt_timestamps(datadir): + path = datadir / "TestOrcFile.gmt.orc" + + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) + assert_eq(pdf, gdf) + + +def test_orc_bool_encode_fail(): + buffer = BytesIO() + + # Generate a boolean column longer than a single row group + fail_df = cudf.DataFrame({"col": gen_rand_series("bool", 20000)}) + # Invalidate a row in the first row group + fail_df["col"][5000] = None + + # Should throw instead of generating a file that is incompatible + # with other readers (see issue #6763) + with pytest.raises(RuntimeError): + fail_df.to_orc(buffer) + + # Generate a boolean column longer than a single row group + okay_df = cudf.DataFrame({"col": gen_rand_series("bool", 20000)}) + okay_df["col"][15000] = None + # Invalid row is in the last row group; encoding is assumed to be correct + okay_df.to_orc(buffer) + + # Also validate data + pdf = pd.read_orc(buffer) + + assert_eq(okay_df.to_pandas(nullable=True), pdf) + + +def test_nanoseconds_overflow(): + buffer = BytesIO() + # Use nanosecond values that take more than 32 bits to encode + s = cudf.Series([710424008, -1338482640], dtype="datetime64[ns]") + expected = cudf.DataFrame({"s": s}) + expected.to_orc(buffer) + + cudf_got = cudf.read_orc(buffer) + assert_eq(expected, cudf_got) + + pandas_got = pd.read_orc(buffer) + assert_eq(expected, pandas_got)
+ + +def test_empty_dataframe(): + buffer = BytesIO() + expected = cudf.DataFrame() + expected.to_orc(buffer) + + # Should raise an error if a requested column does not exist. + with pytest.raises(RuntimeError): + cudf.read_orc(buffer, columns=["a"]) + + got_df = cudf.read_orc(buffer) + expected_pdf = pd.read_orc(buffer) + + assert_eq(expected, got_df) + assert_eq(expected_pdf, got_df) + + +@pytest.mark.parametrize( + "data", [[None, ""], ["", None], [None, None], ["", ""]] +) +def test_empty_string_columns(data): + buffer = BytesIO() + + expected = cudf.DataFrame({"string": data}, dtype="str") + expected.to_orc(buffer) + + expected_pdf = pd.read_orc(buffer) + got_df = cudf.read_orc(buffer) + + assert_eq(expected, got_df) + assert_eq( + expected_pdf, + got_df.to_pandas(nullable=True) + if expected_pdf["string"].dtype == pd.StringDtype() + else got_df, + ) + + +@pytest.mark.parametrize("scale", [-3, 0, 3]) +@pytest.mark.parametrize( + "decimal_type", + [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], +) +def test_orc_writer_decimal(tmp_path, scale, decimal_type): + fname = tmp_path / "decimal.orc" + + expected = cudf.DataFrame({"dec_val": gen_rand_series("i", 100)}) + expected["dec_val"] = expected["dec_val"].astype(decimal_type(7, scale)) + + expected.to_orc(fname) + + got = pd.read_orc(fname) + assert_eq(expected.to_pandas()["dec_val"], got["dec_val"]) + + +@pytest.mark.parametrize("num_rows", [1, 100, 3000]) +def test_orc_reader_multiple_files(datadir, num_rows): + path = datadir / "TestOrcFile.testSnappy.orc" + + df_1 = pd.read_orc(path) + df_2 = pd.read_orc(path) + df = pd.concat([df_1, df_2], ignore_index=True) + + gdf = cudf.read_orc([path, path], num_rows=num_rows).to_pandas() + + # Slice rows out of the whole dataframe for comparison as PyArrow doesn't + # have an API to read a subsection of rows from the file + df = df[:num_rows] + df = df.reset_index(drop=True) + + assert_eq(df, gdf) + + +def test_orc_reader_multi_file_single_stripe(datadir): + path = datadir / "TestOrcFile.testSnappy.orc" + + # should raise an exception + with pytest.raises(ValueError): + cudf.read_orc([path, path], stripes=[0]) + + +def test_orc_reader_multi_file_multi_stripe(datadir): + path = datadir / "TestOrcFile.testStripeLevelStats.orc" + gdf = cudf.read_orc([path, path], stripes=[[0, 1], [2]]) + pdf = pd.read_orc(path) + assert_eq(pdf, gdf) + + +def test_orc_string_stream_offset_issue(): + size = 30000 + vals = { + str(x): [decimal.Decimal(1)] * size if x != 0 else ["XYZ"] * size + for x in range(0, 5) + } + df = cudf.DataFrame(vals) + + buffer = BytesIO() + df.to_orc(buffer) + + assert_eq(df, cudf.read_orc(buffer)) + + +@pytest.fixture(scope="module") +def list_struct_buff(): + size = 100 + rd = random.Random(1) + rng = np.random.default_rng(seed=1) + + buff = BytesIO() + + lvl3_list = [ + rd.choice( + [ + None, + [ + [ + [ + rd.choice([None, rng.integers(1, 3)]) + for _ in range(rng.integers(1, 3)) + ] + for _ in range(rng.integers(0, 3)) + ] + for _ in range(rng.integers(0, 3)) + ], + ] + ) + for _ in range(size) + ] + lvl1_list = [ + [ + rd.choice([None, rng.integers(0, 3)]) + for _ in range(rng.integers(1, 4)) + ] + for _ in range(size) + ] + lvl1_struct = [ + rd.choice( + [ + None, + {"a": rng.integers(0, 3), "b": rng.integers(0, 3)}, + ] + ) + for _ in range(size) + ] + lvl2_struct = [ + rd.choice( + [ + None, + {"a": rd.choice([None, rng.integers(0, 3)])}, + { + "lvl1_struct": { + "c": rd.choice([None, rng.integers(0, 3)]), + "d": rng.integers(0, 3), + }, + }, + ] + ) + for _ 
in range(size) + ] + list_nests_struct = [ + [ + {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} + for _ in range(rng.integers(1, 4)) + ] + for _ in range(size) + ] + struct_nests_list = [ + {"struct": lvl1_struct[x], "list": lvl1_list[x]} for x in range(size) + ] + + pa_table = pa.table( + { + "lvl3_list": lvl3_list, + "lvl1_list": lvl1_list, + "lvl1_struct": lvl1_struct, + "lvl2_struct": lvl2_struct, + "list_nests_struct": list_nests_struct, + "struct_nests_list": struct_nests_list, + } + ) + with orc.ORCWriter(buff, stripe_size=16) as writer: + writer.write(pa_table) + return buff + + +@pytest.mark.parametrize( + "columns", + [ + None, + ["lvl3_list", "list_nests_struct", "lvl2_struct", "struct_nests_list"], + ["lvl2_struct", "lvl1_struct"], + ], +) +@pytest.mark.parametrize("num_rows", [0, 15, 100]) +def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): + gdf = cudf.read_orc( + list_struct_buff, + columns=columns, + num_rows=num_rows, + use_index=use_index, + ) + + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() + + pyarrow_tbl = ( + pyarrow_tbl[:num_rows] + if columns is None + else pyarrow_tbl.select(columns)[:num_rows] + ) + + if num_rows > 0: + assert pyarrow_tbl.equals(gdf.to_arrow()) + else: + assert_eq(pyarrow_tbl.to_pandas(), gdf) + + +@pytest.mark.parametrize("columns", [None, ["lvl1_struct"], ["lvl1_list"]]) +def test_skip_rows_for_nested_types(columns, list_struct_buff): + with pytest.raises( + RuntimeError, match="skip_rows is not supported by nested column" + ): + cudf.read_orc( + list_struct_buff, + columns=columns, + use_index=True, + skiprows=5, + ) + + +def test_pyspark_struct(datadir): + path = datadir / "TestOrcFile.testPySparkStruct.orc" + + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) + + assert_eq(pdf, gdf) + + +@pytest.fixture +def map_buff(): + size = 100 + rd = random.Random(1) + rng = np.random.default_rng(seed=1) + + buff = BytesIO() + + lvl1_map = pa.array( + [ + rd.choice( + [ + None, + { + rd.choice(ascii_letters): rd.choice( + [None, rng.integers(1, 1500)] + ), + }, + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.int64()), + ) + lvl2_map = pa.array( + [ + rd.choice( + [ + None, + *( + { + rd.choice(ascii_letters): rd.choice( + [ + None, + [ + rd.choice( + [None, rng.integers(1, 1500)] + ) + for _ in range(5) + ], + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.list_(pa.int64())), + ) + lvl2_struct_map = pa.array( + [ + rd.choice( + [ + None, + *( + { + rd.choice(ascii_letters): rd.choice( + [ + None, + { + "a": rd.choice( + [None, rng.integers(1, 1500)] + ), + "b": rd.choice( + [None, rng.integers(1, 1500)] + ), + }, + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_( + pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) + ), + ) + + pa_table = pa.Table.from_arrays( + [lvl1_map, lvl2_map, lvl2_struct_map], + ["lvl1_map", "lvl2_map", "lvl2_struct_map"], + ) + + orc.write_table(pa_table, buff, stripe_size=16, compression="UNCOMPRESSED") + return buff + + +@pytest.mark.parametrize( + "columns", + [None, ["lvl1_map", "lvl2_struct_map"], ["lvl2_struct_map", "lvl2_map"]], +) +@pytest.mark.parametrize("num_rows", [0, 15, 100]) +def test_map_type_read(columns, num_rows, use_index, map_buff): + tbl = orc.read_table(map_buff) + + lvl1_map = ( + tbl["lvl1_map"] + .combine_chunks() + .view(pa.list_(pa.struct({"key": pa.string(), "value": pa.int64()}))) + ) + lvl2_map = ( + tbl["lvl2_map"] + 
.combine_chunks() + .view( + pa.list_( + pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())}) + ) + ) + ) + lvl2_struct_map = ( + tbl["lvl2_struct_map"] + .combine_chunks() + .view( + pa.list_( + pa.struct( + { + "key": pa.string(), + "value": pa.struct({"a": pa.int64(), "b": pa.int64()}), + } + ) + ) + ) + ) + + expected_tbl = pa.table( + { + "lvl1_map": lvl1_map, + "lvl2_map": lvl2_map, + "lvl2_struct_map": lvl2_struct_map, + } + ) + gdf = cudf.read_orc( + map_buff, columns=columns, num_rows=num_rows, use_index=use_index + ) + + expected_tbl = ( + expected_tbl[:num_rows] + if columns is None + else expected_tbl.select(columns)[:num_rows] + ) + + if num_rows > 0: + assert expected_tbl.equals(gdf.to_arrow()) + else: + assert_eq(expected_tbl.to_pandas(), gdf) + + +def test_orc_reader_decimal(datadir): + path = datadir / "TestOrcFile.decimal.orc" + + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) + + assert_eq(pdf, gdf) + + +# This test case validates the issue raised in #8665; +# please check the issue for more details. +def test_orc_timestamp_read(datadir): + path = datadir / "TestOrcFile.timestamp.issue.orc" + + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) + + assert_eq(pdf, gdf) + + +def dec(num): + return decimal.Decimal(str(num)) + + +@pytest.mark.parametrize( + "data", + [ + # basic + nested strings + { + "lls": [[["a"], ["bb"]] * 5 for i in range(12345)], + "lls2": [[["ccc", "dddd"]] * 6 for i in range(12345)], + "ls_dict": [["X"] * 7 for i in range(12345)], + "ls_direct": [[str(i)] * 9 for i in range(12345)], + "li": [[i] * 11 for i in range(12345)], + "lf": [[i * 0.5] * 13 for i in range(12345)], + "ld": [[dec(i / 2)] * 15 for i in range(12345)], + }, + # with nulls + { + "ls": [ + [str(i) if i % 5 else None, str(2 * i)] if i % 2 else None + for i in range(12345) + ], + "li": [[i, i * i, i % 2] if i % 3 else None for i in range(12345)], + "ld": [ + [dec(i), dec(i / 2) if i % 7 else None] if i % 5 else None + for i in range(12345) + ], + }, + # with empty elements + { + "ls": [ + [str(i), str(2 * i)] if i % 2 else [] for i in range(12345) + ], + "lls": [ + [[str(i), str(2 * i)]] if i % 2 else [[], []] + for i in range(12345) + ], + "li": [[i, i * i, i % 2] if i % 3 else [] for i in range(12345)], + "lli": [ + [[i], [i * i], [i % 2]] if i % 3 else [[]] + for i in range(12345) + ], + "ld": [ + [dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345) + ], + }, + # variable list lengths + { + "ls": [[str(i)] * i for i in range(123)], + "li": [[i, i * i] * i for i in range(123)], + "ld": [[dec(i), dec(i / 2)] * i for i in range(123)], + }, + # many child elements (more than max_stripe_rows) + {"li": [[i] * 1100 for i in range(11000)]}, + ], +) +def test_orc_writer_lists(data): + buffer = BytesIO() + cudf.DataFrame(data).to_orc( + buffer, stripe_size_rows=2048, row_index_stride=512 + ) + # Read in as pandas but compare with pyarrow + # since pandas doesn't have a list type + pa_out = pa.Table.from_pandas(pd.read_orc(buffer)) + pa_in = pa.table(data) + assert pa_out.equals(pa_in) + + +def test_chunked_orc_writer_lists(): + num_rows = 3000 + pdf_in = pd.DataFrame( + { + "ls": [[str(i), str(2 * i)] for i in range(num_rows)], + "ld": [[dec(i / 2)] * 5 for i in range(num_rows)], + } + ) + + gdf = cudf.from_pandas(pdf_in) + expect = pd.concat([pdf_in, pdf_in]).reset_index(drop=True) + + buffer = BytesIO() + with ORCWriter(buffer) as writer: + writer.write_table(gdf) + writer.write_table(gdf) + + got = pd.read_orc(buffer) + assert_eq(expect, got) + + +def 
test_writer_timestamp_stream_size(datadir, tmp_path): + pdf_fname = datadir / "TestOrcFile.largeTimestamps.orc" + gdf_fname = tmp_path / "gdf.orc" + + expect = pd.read_orc(pdf_fname) + cudf.from_pandas(expect).to_orc(gdf_fname) + got = pd.read_orc(gdf_fname) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "fname", + [ + "TestOrcFile.NoIndStrm.StructWithNoNulls.orc", + "TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc", + "TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc", + "TestOrcFile.NoIndStrm.IntWithNulls.orc", + ], +) +def test_no_row_group_index_orc_read(datadir, fname): + fpath = datadir / fname + + expect = orc.ORCFile(fpath).read() + got = cudf.read_orc(fpath) + + assert expect.equals(got.to_arrow()) + + +def test_names_in_struct_dtype_nesting(datadir): + fname = datadir / "TestOrcFile.NestedStructDataFrame.orc" + + expect = orc.ORCFile(fname).read() + got = cudf.read_orc(fname) + + # test dataframes + assert expect.equals(got.to_arrow()) + + edf = cudf.DataFrame(expect.to_pandas()) + # test schema + assert edf.dtypes.equals(got.dtypes) + + +def test_writer_lists_structs(list_struct_buff): + df_in = cudf.read_orc(list_struct_buff) + + buff = BytesIO() + df_in.to_orc(buff) + + pyarrow_tbl = orc.ORCFile(buff).read() + + assert pyarrow_tbl.equals(df_in.to_arrow()) + + +def test_orc_writer_lists_empty_rg(): + data = { + "with_pd": [ + [i if i % 3 else None] if i < 25 or i > 30 else None + for i in range(50) + ], + "no_pd": [ + [i if i % 3 else None] if i < 25 or i > 30 else [] + for i in range(50) + ], + } + pdf_in = pd.DataFrame(data) + buffer = BytesIO() + cudf_in = cudf.from_pandas(pdf_in) + + cudf_in.to_orc(buffer) + + df = cudf.read_orc(buffer) + assert_eq(df, cudf_in) + + pdf_out = pd.read_orc(buffer) + assert_eq(pdf_in, pdf_out) + + +def test_statistics_sum_overflow(): + maxint64 = np.iinfo(np.int64).max + minint64 = np.iinfo(np.int64).min + + buff = BytesIO() + df = pd.DataFrame( + {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} + ) + df.to_orc(buff) + + file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) + assert file_stats[0]["a"].get("sum") is None + assert file_stats[0]["b"].get("sum") is None + assert file_stats[0]["c"].get("sum") == minint64 + 1 + + assert stripe_stats[0]["a"].get("sum") is None + assert stripe_stats[0]["b"].get("sum") is None + assert stripe_stats[0]["c"].get("sum") == minint64 + 1 + + +def test_empty_statistics(): + buff = BytesIO() + pa_table = pa.Table.from_arrays( + [ + pa.array([None], type=pa.int64()), + pa.array([None], type=pa.float64()), + pa.array([None], type=pa.string()), + pa.array([None], type=pa.decimal128(11, 2)), + pa.array([None], type=pa.timestamp("ns")), + pa.array([None], type=pa.date64()), + pa.array([None], type=pa.bool_()), + pa.array([None], type=pa.binary()), + pa.array([1], type=pa.int64()), + ], + ["a", "b", "c", "d", "e", "f", "g", "h", "i"], + ) + orc.write_table(pa_table, buff) + + got = cudf.io.orc.read_orc_statistics([buff]) + + # Check for both file and stripe stats + for stats in got: + # Similar expected stats for the first 6 columns in this case + for col_name in ascii_lowercase[:6]: + assert stats[0][col_name].number_of_values == 0 + assert stats[0][col_name].has_null is True + assert stats[0][col_name].get("minimum") is None + assert stats[0][col_name].get("maximum") is None + for col_name in ascii_lowercase[:3]: + assert stats[0][col_name].get("sum") == 0 + # Sum for decimal column is a string + assert stats[0]["d"].get("sum") == "0" + + assert 
stats[0]["g"].number_of_values == 0 + assert stats[0]["g"].has_null is True + assert stats[0]["g"].get("true_count") == 0 + assert stats[0]["g"].get("false_count") == 0 + + assert stats[0]["h"].number_of_values == 0 + assert stats[0]["h"].has_null is True + assert stats[0]["h"].get("sum") == 0 + + assert stats[0]["i"].number_of_values == 1 + assert stats[0]["i"].has_null is False + assert stats[0]["i"].get("minimum") == 1 + assert stats[0]["i"].get("maximum") == 1 + assert stats[0]["i"].get("sum") == 1 + + +@pytest.mark.parametrize( + "equivalent_columns", + [ + (["lvl1_struct.a", "lvl1_struct.b"], ["lvl1_struct"]), + (["lvl1_struct", "lvl1_struct.a"], ["lvl1_struct"]), + (["lvl1_struct.a", "lvl1_struct"], ["lvl1_struct"]), + (["lvl1_struct.b", "lvl1_struct.a"], ["lvl1_struct.b", "lvl1_struct"]), + (["lvl2_struct.lvl1_struct", "lvl2_struct"], ["lvl2_struct"]), + ( + ["lvl2_struct.a", "lvl2_struct.lvl1_struct.c", "lvl2_struct"], + ["lvl2_struct"], + ), + ], +) +def test_select_nested(list_struct_buff, equivalent_columns): + # The two column selections should be equivalent + df_cols1 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[0]) + df_cols2 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[1]) + assert_eq(df_cols1, df_cols2) + + +def test_orc_writer_rle_stream_size(datadir, tmp_path): + original = datadir / "TestOrcFile.int16.rle.size.orc" + reencoded = tmp_path / "int16_map.orc" + + df = cudf.read_orc(original) + df.to_orc(reencoded) + + # Segfaults when RLE stream sizes don't account for varint length + pa_out = orc.ORCFile(reencoded).read() + assert df.to_arrow().equals(pa_out) + + +def test_empty_columns(): + buffer = BytesIO() + # string and decimal columns have additional steps that need to be skipped + expected = cudf.DataFrame( + { + "string": cudf.Series([], dtype="str"), + "decimal": cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1)), + } + ) + expected.to_orc(buffer, compression="snappy") + + got_df = cudf.read_orc(buffer) + assert_eq(expected, got_df) + + +def test_orc_reader_zstd_compression(list_struct_buff): + expected = cudf.read_orc(list_struct_buff) + # save with ZSTD compression + buffer = BytesIO() + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() + with orc.ORCWriter(buffer, compression="zstd") as writer: + writer.write(pyarrow_tbl) + got = cudf.read_orc(buffer) + # compare with pyarrow since pandas doesn't + # have a list or struct + assert expected.to_arrow().equals(got.to_arrow()) + + +def test_writer_protobuf_large_rowindexentry(): + s = [ + "Length of the two strings needs to add up to at least ~120", + "So that the encoded statistics are larger than 128 bytes", + ] * 5001 # generate more than 10K rows to have two row groups + df = cudf.DataFrame({"s1": s}) + + buff = BytesIO() + df.to_orc(buff) + + got = cudf.read_orc(buff) + assert_frame_equal(df, got) + + +@pytest.mark.parametrize("compression", ["ZLIB", "ZSTD"]) +def test_orc_writer_nvcomp(compression): + expected = cudf.datasets.randomdata( + nrows=12345, dtypes={"a": int, "b": str, "c": float}, seed=1 + ) + + buff = BytesIO() + try: + expected.to_orc(buff, compression=compression) + except RuntimeError: + pytest.skip(reason="Newer nvCOMP version is required") + else: + got = pd.read_orc(buff) + assert_eq(expected, got) + + +def run_orc_columns_and_index_param(index_obj, index, columns): + buffer = BytesIO() + df = cudf.DataFrame( + {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj + ) + df.to_orc(buffer, index=index) + + expected = pd.read_orc(buffer, columns=columns) + got 
= cudf.read_orc(buffer, columns=columns) + + assert_eq(expected, got, check_index_type=True) + + +@pytest.mark.parametrize("index_obj", [None, [10, 11, 12], ["x", "y", "z"]]) +@pytest.mark.parametrize("index", [True, False, None]) +@pytest.mark.parametrize( + "columns", + [ + None, + pytest.param( + [], + marks=pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Bug in older version of pandas", + ), + ), + ], +) +def test_orc_columns_and_index_param(index_obj, index, columns): + run_orc_columns_and_index_param(index_obj, index, columns) + + +@pytest.mark.parametrize( + "columns,index,index_obj", + [ + ( + ["a", "b"], + True, + None, + ), + ( + ["a", "b"], + True, + [10, 11, 12], + ), + ( + ["a", "b"], + True, + ["x", "y", "z"], + ), + ( + ["a", "b"], + None, + [10, 11, 12], + ), + ( + ["a", "b"], + None, + ["x", "y", "z"], + ), + ], +) +@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12026") +def test_orc_columns_and_index_param_read_index(index_obj, index, columns): + run_orc_columns_and_index_param(index_obj, index, columns) + + +@pytest.mark.parametrize( + "columns,index,index_obj", + [ + (["a", "b"], False, None), + (["a", "b"], False, [10, 11, 12]), + (["a", "b"], False, ["x", "y", "z"]), + (["a", "b"], None, None), + ], +) +def test_orc_columns_and_index_param_no_read_index(index_obj, index, columns): + run_orc_columns_and_index_param(index_obj, index, columns) + + +@pytest.mark.parametrize( + "df_data,cols_as_map_type,expected_data", + [ + ( + {"a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]]}, + ["a"], + {"a": [[(10, 20)], [(1, 21)]]}, + ), + ( + { + "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], + "b": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], + }, + ["b"], + { + "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], + "b": [[(10, 20)], [(1, 21)]], + }, + ), + ( + { + "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], + "b": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], + "c": [ + [{"a": {"a": 10}, "b": 20}], + [{"a": {"a": 12}, "b": 21}], + ], + }, + ["b", "c"], + { + "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], + "b": [[(10, 20)], [(1, 21)]], + "c": [[({"a": 10}, 20)], [({"a": 12}, 21)]], + }, + ), + ], +) +def test_orc_writer_cols_as_map_type(df_data, cols_as_map_type, expected_data): + df = cudf.DataFrame(df_data) + buffer = BytesIO() + df.to_orc(buffer, cols_as_map_type=cols_as_map_type) + + got = pd.read_orc(buffer) + expected = pd.DataFrame(expected_data) + + assert_eq(got, expected) + + +def test_orc_writer_cols_as_map_type_error(): + df = cudf.DataFrame( + {"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])} + ) + buffer = BytesIO() + with pytest.raises( + TypeError, match="cols_as_map_type must be a list of column names." + ): + df.to_orc(buffer, cols_as_map_type=1) + + +def test_orc_reader_negative_timestamp(engine): + negative_timestamp_df = cudf.DataFrame( + { + "a": [ + pd.Timestamp("1969-12-31 23:59:59.000123"), + pd.Timestamp("1969-12-31 23:59:58.000999"), + pd.Timestamp("1969-12-31 23:59:58.001001"), + pd.Timestamp("1839-12-24 03:58:56.000826"), + ] + } + ) + buffer = BytesIO() + negative_timestamp_df.to_orc(buffer) + + # We warn the user that this function will fall back to the CPU for reading + # when the engine is pyarrow. 
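+ # (expect_warning_if asserts that the UserWarning is raised only when its + # condition is truthy; with the default "cudf" engine no warning is expected)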
+ with expect_warning_if(engine == "pyarrow", UserWarning): + got = cudf.read_orc(buffer, engine=engine) + + assert_eq(negative_timestamp_df, got, check_dtype=False) + + +def test_orc_writer_negative_timestamp(): + negative_timestamp_df = cudf.DataFrame( + { + "a": [ + pd.Timestamp("1969-12-31 23:59:59.000123"), + pd.Timestamp("1969-12-31 23:59:58.000999"), + pd.Timestamp("1969-12-31 23:59:58.001001"), + pd.Timestamp("1839-12-24 03:58:56.000826"), + ] + } + ) + buffer = BytesIO() + negative_timestamp_df.to_orc(buffer) + + assert_eq(negative_timestamp_df, pd.read_orc(buffer), check_dtype=False) + assert_eq( + negative_timestamp_df, orc.ORCFile(buffer).read(), check_dtype=False + ) + + +@pytest.mark.skip( + reason="Bug specific to rockylinux8: https://github.com/rapidsai/cudf/issues/15802", +) +def test_orc_reader_apache_negative_timestamp(datadir): + path = datadir / "TestOrcFile.apache_timestamp.orc" + + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) + + assert_eq(pdf, gdf) + + +def test_statistics_string_sum(): + strings = ["a string", "another string!"] + buff = BytesIO() + df = cudf.DataFrame({"str": strings}) + df.to_orc(buff) + + file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) + assert_eq(file_stats[0]["str"].get("sum"), sum(len(s) for s in strings)) + + +@pytest.mark.parametrize( + "fname", + [ + "TestOrcFile.Hive.OneEmptyMap.orc", + "TestOrcFile.Hive.OneEmptyList.orc", + "TestOrcFile.Hive.OneNullStruct.orc", + "TestOrcFile.Hive.EmptyListStripe.orc", + "TestOrcFile.Hive.NullStructStripe.orc", + "TestOrcFile.Hive.AllNulls.orc", + ], +) +def test_reader_empty_stripe(datadir, fname): + path = datadir / fname + + expected = pd.read_orc(path) + got = cudf.read_orc(path) + assert_eq(expected, got) + + +# needs enough data for multiple row groups +@pytest.mark.parametrize("data", [["*"] * 10001, ["**", None] * 5001]) +def test_reader_row_index_order(data): + expected = cudf.DataFrame({"str": data}, dtype="string") + + buffer = BytesIO() + expected.to_pandas().to_orc(buffer) + got = cudf.read_orc(buffer) + assert_eq(expected, got) + + +# Test the corner case where empty blocks are compressed +# Decompressed data size is zero, even though compressed data size is non-zero +# For more information see https://github.com/rapidsai/cudf/issues/13608 +def test_orc_reader_empty_decomp_data(datadir): + path = datadir / "TestOrcFile.Spark.EmptyDecompData.orc" + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_eq(expect, got) + + +def test_orc_reader_empty_deeply_nested_level(datadir): + # Test the case where top level struct has nulls, but the nested struct is + # not nullable. In this case there is no data in the second level, but we + # still need to pass the parent null mask to the third level. 
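+ # Without propagating that mask, rows that are null at the top level could + # be misread as valid entries at the innermost level.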
+ path = datadir / "TestOrcFile.Spark.NestedNotNullableStruct.orc" + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_eq(expect, got) + + +def test_orc_chunked_writer_stripe_size(datadir): + df = cudf.DataFrame({"col": gen_rand_series("int", 100000)}) + + buffer = BytesIO() + with ORCWriter(buffer, stripe_size_bytes=64 * 1024) as writer: + writer.write_table(df) + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 10) + + buffer = BytesIO() + with ORCWriter(buffer, stripe_size_rows=20000) as writer: + writer.write_table(df) + + orc_file = orc.ORCFile(buffer) + assert_eq(orc_file.nstripes, 5) + + +def test_reader_lz4(): + pdf = pd.DataFrame({"ints": [1, 2] * 5001}) + pa_table = pa.Table.from_pandas(pdf) + + buffer = BytesIO() + with orc.ORCWriter(buffer, compression="LZ4") as writer: + writer.write(pa_table) + + got = cudf.read_orc(buffer) + assert_eq(pdf, got) + + +def test_writer_lz4(): + gdf = cudf.DataFrame({"ints": [1, 2] * 5001}) + + buffer = BytesIO() + gdf.to_orc(buffer, compression="LZ4") + + got = pd.read_orc(buffer) + assert_eq(gdf, got) + + +def test_row_group_alignment(datadir): + path = datadir / "TestOrcFile.MapManyNulls.parquet" + + expected = cudf.read_parquet(path) + + buffer = BytesIO() + expected.to_orc(buffer) + + got = cudf.read_orc(buffer) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "inputfile", + [ + # These sample files have a single column my_timestamp of the TIMESTAMP type, + # 2660 rows, and 1536 rows per row group. + "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc", + "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc", + # These two files are the same as the above, except that every 100 rows start + # with a null value. + "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc", + "TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc", + ], +) +def test_orc_reader_desynced_timestamp(datadir, inputfile): + # Test a special case where the DATA stream (seconds) in a TIMESTAMP column + # progresses faster than the SECONDARY stream (nanoseconds) at the start of a row + # group. In this case, the "run cache manager" in the decoder kernel is used to + # orchestrate the dual-stream processing. + # For more information, see https://github.com/rapidsai/cudf/issues/17155.
+ + path = datadir / inputfile + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_frame_equal(cudf.from_pandas(expect), got) + + +@pytest.mark.parametrize("compression", ["LZ4", "SNAPPY", "ZLIB", "ZSTD"]) +def test_orc_decompression(set_decomp_env_vars, compression): + # Write the DataFrame to an ORC file + buffer = BytesIO() + rng = np.random.default_rng(seed=0) + types = [ + "bool", + "int8", + "int16", + "int32", + "int64", + "float32", + "float64", + "datetime64[ns]", + "str", + ] + nrows = 123 + + # Create a pandas dataframe with random data of mixed types + pd_data = { + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) + for typ in types + } + pd_data["col_datetime64[ns]"] = pd.Series( + np.asarray( + rng.integers(0, (0x7FFFFFFFFFFFFFFF / 1000), size=nrows), + dtype="datetime64[ns]", + ) + ) + + # Create non-numeric str data + pd_data["col_str"] = pd.Series( + rng.choice(list(ascii_letters), size=nrows), dtype="str" + ) + + non_nested_pdf = pd.DataFrame(pd_data) + non_nested_pdf.to_orc(buffer, engine_kwargs={"compression": compression}) + + # Read the ORC file back into a DataFrame + got = cudf.read_orc(buffer) + + assert_eq(non_nested_pdf, got) + + +def test_write_orc_gcs(monkeypatch, tmp_path): + gcsfs = pytest.importorskip("gcsfs") + pdf = pd.DataFrame( + { + "Integer": np.array([2345, 11987, 9027, 9027]), + "Float": np.array([9.001, 8.343, 6, 2.781]), + "Integer2": np.array([2345, 106, 2088, 789277]), + "String": np.array(["Alpha", "Beta", "Gamma", "Delta"]), + "Boolean": np.array([True, False, True, False]), + } + ) + gcs_fname = "cudf-gcs-test-bucket/test_orc_writer.orc" + local_filepath = tmp_path / "test_orc.orc" + gdf = cudf.from_pandas(pdf) + + def mock_open(*args, **kwargs): + return local_filepath.open("wb") + + monkeypatch.setattr(gcsfs.GCSFileSystem, "open", mock_open) + gdf.to_orc(f"gcs://{gcs_fname}") + + got = pd.read_orc(local_filepath) + assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/input_output/test_parquet.py b/python/cudf/cudf/tests/input_output/test_parquet.py index 13acf6825b4..c57f91e6c42 100644 --- a/python/cudf/cudf/tests/input_output/test_parquet.py +++ b/python/cudf/cudf/tests/input_output/test_parquet.py @@ -1,14 +1,4572 @@ # Copyright (c) 2023-2025, NVIDIA CORPORATION. +import datetime +import decimal +import glob +import hashlib +import math +import os +import pathlib +import random +import string +from contextlib import contextmanager from io import BytesIO +from string import ascii_letters +import cupy +import numpy as np import pandas as pd import pyarrow as pa -import pyarrow.parquet as pq import pytest +from fsspec.core import get_fs_token_paths +from packaging import version +from pyarrow import parquet as pq import cudf -from cudf.testing import assert_eq +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.io.parquet import ( + ParquetDatasetWriter, + ParquetWriter, + merge_parquet_filemetadata, +) +from cudf.testing import assert_eq, dataset_generator as dg +from cudf.testing._utils import TIMEDELTA_TYPES, set_random_null_mask_inplace + + +@contextmanager +def _hide_pyarrow_parquet_cpu_warnings(engine): + if engine == "pyarrow": + with pytest.warns( + UserWarning, + match="Using CPU via PyArrow to read Parquet dataset. 
This option " + "is both inefficient and unstable!", + ): + yield + else: + yield + + +@pytest.fixture(scope="module") +def datadir(datadir): + return datadir / "parquet" + + +@pytest.fixture +def simple_pdf(): + nrows = 10 + rng = np.random.default_rng(seed=0) + types = [ + "bool", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + # "uint32", pandas promotes uint32 to int64 + # https://issues.apache.org/jira/browse/ARROW-9215 + "uint64", + "float32", + "float64", + ] + # Create a pandas dataframe with random data of mixed types + test_pdf = pd.DataFrame( + { + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) + for typ in types + }, + # Need to ensure that this index is not a RangeIndex to get the + # expected round-tripping behavior from Parquet reader/writer. + index=pd.Index(list(range(nrows))), + ) + # Delete the name of the column index, and rename the row index + test_pdf.columns.name = None + test_pdf.index.name = "test_index" + + return test_pdf + + +@pytest.fixture +def simple_gdf(simple_pdf): + return cudf.DataFrame.from_pandas(simple_pdf) + + +def build_pdf(num_columns, day_resolution_timestamps): + rng = np.random.default_rng(seed=0) + types = [ + "bool", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + # "uint32", pandas promotes uint32 to int64 + # https://issues.apache.org/jira/browse/ARROW-9215 + "uint64", + "float32", + "float64", + "datetime64[ms]", + "datetime64[us]", + "str", + ] + nrows = num_columns + + # Create a pandas dataframe with random data of mixed types + test_pdf = pd.DataFrame( + { + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) + for typ in types + }, + # Need to ensure that this index is not a RangeIndex to get the + # expected round-tripping behavior from Parquet reader/writer. + index=pd.Index(list(range(nrows))), + ) + # Delete the name of the column index, and rename the row index + test_pdf.columns.name = None + test_pdf.index.name = "test_index" + + # make datetime64's a little more interesting by increasing the range of + # dates note that pandas will convert these to ns timestamps, so care is + # taken to avoid overflowing a ns timestamp. There is also the ability to + # request timestamps be whole days only via `day_resolution_timestamps`. 
+ for t in [ + { + "name": "datetime64[ms]", + "nsDivisor": 1000000, + "dayModulus": 86400000, + }, + { + "name": "datetime64[us]", + "nsDivisor": 1000, + "dayModulus": 86400000000, + }, + ]: + data = [ + rng.integers(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) + for i in range(nrows) + ] + if day_resolution_timestamps: + data = [int(d / t["dayModulus"]) * t["dayModulus"] for d in data] + test_pdf["col_" + t["name"]] = pd.Series( + np.asarray(data, dtype=t["name"]) + ) + + # Create non-numeric categorical data otherwise parquet may typecast it + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] + test_pdf["col_category"] = pd.Series(data, dtype="category") + + # Create non-numeric str data + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] + test_pdf["col_str"] = pd.Series(data, dtype="str") + + return test_pdf + + +@pytest.fixture(params=[0, 10]) +def pdf(request): + return build_pdf(request.param, False) + + +@pytest.fixture(params=[0, 10]) +def pdf_day_timestamps(request): + return build_pdf(request.param, True) + + +@pytest.fixture +def gdf(pdf): + return cudf.DataFrame.from_pandas(pdf) + + +@pytest.fixture +def gdf_day_timestamps(pdf_day_timestamps): + return cudf.DataFrame.from_pandas(pdf_day_timestamps) + + +@pytest.fixture +def parquet_path_or_buf(datadir): + fname = datadir / "spark_timestamp.snappy.parquet" + try: + with open(fname, "rb") as f: + buffer = BytesIO(f.read()) + except Exception as excpr: + if type(excpr).__name__ == "FileNotFoundError": + pytest.skip(".parquet file is not found") + raise excpr + + def _make_parquet_path_or_buf(src): + if src == "filepath": + return str(fname) + if src == "pathobj": + return fname + if src == "bytes_io": + return buffer + if src == "bytes": + return buffer.getvalue() + if src == "url": + return fname.as_uri() + + raise ValueError("Invalid source type") + + yield _make_parquet_path_or_buf + + +@pytest.fixture(scope="module") +def large_int64_gdf(): + return cudf.DataFrame.from_pandas(pd.DataFrame({"col": range(0, 1 << 20)})) + + +@pytest.fixture(params=["pyarrow", "cudf"]) +def engine(request): + return request.param + + +@pytest.mark.filterwarnings("ignore:Using CPU") +@pytest.mark.parametrize( + "compression", ["snappy", "gzip", "brotli", None, np.str_("snappy")] +) +@pytest.mark.parametrize( + "columns", + [ + ["col_int8"], + ["col_category"], + ["col_int32", "col_float32"], + ["col_int16", "col_float64", "col_int8"], + None, + ], +) +def test_parquet_reader_basic(tmp_path, pdf, columns, engine, compression): + parquet_file = tmp_path / f"{compression}_test.parquet" + pdf.to_parquet(parquet_file, engine="pyarrow", compression=compression) + expect = pd.read_parquet(parquet_file, columns=columns) + got = cudf.read_parquet(parquet_file, engine=engine, columns=columns) + + # PANDAS returns category objects whereas cuDF returns hashes + if engine == "cudf": + if "col_category" in expect.columns: + expect = expect.drop(columns=["col_category"]) + if "col_category" in got.columns: + got = got.drop(columns=["col_category"]) + + assert_eq(expect, got) + + +@pytest.mark.filterwarnings("ignore:Using CPU") +def test_parquet_reader_empty_pandas_dataframe(tmp_path): + df = pd.DataFrame() + fname = tmp_path / "test_pq_reader_empty_pandas_dataframe.parquet" + df.to_parquet(fname) + assert os.path.exists(fname) + expect = pd.read_parquet(fname) + got = cudf.read_parquet(fname, engine="cudf") + expect = expect.reset_index(drop=True) + got = got.reset_index(drop=True) + + assert_eq(expect, got) + + 
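+# cuDF reads string columns back as object dtype; the has_null case below +# checks that a null entry does not change that.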
+@pytest.mark.parametrize("has_null", [False, True]) +def test_parquet_reader_strings(tmp_path, has_null): + df = pd.DataFrame( + [(1, "aaa", 9.0), (2, "bbb", 8.0), (3, "ccc", 7.0)], + columns=pd.Index(list("abc")), + ) + if has_null: + df.at[1, "b"] = None + fname = tmp_path / "test_pq_reader_strings.parquet" + df.to_parquet(fname) + assert os.path.exists(fname) + + gdf = cudf.read_parquet(fname, engine="cudf") + + assert gdf["b"].dtype == np.dtype("object") + assert_eq(gdf["b"], df["b"]) + + +@pytest.mark.parametrize("columns", [None, ["b"]]) +@pytest.mark.parametrize("index_col", ["b", "Nameless", None]) +def test_parquet_reader_index_col(tmp_path, index_col, columns): + df = pd.DataFrame({"a": range(3), "b": range(3, 6), "c": range(6, 9)}) + + if index_col is None: + # No index column + df.reset_index(drop=True, inplace=True) + elif index_col == "Nameless": + # Index column but no name + df.set_index("a", inplace=True) + df.index.name = None + else: + # Index column as normal + df.set_index(index_col, inplace=True) + + fname = tmp_path / "test_pq_reader_index_col.parquet" + + # PANDAS' PyArrow backend always writes the index unless disabled + df.to_parquet(fname, index=(index_col is not None)) + assert os.path.exists(fname) + + pdf = pd.read_parquet(fname, columns=columns) + gdf = cudf.read_parquet(fname, engine="cudf", columns=columns) + + assert_eq(pdf, gdf, check_categorical=False) + + +@pytest.mark.parametrize("pandas_compat", [True, False]) +@pytest.mark.parametrize( + "columns", [["a"], ["d"], ["a", "b"], ["a", "d"], None] +) +def test_parquet_reader_pandas_metadata(tmp_path, columns, pandas_compat): + df = pd.DataFrame( + { + "a": range(6, 9), + "b": range(3, 6), + "c": range(6, 9), + "d": ["abc", "def", "xyz"], + } + ) + df.set_index("b", inplace=True) + + fname = tmp_path / "test_pq_reader_pandas_metadata.parquet" + df.to_parquet(fname) + assert os.path.exists(fname) + + # PANDAS `read_parquet()` and PyArrow `read_pandas()` always includes index + # Instead, directly use PyArrow to optionally omit the index + expect = pa.parquet.read_table( + fname, columns=columns, use_pandas_metadata=pandas_compat + ).to_pandas() + got = cudf.read_parquet( + fname, columns=columns, use_pandas_metadata=pandas_compat + ) + + if pandas_compat or columns is None or "b" in columns: + assert got.index.name == "b" + else: + assert got.index.name is None + assert_eq(expect, got, check_categorical=False) + + +@pytest.mark.parametrize("pandas_compat", [True, False]) +@pytest.mark.parametrize("as_bytes", [True, False]) +def test_parquet_range_index_pandas_metadata( + tmp_path, pandas_compat, as_bytes +): + df = pd.DataFrame( + {"a": range(6, 9), "b": ["abc", "def", "xyz"]}, + index=pd.RangeIndex(3, 6, 1, name="c"), + ) + + fname = tmp_path / "test_parquet_range_index_pandas_metadata.parquet" + df.to_parquet(fname) + assert os.path.exists(fname) + + # PANDAS `read_parquet()` and PyArrow `read_pandas()` always includes index + # Instead, directly use PyArrow to optionally omit the index + expect = pa.parquet.read_table( + fname, use_pandas_metadata=pandas_compat + ).to_pandas() + if as_bytes: + # Make sure we can handle RangeIndex parsing + # in pandas when the input is `bytes` + with open(fname, "rb") as f: + got = cudf.read_parquet( + f.read(), use_pandas_metadata=pandas_compat + ) + else: + got = cudf.read_parquet(fname, use_pandas_metadata=pandas_compat) + + assert_eq(expect, got) + + +def test_parquet_read_metadata(tmp_path, pdf): + def num_row_groups(rows, group_size): + return max(1, (rows + 
(group_size - 1)) // group_size) + + fname = tmp_path / "metadata.parquet" + row_group_size = 5 + pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) + + ( + num_rows, + row_groups, + col_names, + num_columns, + _, # rowgroup_metadata + ) = cudf.io.read_parquet_metadata(fname) + + assert num_columns == len(pdf.columns) + assert num_rows == len(pdf.index) + assert row_groups == num_row_groups(num_rows, row_group_size) + for a, b in zip(col_names, pdf.columns, strict=True): + assert a == b + + +def test_parquet_read_filtered(set_decomp_env_vars, tmp_path): + # Generate data + fname = tmp_path / "filtered.parquet" + dg.generate( + fname, + dg.Parameters( + num_rows=100, + column_parameters=[ + dg.ColumnParameters( + cardinality=40, + null_frequency=0.05, + generator=lambda: [ + "".join( + random.sample( + string.ascii_letters, random.randint(4, 8) + ) + ) + for _ in range(10) + ], + is_sorted=False, + ), + dg.ColumnParameters( + 40, + 0.2, + lambda: np.random.default_rng(seed=0).integers( + 0, 100, size=10 + ), + True, + ), + ], + seed=42, + ), + format={"name": "parquet", "row_group_size": 10}, + use_threads=False, + ) + + # Get dataframes to compare + df = cudf.read_parquet(fname) + df_filtered = cudf.read_parquet(fname, filters=[("1", ">", 60)]) + # PyArrow's read_table function does row-group-level filtering in addition + # to applying given filters once the table has been read into memory. + # Because of this, we aren't using PyArrow as a reference for testing our + # row-group selection method since the only way to only select row groups + # with PyArrow is with the method we use and intend to test. + tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)]) + + assert_eq(cudf.io.read_parquet_metadata(fname)[1], 10) + assert len(df_filtered) < len(df) + assert len(tbl_filtered) <= len(df_filtered) + + +def test_parquet_read_filtered_everything(tmp_path): + # Generate data + fname = tmp_path / "filtered_everything.parquet" + df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) + df.to_parquet(fname, row_group_size=2) + + # Check filter + df_filtered = cudf.read_parquet(fname, filters=[("x", "==", 12)]) + assert_eq(len(df_filtered), 0) + assert_eq(df_filtered["x"].dtype, "int64") + assert_eq(df_filtered["y"].dtype, "object") + + +def test_parquet_read_filtered_multiple_files(tmp_path): + # Generate data + fname_0 = tmp_path / "filtered_multiple_files_0.parquet" + df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) + df.to_parquet(fname_0, row_group_size=2) + fname_1 = tmp_path / "filtered_multiple_files_1.parquet" + df = pd.DataFrame({"x": range(10), "y": list("aaccccddee")}) + df.to_parquet(fname_1, row_group_size=2) + fname_2 = tmp_path / "filtered_multiple_files_2.parquet" + df = pd.DataFrame( + {"x": [0, 1, 9, 9, 4, 5, 6, 7, 8, 9], "y": list("aabbzzddee")} + ) + df.to_parquet(fname_2, row_group_size=2) + + # Check filter + filtered_df = cudf.read_parquet( + [fname_0, fname_1, fname_2], filters=[("x", "==", 2)] + ) + assert_eq( + filtered_df, + cudf.DataFrame({"x": [2, 2], "y": list("bc")}, index=[2, 2]), + ) + + +@pytest.mark.parametrize( + "predicate,expected_len", + [ + ([[("x", "==", 0)], [("z", "==", 0)]], 2), + ([("x", "==", 0), ("z", "==", 0)], 0), + ([("x", "==", 0), ("z", "!=", 0)], 1), + ([("y", "==", "c"), ("x", ">", 8)], 0), + ([("y", "==", "c"), ("x", ">=", 5)], 1), + ([[("y", "==", "c")], [("x", "<", 3)]], 5), + ([[("x", "not in", (0, 9)), ("z", "not in", (4, 5))]], 6), + ([[("y", "==", "c")], [("x", "in", (0, 9)), ("z", "in", 
(0, 9))]], 4), + ([[("x", "==", 0)], [("x", "==", 1)], [("x", "==", 2)]], 3), + ([[("x", "==", 0), ("z", "==", 9), ("y", "==", "a")]], 1), + ], +) +def test_parquet_read_filtered_complex_predicate( + tmp_path, predicate, expected_len +): + # Generate data + fname = tmp_path / "filtered_complex_predicate.parquet" + df = pd.DataFrame( + { + "x": range(10), + "y": list("aabbccddee"), + "z": reversed(range(10)), + } + ) + df.to_parquet(fname, row_group_size=2) + + # Check filters + df_filtered = cudf.read_parquet(fname, filters=predicate) + assert_eq(cudf.io.read_parquet_metadata(fname)[1], 10 / 2) + assert_eq(len(df_filtered), expected_len) + + +@pytest.mark.parametrize("row_group_size", [1, 5]) +def test_parquet_read_row_groups(tmp_path, pdf, row_group_size): + if "col_category" in pdf.columns: + pdf = pdf.drop(columns=["col_category"]) + fname = tmp_path / "row_group.parquet" + pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size) + + num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata( + fname + ) + + gdf = [cudf.read_parquet(fname, row_groups=[i]) for i in range(row_groups)] + gdf = cudf.concat(gdf) + assert_eq(pdf.reset_index(drop=True), gdf.reset_index(drop=True)) + + # first half rows come from the first source, rest from the second + gdf = cudf.read_parquet( + [fname, fname], + row_groups=[ + list(range(row_groups // 2)), + list(range(row_groups // 2, row_groups)), + ], + ) + assert_eq(pdf.reset_index(drop=True), gdf.reset_index(drop=True)) + + +@pytest.mark.parametrize("row_group_size", [1, 5]) +def test_parquet_read_row_groups_non_contiguous(tmp_path, pdf, row_group_size): + fname = tmp_path / "row_group.parquet" + pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size) + + num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata( + fname + ) + + # alternate rows between the two sources + gdf = cudf.read_parquet( + [fname, fname], + row_groups=[ + list(range(0, row_groups, 2)), + list(range(1, row_groups, 2)), + ], + ) + + ref_df = [ + cudf.read_parquet(fname, row_groups=i) + for i in list(range(0, row_groups, 2)) + list(range(1, row_groups, 2)) + ] + ref_df = cudf.concat(ref_df) + + assert_eq(ref_df, gdf) + + +def test_parquet_reader_spark_timestamps(datadir): + fname = datadir / "spark_timestamp.snappy.parquet" + + expect = pd.read_parquet(fname) + got = cudf.read_parquet(fname) + + assert_eq(expect, got) + + +def test_parquet_reader_spark_decimals(datadir): + fname = datadir / "spark_decimal.parquet" + + expect = pd.read_parquet(fname) + got = cudf.read_parquet(fname) + + assert_eq(expect, got) + + +@pytest.mark.parametrize("columns", [["a"], ["b", "a"], None]) +def test_parquet_reader_decimal128(datadir, columns): + fname = datadir / "nested_decimal128_file.parquet" + got = cudf.read_parquet(fname, columns=columns) + expect = cudf.read_parquet(fname, columns=columns) + + assert_eq(expect, got) + + +def test_parquet_reader_microsecond_timestamps(datadir): + fname = datadir / "usec_timestamp.parquet" + + expect = pd.read_parquet(fname) + got = cudf.read_parquet(fname) + + assert_eq(expect, got) + + +def test_parquet_reader_mixedcompression(datadir): + fname = datadir / "mixed_compression.parquet" + + expect = pd.read_parquet(fname) + got = cudf.read_parquet(fname) + + assert_eq(expect, got) + + +def test_parquet_reader_select_columns(datadir): + fname = datadir / "nested_column_map.parquet" + + expect = cudf.read_parquet(fname).to_pandas()[["value"]] + got = cudf.read_parquet(fname, columns=["value"]) + + 
assert_eq(expect, got) + + +def test_parquet_reader_invalids(tmp_path): + test_pdf = cudf.DataFrame( + {"a": np.array([1, np.nan, np.nan, 2])}, index=cudf.Index([0, 1, 2, 3]) + ) + + fname = tmp_path / "invalids.parquet" + test_pdf.to_parquet(fname, engine="pyarrow") + + expect = pd.read_parquet(fname) + got = cudf.read_parquet(fname) + + assert_eq(expect, got.to_pandas(nullable=True)) + + +def test_parquet_reader_filenotfound(tmpdir): + with pytest.raises(FileNotFoundError): + cudf.read_parquet("TestMissingFile.parquet") + + with pytest.raises(FileNotFoundError): + cudf.read_parquet(tmpdir.mkdir("cudf_parquet")) + + +def test_parquet_reader_local_filepath(): + fname = "~/TestLocalFile.parquet" + if not os.path.isfile(fname): + pytest.skip("Local .parquet file is not found") + + cudf.read_parquet(fname) + + +@pytest.mark.parametrize( + "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] +) +def test_parquet_reader_filepath_or_buffer(parquet_path_or_buf, src): + expect = pd.read_parquet(parquet_path_or_buf("filepath")) + got = cudf.read_parquet(parquet_path_or_buf(src)) + + assert_eq(expect, got) + + +def test_parquet_reader_file_types(parquet_path_or_buf): + expect = cudf.read_parquet(parquet_path_or_buf("filepath")) + fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath")) + + # Pass open fsspec file + with fs.open(paths[0], mode="rb") as fil: + got1 = cudf.read_parquet(fil) + assert_eq(expect, got1) + + # Pass path only + got2 = cudf.read_parquet(paths[0]) + assert_eq(expect, got2) + + +def create_parquet_source(df, src_type, fname): + if src_type == "filepath": + df.to_parquet(fname, engine="pyarrow") + return str(fname) + if src_type == "pathobj": + df.to_parquet(fname, engine="pyarrow") + return fname + if src_type == "bytes_io": + buffer = BytesIO() + df.to_parquet(buffer, engine="pyarrow") + return buffer + if src_type == "bytes": + buffer = BytesIO() + df.to_parquet(buffer, engine="pyarrow") + return buffer.getvalue() + if src_type == "url": + df.to_parquet(fname, engine="pyarrow") + return pathlib.Path(fname).as_uri() + + +@pytest.mark.parametrize( + "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] +) +def test_parquet_reader_multiple_files(tmp_path, src): + test_pdf1 = pd.DataFrame( + { + "a": np.concatenate( + [np.arange(10, dtype="float64"), np.full(10, np.nan)] + ) + }, + index=pd.Index(np.arange(20)), + ) + test_pdf2 = pd.DataFrame( + {"a": np.arange(10, dtype="float64")}, index=pd.Index(np.arange(10)) + ) + expect = pd.concat([test_pdf1, test_pdf2]) + + src1 = create_parquet_source(test_pdf1, src, tmp_path / "multi1.parquet") + src2 = create_parquet_source(test_pdf2, src, tmp_path / "multi2.parquet") + got = cudf.read_parquet([src1, src2]) + + assert_eq(expect, got) + + +def test_parquet_reader_reordered_columns(tmp_path): + src = pd.DataFrame( + {"name": ["cow", None, "duck", "fish", None], "id": [0, 1, 2, 3, 4]} + ) + fname = tmp_path / "test_parquet_reader_reordered_columns.parquet" + src.to_parquet(fname) + assert os.path.exists(fname) + expect = pd.DataFrame( + {"id": [0, 1, 2, 3, 4], "name": ["cow", None, "duck", "fish", None]} + ) + got = cudf.read_parquet(fname, columns=["id", "name"]) + assert_eq(expect, got, check_dtype=False) + + +def test_parquet_reader_reordered_columns_mixed(tmp_path): + src = pd.DataFrame( + { + "name": ["cow", None, "duck", "fish", None], + "list0": [ + [[1, 2], [3, 4]], + None, + [[5, 6], None], + [[1]], + [[5], [6, None, 8]], + ], + "id": [0, 1, 2, 3, 4], + "list1": [ + [[1, 2], [3, 4]], + [[0, 0]], + [[5, 6], 
[10, 12]], + [[1]], + [[5], [6, 8]], + ], + } + ) + fname = tmp_path / "test_parquet_reader_reordered_columns.parquet" + src.to_parquet(fname) + assert os.path.exists(fname) + expect = pd.DataFrame( + { + "list1": [ + [[1, 2], [3, 4]], + [[0, 0]], + [[5, 6], [10, 12]], + [[1]], + [[5], [6, 8]], + ], + "id": [0, 1, 2, 3, 4], + "list0": [ + [[1, 2], [3, 4]], + None, + [[5, 6], None], + [[1]], + [[5], [6, None, 8]], + ], + "name": ["cow", None, "duck", "fish", None], + } + ) + got = cudf.read_parquet(fname, columns=["list1", "id", "list0", "name"]) + assert_eq(expect, got, check_dtype=False) + + +def test_parquet_reader_list_basic(tmp_path): + expect = pd.DataFrame({"a": [[[1, 2], [3, 4]], None, [[5, 6], None]]}) + fname = tmp_path / "test_parquet_reader_list_basic.parquet" + expect.to_parquet(fname) + assert os.path.exists(fname) + got = cudf.read_parquet(fname) + assert_eq(expect, got) + + +def test_parquet_reader_list_table(tmp_path): + expect = pd.DataFrame( + { + "a": [[[1, 2], [3, 4]], None, [[5, 6], None]], + "b": [[None, None], None, [None, None]], + "c": [[[1, 2, 3]], [[None]], [[], None]], + "d": [[[]], [[None]], [[1, 2, 3], None]], + "e": [[["cows"]], [["dogs"]], [["cats", "birds", "owls"], None]], + } + ) + fname = tmp_path / "test_parquet_reader_list_table.parquet" + expect.to_parquet(fname) + assert os.path.exists(fname) + got = cudf.read_parquet(fname) + assert pa.Table.from_pandas(expect).equals(got.to_arrow()) + + +def int_gen(first_val, i): + """ + Returns an integer based on an absolute index and a starting value. Used + as input to `list_gen`. + """ + return int(i + first_val) + + +strings = [ + "cats", + "dogs", + "cows", + "birds", + "fish", + "sheep", + "owls", + "bears", + "ants", +] + + +def string_gen(first_val, i): + """ + Returns a string based on an absolute index and a starting value. Used as + input to `list_gen`. + """ + return strings[int_gen(first_val, i) % len(strings)] + + +def list_row_gen( + gen, first_val, list_size, lists_per_row, include_validity=False +): + """ + Generate a single row for a List<List<>> column based on input parameters. + + Parameters + ---------- + gen : A callable which generates an individual leaf element based on an + absolute index. + first_val : Generate the column as if it had started at 'first_val' + instead of 0. + list_size : Size of each generated list. + lists_per_row : Number of lists to generate per row. + include_validity : Whether or not to include nulls as part of the + column. If true, it will add a selection of nulls at both the + topmost row level and at the leaf level. + + Returns + ------- + The generated list column. + """ + + def L(list_size, first_val): + return [ + (gen(first_val, i) if i % 2 == 0 else None) + if include_validity + else (gen(first_val, i)) + for i in range(list_size) + ] + + return [ + (L(list_size, first_val + (list_size * i)) if i % 2 == 0 else None) + if include_validity + else L(list_size, first_val + (list_size * i)) + for i in range(lists_per_row) + ] + + +def list_gen(gen, num_rows, lists_per_row, list_size, include_validity=False): + """ + Generate a list column based on input parameters. + + Parameters + ---------- + gen : A callable which generates an individual leaf element based on an + absolute index. + num_rows : Number of rows to generate. + lists_per_row : Number of lists to generate per row. + list_size : Size of each generated list. + include_validity : Whether or not to include nulls as part of the + column. 
+def list_gen(gen, num_rows, lists_per_row, list_size, include_validity=False):
+    """
+    Generate a list column based on input parameters.
+
+    Parameters
+    ----------
+    gen : A callable which generates an individual leaf element based on an
+        absolute index.
+    num_rows : Number of rows to generate.
+    lists_per_row : Number of lists to generate per row.
+    list_size : Size of each generated list.
+    include_validity : Whether or not to include nulls as part of the
+        column. If true, it will add a selection of nulls at both the
+        topmost row level and at the leaf level.
+
+    Returns
+    -------
+    The generated list column.
+    """
+
+    def L(list_size, first_val):
+        return [
+            (gen(first_val, i) if i % 2 == 0 else None)
+            if include_validity
+            else (gen(first_val, i))
+            for i in range(list_size)
+        ]
+
+    def R(first_val, lists_per_row, list_size):
+        return [
+            L(list_size, first_val + (list_size * i))
+            for i in range(lists_per_row)
+        ]
+
+    return [
+        (
+            R(
+                lists_per_row * list_size * i,
+                lists_per_row,
+                list_size,
+            )
+            if i % 2 == 0
+            else None
+        )
+        if include_validity
+        else R(
+            lists_per_row * list_size * i,
+            lists_per_row,
+            list_size,
+        )
+        for i in range(num_rows)
+    ]
+
+
+def test_parquet_reader_list_large(tmp_path):
+    expect = pd.DataFrame({"a": list_gen(int_gen, 64, 40, 25)})
+    fname = tmp_path / "test_parquet_reader_list_large.parquet"
+    expect.to_parquet(fname)
+    assert os.path.exists(fname)
+    got = cudf.read_parquet(fname)
+    assert_eq(expect, got, check_dtype=False)
+
+
+def test_parquet_reader_list_validity(tmp_path):
+    expect = pd.DataFrame(
+        {"a": list_gen(int_gen, 64, 40, 25, include_validity=True)}
+    )
+    fname = tmp_path / "test_parquet_reader_list_validity.parquet"
+    expect.to_parquet(fname)
+    assert os.path.exists(fname)
+    got = cudf.read_parquet(fname)
+    assert_eq(expect, got, check_dtype=False)
+
+
+def test_parquet_reader_list_large_mixed(tmp_path):
+    expect = pd.DataFrame(
+        {
+            "a": list_gen(string_gen, 64, 40, 25),
+            "b": list_gen(int_gen, 64, 40, 25),
+            "c": list_gen(int_gen, 64, 40, 25, include_validity=True),
+            "d": list_gen(string_gen, 64, 40, 25, include_validity=True),
+        }
+    )
+    fname = tmp_path / "test_parquet_reader_list_large_mixed.parquet"
+    expect.to_parquet(fname)
+    assert os.path.exists(fname)
+    got = cudf.read_parquet(fname)
+    assert pa.Table.from_pandas(expect).equals(got.to_arrow())
+
+
+def test_parquet_reader_list_large_multi_rowgroup(tmp_path):
+    # > 40 row groups
+    num_rows = 10000
+    num_docs = num_rows / 2
+    num_categories = 100
+    row_group_size = 100
+
+    cupy.random.seed(0)
+
+    # generate a random pairing of doc: category
+    documents = cudf.DataFrame(
+        {
+            "document_id": cupy.random.randint(num_docs, size=num_rows),
+            "category_id": cupy.random.randint(num_categories, size=num_rows),
+        }
+    )
+
+    # group categories by document_id to create a list column
+    expect = documents.groupby("document_id").agg({"category_id": ["collect"]})
+    expect.columns = expect.columns.get_level_values(0)
+    expect.reset_index(inplace=True)
+
+    # round trip the dataframe to/from parquet
+    fname = tmp_path / "test_parquet_reader_list_large_multi_rowgroup.parquet"
+    expect.to_pandas().to_parquet(fname, row_group_size=row_group_size)
+    got = cudf.read_parquet(fname)
+
+    assert_eq(expect, got)
+
+
+def test_parquet_reader_list_large_multi_rowgroup_nulls(tmp_path):
+    # 25 row groups
+    num_rows = 2500
+    row_group_size = 100
+
+    expect = cudf.DataFrame(
+        {"a": list_gen(int_gen, num_rows, 3, 2, include_validity=True)}
+    )
+
+    # round trip the dataframe to/from parquet
+    fname = (
+        tmp_path
+        / "test_parquet_reader_list_large_multi_rowgroup_nulls.parquet"
+    )
+    expect.to_pandas().to_parquet(fname, row_group_size=row_group_size)
+    assert os.path.exists(fname)
+    got = cudf.read_parquet(fname)
+    assert_eq(expect, got)
+
+
+def struct_gen(gen, skip_rows, num_rows, include_validity=False):
+    """
+    Generate a struct column based on input parameters.
+
+    Parameters
+    ----------
+    gen : An array of callables which generate an individual row based on an
+        absolute index.
+    skip_rows : Generate the column as if it had started at 'skip_rows'
+        instead of 0. The intent here is to emulate the skip_rows
+        parameter of the parquet reader.
+    num_rows : Number of rows to generate. The number of fields in the
+        struct is implied by the length of `gen`.
+    include_validity : Whether or not to include nulls as part of the
+        column. If true, it will add a selection of nulls at both the
+        field level and at the value level.
+
+    Returns
+    -------
+    The generated struct column.
+    """
+
+    def R(first_val, num_fields):
+        return {
+            "col" + str(f): (
+                gen[f](first_val, first_val) if f % 4 != 0 else None
+            )
+            if include_validity
+            else (gen[f](first_val, first_val))
+            for f in range(len(gen))
+        }
+
+    return [
+        (R((i + skip_rows), len(gen)) if (i + skip_rows) % 4 != 0 else None)
+        if include_validity
+        else R((i + skip_rows), len(gen))
+        for i in range(num_rows)
+    ]
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # struct
+        [
+            {"a": 1, "b": 2},
+            {"a": 10, "b": 20},
+            {"a": None, "b": 22},
+            {"a": None, "b": None},
+            {"a": 15, "b": None},
+        ],
+        # struct-of-list
+        [
+            {"a": 1, "b": 2, "c": [1, 2, 3]},
+            {"a": 10, "b": 20, "c": [4, 5]},
+            {"a": None, "b": 22, "c": [6]},
+            {"a": None, "b": None, "c": None},
+            {"a": 15, "b": None, "c": [-1, -2]},
+            None,
+            {"a": 100, "b": 200, "c": [-10, None, -20]},
+        ],
+        # list-of-struct
+        [
+            [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}],
+            None,
+            [{"a": 10, "b": 20}],
+            [{"a": 100, "b": 200}, {"a": None, "b": 300}, None],
+        ],
+        # struct-of-struct
+        [
+            {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2},
+            {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4},
+            {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6},
+            {"a": 7, "b": None, "c": 8},
+            {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None},
+            None,
+            {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10},
+        ],
+    ],
+)
+def test_parquet_reader_struct_basic(tmp_path, data):
+    expect = pa.Table.from_pydict({"struct": data})
+    fname = tmp_path / "test_parquet_reader_struct_basic.parquet"
+    pa.parquet.write_table(expect, fname)
+    assert os.path.exists(fname)
+    got = cudf.read_parquet(fname)
+    assert expect.equals(got.to_arrow())
+
+
+def select_columns_params():
+    dfs = [
+        # struct
+        (
+            [
+                {"a": 1, "b": 2},
+                {"a": 10, "b": 20},
+                {"a": None, "b": 22},
+                {"a": None, "b": None},
+                {"a": 15, "b": None},
+            ],
+            [["struct"], ["struct.a"], ["struct.b"], ["c"]],
+        ),
+        # struct-of-list
+        (
+            [
+                {"a": 1, "b": 2, "c": [1, 2, 3]},
+                {"a": 10, "b": 20, "c": [4, 5]},
+                {"a": None, "b": 22, "c": [6]},
+                {"a": None, "b": None, "c": None},
+                {"a": 15, "b": None, "c": [-1, -2]},
+                None,
+                {"a": 100, "b": 200, "c": [-10, None, -20]},
+            ],
+            [
+                ["struct"],
+                ["struct.c"],
+                ["struct.c.list"],
+                ["struct.c.list.item"],
+                ["struct.b", "struct.c"],
+                ["struct.b", "struct.d", "struct.c"],
+            ],
+        ),
+        # list-of-struct
+        (
+            [
+                [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}],
+                None,
+                [{"a": 10, "b": 20}],
+                [{"a": 100, "b": 200}, {"a": None, "b": 300}, None],
+            ],
+            [
+                ["struct"],
+                ["struct.list"],
+                ["struct.list.item"],
+                ["struct.list.item.a", "struct.list.item.b"],
+                ["struct.list.item.c"],
+            ],
+        ),
+        # struct with "." in field names
+        (
+            [
+                {"a.b": 1, "b.a": 2},
+                {"a.b": 10, "b.a": 20},
+                {"a.b": None, "b.a": 22},
+                {"a.b": None, "b.a": None},
+                {"a.b": 15, "b.a": None},
+            ],
+            [["struct"], ["struct.a"], ["struct.b.a"]],
+        ),
+    ]
+    for df_col_pair in dfs:
+        for cols in df_col_pair[1]:
+            yield df_col_pair[0], cols
+
+
+@pytest.mark.parametrize("data, columns", select_columns_params())
+def test_parquet_reader_struct_select_columns(data, columns):
+    table = pa.Table.from_pydict({"struct": data})
+    buff = BytesIO()
+
+    pa.parquet.write_table(table, buff)
+
+    expect = pq.ParquetFile(buff).read(columns=columns)
+    got = cudf.read_parquet(buff, columns=columns)
+    assert expect.equals(got.to_arrow())
+
+
+def test_parquet_reader_struct_los_large(tmp_path):
+    num_rows = 256
+    list_size = 64
+    data = [
+        struct_gen([string_gen, int_gen, string_gen], 0, list_size, False)
+        if i % 2 == 0
+        else None
+        for i in range(num_rows)
+    ]
+    expect = pa.Table.from_pydict({"los": data})
+    fname = tmp_path / "test_parquet_reader_struct_los_large.parquet"
+    pa.parquet.write_table(expect, fname)
+    assert os.path.exists(fname)
+    got = cudf.read_parquet(fname)
+    assert expect.equals(got.to_arrow())
+
+
+@pytest.mark.parametrize(
+    "params", [[3, 4, 32, False], [3, 4, 32, True], [50, 10, 64, True]]
+)
+def test_parquet_reader_struct_sol_table(tmp_path, params):
+    # Struct<List<List>>
+    lists_per_row = params[0]
+    list_size = params[1]
+    num_rows = params[2]
+    include_validity = params[3]
+
+    def list_gen_wrapped(x, y):
+        return list_row_gen(
+            int_gen, x * list_size * lists_per_row, list_size, lists_per_row
+        )
+
+    def string_list_gen_wrapped(x, y):
+        return list_row_gen(
+            string_gen,
+            x * list_size * lists_per_row,
+            list_size,
+            lists_per_row,
+            include_validity,
+        )
+
+    data = struct_gen(
+        [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped],
+        0,
+        num_rows,
+        include_validity,
+    )
+    expect = pa.Table.from_pydict({"sol": data})
+    fname = tmp_path / "test_parquet_reader_struct_sol_table.parquet"
+    pa.parquet.write_table(expect, fname)
+    assert os.path.exists(fname)
+    got = cudf.read_parquet(fname)
+    assert expect.equals(got.to_arrow())
+
+
+def test_parquet_reader_v2(tmp_path, simple_pdf):
+    pdf_fname = tmp_path / "pdfv2.parquet"
+    simple_pdf.to_parquet(pdf_fname, data_page_version="2.0")
+    assert_eq(cudf.read_parquet(pdf_fname), simple_pdf)
+
+    cudf.from_pandas(simple_pdf).to_parquet(pdf_fname, header_version="2.0")
+    assert_eq(cudf.read_parquet(pdf_fname), simple_pdf)
+
+
+def test_parquet_delta_byte_array(datadir):
+    fname = datadir / "delta_byte_arr.parquet"
+    assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname))
+
+
+# values chosen to exercise:
+# 1 - header only, no bitpacked values
+# 2 - one bitpacked value
+# 23 - one partially filled miniblock
+# 32 - almost full miniblock
+# 33 - one full miniblock
+# 34 - one full miniblock plus one value in new miniblock
+# 128 - almost full block
+# 129 - one full block
+# 130 - one full block plus one value in new block
+# 129 * 3 - multiple blocks
+@pytest.fixture(params=[1, 2, 23, 32, 33, 34, 128, 129, 130, 129 * 3])
+def delta_num_rows(request):
+    return request.param
+
+
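+# Background (illustrative): DELTA_BINARY_PACKED encodes deltas in blocks
+# of 128 values made of four 32-value miniblocks (the common writer
+# defaults), so the row counts above deliberately straddle the 32/33/34
+# miniblock and the 128/129/130 block boundaries.
+
+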
"cardinality": delta_num_rows, + }, + ], + rows=delta_num_rows, + seed=0, + use_threads=False, + ) + # Roundabout conversion to pandas to preserve nulls/data types + cudf_table = cudf.DataFrame.from_arrow(arrow_table) + test_pdf = cudf_table.to_pandas(nullable=True) + pdf_fname = tmp_path / "pdfv2.parquet" + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding="DELTA_BINARY_PACKED", + data_page_version="2.0", + data_page_size=64 * 1024, + engine="pyarrow", + use_dictionary=False, + ) + cdf = cudf.read_parquet(pdf_fname) + pcdf = cudf.from_pandas(test_pdf) + assert_eq(cdf, pcdf) + + # Write back out with cudf and make sure pyarrow can read it + cudf_fname = tmp_path / "cudfv2.parquet" + pcdf.to_parquet( + cudf_fname, + compression=None, + header_version="2.0", + use_dictionary=False, + ) + + cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) + assert_eq(cdf2, cdf) + + +@pytest.mark.parametrize("add_nulls", [True, False]) +@pytest.mark.parametrize("max_string_length", [12, 48, 96, 128]) +@pytest.mark.parametrize( + "str_encoding", ["DELTA_BYTE_ARRAY", "DELTA_LENGTH_BYTE_ARRAY"] +) +def test_delta_byte_array_roundtrip( + delta_num_rows, add_nulls, max_string_length, str_encoding, tmp_path +): + null_frequency = 0.25 if add_nulls else 0 + + # Create a pandas dataframe with random data of mixed lengths + test_pdf = dg.rand_dataframe( + dtypes_meta=[ + { + "dtype": "str", + "null_frequency": null_frequency, + "cardinality": delta_num_rows, + "max_string_length": max_string_length, + }, + ], + rows=delta_num_rows, + seed=0, + use_threads=False, + ).to_pandas() + + pdf_fname = tmp_path / "pdfdeltaba.parquet" + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding=str_encoding, + data_page_version="2.0", + data_page_size=64 * 1024, + engine="pyarrow", + use_dictionary=False, + ) + cdf = cudf.read_parquet(pdf_fname) + pcdf = cudf.from_pandas(test_pdf) + assert_eq(cdf, pcdf) + + # Write back out with cudf and make sure pyarrow can read it + cudf_fname = tmp_path / "cdfdeltaba.parquet" + pcdf.to_parquet( + cudf_fname, + compression="snappy", + header_version="2.0", + use_dictionary=False, + ) + cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) + assert_eq(cdf2, cdf) + + +@pytest.mark.parametrize("add_nulls", [True, False]) +@pytest.mark.parametrize( + "str_encoding", ["DELTA_BYTE_ARRAY", "DELTA_LENGTH_BYTE_ARRAY"] +) +def test_delta_struct_list(tmp_path, delta_num_rows, add_nulls, str_encoding): + # Struct> + lists_per_row = 3 + list_size = 4 + num_rows = delta_num_rows + include_validity = add_nulls + + def list_gen_wrapped(x, y): + return list_row_gen( + int_gen, x * list_size * lists_per_row, list_size, lists_per_row + ) + + def string_list_gen_wrapped(x, y): + return list_row_gen( + string_gen, + x * list_size * lists_per_row, + list_size, + lists_per_row, + include_validity, + ) + + data = struct_gen( + [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped], + 0, + num_rows, + include_validity, + ) + test_pdf = pa.Table.from_pydict({"sol": data}).to_pandas() + pdf_fname = tmp_path / "pdfdeltaba.parquet" + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding={ + "sol.col0": "DELTA_BINARY_PACKED", + "sol.col1": str_encoding, + "sol.col2.list.element.list.element": "DELTA_BINARY_PACKED", + "sol.col3.list.element.list.element": str_encoding, + }, + data_page_version="2.0", + data_page_size=64 * 1024, + engine="pyarrow", + use_dictionary=False, + ) + # sanity check to verify file is written properly + assert_eq(test_pdf, 
+@pytest.mark.parametrize(
+    "data",
+    [
+        # Structs
+        {
+            "being": [
+                None,
+                {"human?": True, "Deets": {"Name": "Carrot", "Age": 27}},
+                {"human?": None, "Deets": {"Name": "Angua", "Age": 25}},
+                {"human?": False, "Deets": {"Name": "Cheery", "Age": 31}},
+                {"human?": False, "Deets": None},
+                {"human?": None, "Deets": {"Name": "Mr", "Age": None}},
+            ]
+        },
+        # List of Structs
+        {
+            "family": [
+                [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}],
+                [
+                    {"human?": None, "deets": {"weight": 5.3, "age": 25}},
+                    {"human?": False, "deets": {"weight": 8.0, "age": 31}},
+                    {"human?": False, "deets": None},
+                ],
+                [],
+                [{"human?": None, "deets": {"weight": 6.9, "age": None}}],
+            ]
+        },
+        # Struct of Lists
+        {
+            "Real estate records": [
+                None,
+                {
+                    "Status": "NRI",
+                    "Ownerships": {
+                        "land_unit": [None, 2, None],
+                        "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]],
+                    },
+                },
+                {
+                    "Status": None,
+                    "Ownerships": {
+                        "land_unit": [4, 5],
+                        "flats": [[7, 8], []],
+                    },
+                },
+                {
+                    "Status": "RI",
+                    "Ownerships": {"land_unit": None, "flats": [[]]},
+                },
+                {"Status": "RI", "Ownerships": None},
+                {
+                    "Status": None,
+                    "Ownerships": {
+                        "land_unit": [7, 8, 9],
+                        "flats": [[], [], []],
+                    },
+                },
+            ]
+        },
+    ],
+)
+def test_parquet_reader_nested_v2(tmp_path, data):
+    expect = pd.DataFrame(data)
+    pdf_fname = tmp_path / "pdfv2.parquet"
+    expect.to_parquet(pdf_fname, data_page_version="2.0")
+    assert_eq(cudf.read_parquet(pdf_fname), expect)
+
+
+@pytest.mark.filterwarnings("ignore:Using CPU")
+def test_parquet_writer_cpu_pyarrow(
+    tmp_path, pdf_day_timestamps, gdf_day_timestamps
+):
+    pdf_fname = tmp_path / "pdf.parquet"
+    gdf_fname = tmp_path / "gdf.parquet"
+
+    if len(pdf_day_timestamps) == 0:
+        pdf_day_timestamps = pdf_day_timestamps.reset_index(drop=True)
+        gdf_day_timestamps = gdf_day_timestamps.reset_index(drop=True)
+
+    pdf_day_timestamps.to_parquet(pdf_fname)
+    gdf_day_timestamps.to_parquet(gdf_fname, engine="pyarrow")
+
+    assert os.path.exists(pdf_fname)
+    assert os.path.exists(gdf_fname)
+
+    expect = pa.parquet.read_pandas(pdf_fname)
+    got = pa.parquet.read_pandas(gdf_fname)
+
+    assert_eq(expect, got)
+
+    def clone_field(table, name, datatype):
+        f = table.schema.field(name)
+        return pa.field(f.name, datatype, f.nullable, f.metadata)
+
+    # Pandas uses a datetime64[ns] while we use a datetime64[ms]
+    for t in [expect, got]:
+        for t_col in ["col_datetime64[ms]", "col_datetime64[us]"]:
+            idx = t.schema.get_field_index(t_col)
+            field = clone_field(t, t_col, pa.timestamp("ms"))
+            t = t.set_column(idx, field, t.column(idx).cast(field.type))
+            t = t.replace_schema_metadata()
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.filterwarnings("ignore:Using CPU")
+def test_parquet_writer_int96_timestamps(tmp_path, pdf, gdf):
+    gdf_fname = tmp_path / "gdf.parquet"
+
+    if len(pdf) == 0:
+        pdf = pdf.reset_index(drop=True)
+        gdf = gdf.reset_index(drop=True)
+
+    if "col_category" in pdf.columns:
+        pdf = pdf.drop(columns=["col_category"])
+    if "col_category" in gdf.columns:
+        gdf = gdf.drop(columns=["col_category"])
+
+    assert_eq(pdf, gdf)
+
+    # Write out the gdf using the GPU accelerated writer with INT96 timestamps
+    gdf.to_parquet(
+        gdf_fname,
+        index=None,
+        int96_timestamps=True,
+    )
+
+    assert os.path.exists(gdf_fname)
+
+    expect = pdf
+    got = pd.read_parquet(gdf_fname)
+
+    # verify INT96 timestamps were converted back to the same data.
+    assert_eq(expect, got, check_categorical=False, check_dtype=False)
+
+
+def test_multifile_parquet_folder(tmp_path):
+    test_pdf1 = pd.DataFrame(
+        {
+            "a": np.concatenate(
+                [np.arange(10, dtype="float64"), np.full(10, np.nan)]
+            )
+        },
+        index=pd.Index(np.arange(20)),
+    )
+    test_pdf2 = pd.DataFrame(
+        {"a": np.arange(10, dtype="float64")}, index=pd.Index(np.arange(10))
+    )
+    expect = pd.concat([test_pdf1, test_pdf2])
+
+    par_dir = tmp_path / "multi_part"
+    par_dir.mkdir()
+
+    create_parquet_source(test_pdf1, "filepath", par_dir / "multi1.parquet")
+    create_parquet_source(test_pdf2, "filepath", par_dir / "multi2.parquet")
+
+    got1 = cudf.read_parquet(tmp_path / "multi_part/*.parquet")
+    assert_eq(expect, got1)
+
+    got2 = cudf.read_parquet(tmp_path / "multi_part")
+    assert_eq(expect, got2)
+
+
+# Validates the metadata return path of the parquet writer
+def test_parquet_writer_return_metadata(tmp_path, simple_gdf):
+    gdf_fname = tmp_path / "data1.parquet"
+
+    # Write out the gdf using the GPU accelerated writer
+    df_metadata = simple_gdf.to_parquet(
+        gdf_fname, index=None, metadata_file_path="test/data1.parquet"
+    )
+    # Verify that we got a valid parquet signature in the initial metadata blob
+    assert df_metadata.tobytes()[0:4] == b"PAR1"
+
+    df_metadata_list1 = [df_metadata]
+    df_metadata_list2 = [df_metadata, df_metadata]
+    merged_metadata1 = merge_parquet_filemetadata(df_metadata_list1)
+    merged_metadata2 = merge_parquet_filemetadata(df_metadata_list2)
+
+    # Verify that we got a valid parquet signature in the final metadata blob
+    assert merged_metadata1.tobytes()[0:4] == b"PAR1"
+    assert merged_metadata2.tobytes()[0:4] == b"PAR1"
+
+    # Make sure aggregation is combining metadata correctly
+    fmd1 = pa.parquet.ParquetFile(BytesIO(merged_metadata1.tobytes())).metadata
+    fmd2 = pa.parquet.ParquetFile(BytesIO(merged_metadata2.tobytes())).metadata
+    assert fmd2.num_columns == fmd1.num_columns
+    assert fmd2.num_rows == 2 * fmd1.num_rows
+    assert fmd2.num_row_groups == 2 * fmd1.num_row_groups
+
+
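+# Usage sketch (illustrative; the path below is hypothetical): the merged
+# footer bytes produced above are what a dataset-level "_metadata" sidecar
+# file would contain, e.g.
+#     with open("dataset_dir/_metadata", "wb") as f:
+#         f.write(merged_metadata2.tobytes())
+
+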
+# Validates the integrity of the GPU accelerated parquet writer.
+def test_parquet_writer_gpu_none_index(tmp_path, simple_pdf, simple_gdf):
+    gdf_fname = tmp_path / "gdf.parquet"
+    pdf_fname = tmp_path / "pdf.parquet"
+
+    assert_eq(simple_pdf, simple_gdf)
+
+    # Write out the gdf using the GPU accelerated writer
+    simple_gdf.to_parquet(gdf_fname, index=None)
+    simple_pdf.to_parquet(pdf_fname, index=None)
+
+    assert os.path.exists(gdf_fname)
+    assert os.path.exists(pdf_fname)
+
+    expect = pd.read_parquet(pdf_fname)
+    got = pd.read_parquet(gdf_fname)
+
+    assert_eq(expect, got, check_categorical=False)
+
+
+def test_parquet_writer_gpu_true_index(tmp_path, simple_pdf, simple_gdf):
+    gdf_fname = tmp_path / "gdf.parquet"
+    pdf_fname = tmp_path / "pdf.parquet"
+
+    assert_eq(simple_pdf, simple_gdf)
+
+    # Write out the gdf using the GPU accelerated writer
+    simple_gdf.to_parquet(gdf_fname, index=True)
+    simple_pdf.to_parquet(pdf_fname, index=True)
+
+    assert os.path.exists(gdf_fname)
+    assert os.path.exists(pdf_fname)
+
+    expect = pd.read_parquet(pdf_fname)
+    got = pd.read_parquet(gdf_fname)
+
+    assert_eq(expect, got, check_categorical=False)
+
+
+def test_parquet_writer_gpu_false_index(tmp_path, simple_pdf, simple_gdf):
+    gdf_fname = tmp_path / "gdf.parquet"
+    pdf_fname = tmp_path / "pdf.parquet"
+
+    assert_eq(simple_pdf, simple_gdf)
+
+    # Write out the gdf using the GPU accelerated writer
+    simple_gdf.to_parquet(gdf_fname, index=False)
+    simple_pdf.to_parquet(pdf_fname, index=False)
+
+    assert os.path.exists(gdf_fname)
+    assert os.path.exists(pdf_fname)
+
+    expect = pd.read_parquet(pdf_fname)
+    got = pd.read_parquet(gdf_fname)
+
+    assert_eq(expect, got, check_categorical=False)
+
+
+def test_parquet_writer_gpu_multi_index(tmp_path, simple_pdf, simple_gdf):
+    gdf_fname = tmp_path / "gdf.parquet"
+    pdf_fname = tmp_path / "pdf.parquet"
+
+    simple_pdf = simple_pdf.set_index(["col_bool", "col_int8"])
+    simple_gdf = simple_gdf.set_index(["col_bool", "col_int8"])
+
+    assert_eq(simple_pdf, simple_gdf)
+
+    # Write out the gdf using the GPU accelerated writer
+    simple_gdf.to_parquet(gdf_fname, index=None)
+    simple_pdf.to_parquet(pdf_fname, index=None)
+
+    assert os.path.exists(gdf_fname)
+    assert os.path.exists(pdf_fname)
+
+    expect = pd.read_parquet(pdf_fname)
+    got = pd.read_parquet(gdf_fname)
+
+    assert_eq(expect, got, check_categorical=False)
+
+
+def test_parquet_writer_gpu_chunked(tmp_path, simple_pdf, simple_gdf):
+    gdf_fname = tmp_path / "gdf.parquet"
+
+    writer = ParquetWriter(gdf_fname)
+    writer.write_table(simple_gdf)
+    writer.write_table(simple_gdf)
+    writer.close()
+
+    assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf]))
+
+
+def test_parquet_writer_gpu_chunked_context(tmp_path, simple_pdf, simple_gdf):
+    gdf_fname = tmp_path / "gdf.parquet"
+
+    with ParquetWriter(gdf_fname) as writer:
+        writer.write_table(simple_gdf)
+        writer.write_table(simple_gdf)
+
+    got = pd.read_parquet(gdf_fname)
+    expect = pd.concat([simple_pdf, simple_pdf])
+    assert_eq(got, expect)
+
+
+def test_parquet_write_bytes_io(simple_gdf):
+    output = BytesIO()
+    simple_gdf.to_parquet(output)
+    assert_eq(cudf.read_parquet(output), simple_gdf)
+
+
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_writer_bytes_io(simple_gdf, store_schema):
+    output = BytesIO()
+
+    writer = ParquetWriter(output, store_schema=store_schema)
+    writer.write_table(simple_gdf)
+    writer.write_table(simple_gdf)
+    writer.close()
+
+    assert_eq(cudf.read_parquet(output), cudf.concat([simple_gdf, simple_gdf]))
+
+
{"row_group_size_bytes": 4 * 1024}, + {"row_group_size_rows": 5000}, + ], +) +def test_parquet_writer_row_group_size(tmp_path, row_group_size_kwargs): + # Check that row_group_size options are exposed in Python + # See https://github.com/rapidsai/cudf/issues/10978 + + size = 20000 + gdf = cudf.DataFrame({"a": range(size), "b": [1] * size}) + + fname = tmp_path / "gdf.parquet" + with ParquetWriter(fname, **row_group_size_kwargs) as writer: + writer.write_table(gdf) + + # Simple check for multiple row-groups + nrows, nrow_groups, columns, _, _ = cudf.io.parquet.read_parquet_metadata( + fname + ) + assert nrows == size + assert nrow_groups > 1 + assert columns == ["a", "b"] + + # Know the specific row-group count for row_group_size_rows + if "row_group_size_rows" in row_group_size_kwargs: + assert ( + nrow_groups == size // row_group_size_kwargs["row_group_size_rows"] + ) + + assert_eq(cudf.read_parquet(fname), gdf) + + +def test_parquet_writer_column_index(tmp_path): + # Simple test for presence of indices. validity is checked + # in libcudf tests. + # Write 2 files, one with column index set, one without. + # Make sure the former is larger in size. + + size = 20000 + gdf = cudf.DataFrame({"a": range(size), "b": [1] * size}) + + fname = tmp_path / "gdf.parquet" + with ParquetWriter(fname, statistics="ROWGROUP") as writer: + writer.write_table(gdf) + s1 = os.path.getsize(fname) + + fname = tmp_path / "gdfi.parquet" + with ParquetWriter(fname, statistics="COLUMN") as writer: + writer.write_table(gdf) + s2 = os.path.getsize(fname) + assert s2 > s1 + + +@pytest.mark.parametrize( + "max_page_size_kwargs", + [ + {"max_page_size_bytes": 4 * 1024}, + {"max_page_size_rows": 5000}, + ], +) +def test_parquet_writer_max_page_size(tmp_path, max_page_size_kwargs): + # Check that max_page_size options are exposed in Python + # Since we don't have access to page metadata, instead check that + # file written with more pages will be slightly larger + + size = 20000 + gdf = cudf.DataFrame({"a": range(size), "b": [1] * size}) + + fname = tmp_path / "gdf.parquet" + with ParquetWriter(fname, **max_page_size_kwargs) as writer: + writer.write_table(gdf) + s1 = os.path.getsize(fname) + + assert_eq(cudf.read_parquet(fname), gdf) + + fname = tmp_path / "gdf0.parquet" + with ParquetWriter(fname) as writer: + writer.write_table(gdf) + s2 = os.path.getsize(fname) + + assert_eq(cudf.read_parquet(fname), gdf) + assert s1 > s2 + + +@pytest.mark.parametrize("use_dict", [False, True]) +@pytest.mark.parametrize("max_dict_size", [0, 1048576]) +def test_parquet_writer_dictionary_setting(use_dict, max_dict_size): + # Simple test for checking the validity of dictionary encoding setting + # and behavior of ParquetWriter in cudf. + # Write a table with repetitive data with varying dictionary settings. + # Make sure the written columns are dictionary-encoded accordingly. 
+@pytest.mark.parametrize("use_dict", [False, True])
+@pytest.mark.parametrize("max_dict_size", [0, 1048576])
+def test_parquet_writer_dictionary_setting(use_dict, max_dict_size):
+    # Simple test for checking the validity of dictionary encoding setting
+    # and behavior of ParquetWriter in cudf.
+    # Write a table with repetitive data with varying dictionary settings.
+    # Make sure the written columns are dictionary-encoded accordingly.
+
+    # Table with repetitive data
+    table = cudf.DataFrame(
+        {
+            "int32": cudf.Series([1024] * 1024, dtype="int64"),
+        }
+    )
+
+    # Write to Parquet using ParquetWriter
+    buffer = BytesIO()
+    writer = ParquetWriter(
+        buffer,
+        use_dictionary=use_dict,
+        max_dictionary_size=max_dict_size,
+    )
+    writer.write_table(table)
+    writer.close()
+
+    # Read encodings from parquet file
+    got = pq.ParquetFile(buffer)
+    encodings = got.metadata.row_group(0).column(0).encodings
+
+    # Check for `PLAIN_DICTIONARY` encoding if dictionary encoding enabled
+    # and dictionary page limit > 0
+    if use_dict is True and max_dict_size > 0:
+        assert "PLAIN_DICTIONARY" in encodings
+    else:
+        assert "PLAIN_DICTIONARY" not in encodings
+
+
+@pytest.mark.parametrize("filename", ["myfile.parquet", None])
+@pytest.mark.parametrize("cols", [["b"], ["c", "b"]])
+def test_parquet_partitioned(tmpdir_factory, cols, filename):
+    rng = np.random.default_rng(seed=0)
+    # Checks that write_to_dataset is wrapping to_parquet
+    # as expected
+    gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
+    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
+    size = 100
+    pdf = pd.DataFrame(
+        {
+            "a": np.arange(0, stop=size, dtype="int64"),
+            "b": rng.choice(list("abcd"), size=size),
+            "c": rng.choice(np.arange(4), size=size),
+        }
+    )
+    pdf.to_parquet(pdf_dir, index=False, partition_cols=cols)
+    gdf = cudf.from_pandas(pdf)
+    gdf.to_parquet(
+        gdf_dir, index=False, partition_cols=cols, partition_file_name=filename
+    )
+
+    # Read back with pandas to compare
+    expect_pd = pd.read_parquet(pdf_dir)
+    got_pd = pd.read_parquet(gdf_dir)
+    assert_eq(expect_pd, got_pd)
+
+    # Check that cudf and pd return the same read
+    got_cudf = cudf.read_parquet(gdf_dir)
+    if isinstance(got_pd["c"].dtype, pd.CategoricalDtype):
+        # Work-around for pandas bug:
+        # https://github.com/pandas-dev/pandas/issues/53345
+        got_pd["c"] = got_pd["c"].astype(
+            pd.CategoricalDtype(
+                categories=got_pd["c"].dtype.categories.astype("int64"),
+                ordered=got_pd["c"].dtype.ordered,
+            )
+        )
+    assert_eq(got_pd, got_cudf)
+
+    # If filename is specified, check that it is correct
+    if filename:
+        for _, _, files in os.walk(gdf_dir):
+            for fn in files:
+                assert fn == filename
+
+
+@pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}])
+def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs):
+    rng = np.random.default_rng(seed=0)
+    # Checks that write_to_dataset is wrapping to_parquet
+    # as expected
+    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
+    size = 100
+    pdf = pd.DataFrame(
+        {
+            "a": np.arange(0, stop=size, dtype="int64"),
+            "b": rng.choice(list("abcd"), size=size),
+            "c": rng.choice(np.arange(4), size=size),
+        }
+    )
+    pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"])
+
+    with pytest.raises(NotImplementedError):
+        cudf.read_parquet(pdf_dir, **kwargs)
+
+
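+# Background (illustrative): to_parquet with partition_cols writes a
+# hive-style directory tree, e.g. gdf_dir/c=0/b=a/<file>.parquet, and
+# read_parquet reconstructs the partition columns from those paths; the
+# partitioned tests above and below rely on this layout.
+
+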
+@pytest.mark.parametrize("return_meta", [True, False])
+def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta):
+    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
+    gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
+
+    df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]})
+    df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]})
+
+    cw = ParquetDatasetWriter(gdf_dir, partition_cols=["a"], index=False)
+    cw.write_table(df1)
+    cw.write_table(df2)
+    meta_byte_array = cw.close(return_metadata=return_meta)
+    pdf = cudf.concat([df1, df2]).to_pandas()
+    pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"])
+
+    if return_meta:
+        fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata
+        assert fmd.num_rows == len(pdf)
+        assert fmd.num_row_groups == 4
+        files = {
+            os.path.join(directory, files[0])
+            for directory, _, files in os.walk(gdf_dir)
+            if files
+        }
+        meta_files = {
+            os.path.join(gdf_dir, fmd.row_group(i).column(c).file_path)
+            for i in range(fmd.num_row_groups)
+            for c in range(fmd.row_group(i).num_columns)
+        }
+        assert files == meta_files
+
+    # Read back with pandas to compare
+    expect_pd = pd.read_parquet(pdf_dir)
+    got_pd = pd.read_parquet(gdf_dir)
+    assert_eq(expect_pd, got_pd)
+
+    # Check that cudf and pd return the same read
+    got_cudf = cudf.read_parquet(gdf_dir)
+
+    # Work-around for pandas bug:
+    # https://github.com/pandas-dev/pandas/issues/53345
+    got_pd["a"] = got_pd["a"].astype(
+        pd.CategoricalDtype(
+            categories=got_pd["a"].dtype.categories.astype("int64"),
+            ordered=got_pd["a"].dtype.ordered,
+        )
+    )
+    assert_eq(got_pd, got_cudf)
+
+
+@pytest.mark.parametrize(
+    "max_file_size,max_file_size_in_bytes",
+    [("500KB", 500000), ("MB", 1000000)],
+)
+def test_parquet_writer_chunked_max_file_size(
+    tmpdir_factory, max_file_size, max_file_size_in_bytes
+):
+    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
+    gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
+
+    df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1] * 1000, "b": range(0, 5000)})
+    df2 = cudf.DataFrame(
+        {"a": [1, 3, 3, 1, 3] * 1000, "b": range(5000, 10000)}
+    )
+
+    cw = ParquetDatasetWriter(
+        gdf_dir,
+        partition_cols=["a"],
+        max_file_size=max_file_size,
+        file_name_prefix="sample",
+    )
+    cw.write_table(df1)
+    cw.write_table(df2)
+    cw.close()
+    pdf = cudf.concat([df1, df2]).to_pandas()
+    pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"])
+
+    expect_pd = pd.read_parquet(pdf_dir)
+    got_pd = pd.read_parquet(gdf_dir)
+
+    assert_eq(
+        expect_pd.sort_values(["b"]).reset_index(drop=True),
+        got_pd.sort_values(["b"]).reset_index(drop=True),
+    )
+
+    # Check that cudf and pd return the same read
+    got_cudf = cudf.read_parquet(gdf_dir)
+
+    # Work-around for pandas bug:
+    # https://github.com/pandas-dev/pandas/issues/53345
+    got_pd["a"] = got_pd["a"].astype(
+        pd.CategoricalDtype(
+            categories=got_pd["a"].dtype.categories.astype("int64"),
+            ordered=got_pd["a"].dtype.ordered,
+        )
+    )
+    assert_eq(
+        got_pd.sort_values(["b"]).reset_index(drop=True),
+        got_cudf.sort_values(["b"]).reset_index(drop=True),
+    )
+
+    all_files = glob.glob(gdf_dir + "/**/*.parquet", recursive=True)
+    for each_file in all_files:
+        # Validate that no output file exceeds the requested max_file_size
+        assert os.path.getsize(each_file) <= (max_file_size_in_bytes), (
+            "File exceeded max_file_size"
+        )
+
+
+def test_parquet_writer_chunked_max_file_size_error():
+    with pytest.raises(
+        ValueError,
+        match="file_name_prefix cannot be None if max_file_size is passed",
+    ):
+        ParquetDatasetWriter("sample", partition_cols=["a"], max_file_size=100)
+
+
+def test_parquet_writer_chunked_partitioned_context(tmpdir_factory):
+    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
+    gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
+
+    df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]})
+    df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]})
+
+    with ParquetDatasetWriter(
+        gdf_dir, partition_cols=["a"], index=False
+    ) as cw:
+        cw.write_table(df1)
+        cw.write_table(df2)
+
+    pdf = cudf.concat([df1, df2]).to_pandas()
+    pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"])
+
+    # Read back with pandas to compare
+    expect_pd = pd.read_parquet(pdf_dir)
+    got_pd = pd.read_parquet(gdf_dir)
+    assert_eq(expect_pd, got_pd)
+
+    # Check that cudf and pd return the same read
+    got_cudf = cudf.read_parquet(gdf_dir)
+
+    # Work-around for pandas bug:
+    # https://github.com/pandas-dev/pandas/issues/53345
+    got_pd["a"] = got_pd["a"].astype(
+        pd.CategoricalDtype(
+            categories=got_pd["a"].dtype.categories.astype("int64"),
+            ordered=got_pd["a"].dtype.ordered,
+        )
+    )
+    assert_eq(got_pd, got_cudf)
+
+
+@pytest.mark.parametrize("cols", [None, ["b"]])
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema):
+    rng = np.random.default_rng(seed=0)
+    dir1 = tmpdir_factory.mktemp("dir1")
+    dir2 = tmpdir_factory.mktemp("dir2")
+    if cols is None:
+        dir1 = dir1.join("file.pq")
+        dir2 = dir2.join("file.pq")
+    dir1 = str(dir1)
+    dir2 = str(dir2)
+
+    size = 100
+    gdf = cudf.DataFrame(
+        {
+            "a": np.arange(0, stop=size),
+            "b": rng.choice(np.arange(4), size=size),
+        }
+    )
+    gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema)
+    cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols)
+
+    # Read back with cudf
+    expect = cudf.read_parquet(dir1)
+    got = cudf.read_parquet(dir2)
+    assert_eq(expect, got)
+
+    gdf = cudf.DataFrame(
+        {
+            "a": cudf.Series([1, 2, 3]),
+            "b": cudf.Series([1, 2, 3]),
+            "c": cudf.Series(["a", "b", "c"], dtype="category"),
+        }
+    )
+    with pytest.raises(ValueError):
+        gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema)
+
+
+@pytest.mark.parametrize(
+    "pfilters",
+    [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]],
+)
+@pytest.mark.parametrize("selection", ["directory", "files", "row-groups"])
+@pytest.mark.parametrize("use_cat", [True, False])
+def test_read_parquet_partitioned_filtered(
+    tmp_path, pfilters, selection, use_cat
+):
+    rng = np.random.default_rng(2)
+    path = str(tmp_path)
+    size = 100
+    df = cudf.DataFrame(
+        {
+            "a": np.arange(0, stop=size, dtype="int64"),
+            "b": rng.choice(list("abcd"), size=size),
+            "c": rng.choice(np.arange(4), size=size),
+        }
+    )
+    df.to_parquet(path, partition_cols=["c", "b"])
+
+    if selection == "files":
+        # Pass in a list of paths
+        fs = get_fs_token_paths(path)[0]
+        read_path = fs.find(path)
+        row_groups = None
+    elif selection == "row-groups":
+        # Pass in a list of paths AND row-group ids
+        fs = get_fs_token_paths(path)[0]
+        read_path = fs.find(path)
+        row_groups = [[0] for p in read_path]
+    else:
+        # Pass in a directory path
+        # (row-group selection not allowed in this case)
+        read_path = path
+        row_groups = None
+
+    # Filter on partitioned columns
+    expect = pd.read_parquet(read_path, filters=pfilters)
+    got = cudf.read_parquet(
+        read_path,
+        filters=pfilters,
+        row_groups=row_groups,
+        categorical_partitions=use_cat,
+    )
+    expect["b"] = expect["b"].astype(str)
+    expect["c"] = expect["c"].astype(int)
+    if use_cat:
+        assert got.dtypes["b"] == "category"
+        assert got.dtypes["c"] == "category"
+        got["b"] = got["b"].astype(str)
+        got["c"] = got["c"].astype(int)
+    else:
+        # Check that we didn't get categorical columns; the partition
+        # keys were normalized to str/int above for comparison with pandas
+        assert got.dtypes["b"] == "object"
+        assert got.dtypes["c"] == "int"
+    assert_eq(expect, got)
+
+
rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), + } + ) + df.to_parquet(path, partition_cols=["c", "b"]) + got = cudf.read_parquet(path, filters=filters) + expect = pd.read_parquet(path, filters=filters) + + # Work-around for pandas bug: + # https://github.com/pandas-dev/pandas/issues/53345 + expect["c"] = expect["c"].astype( + pd.CategoricalDtype( + categories=expect["c"].dtype.categories.astype("int64"), + ordered=expect["c"].dtype.ordered, + ) + ) + assert_eq(expect, got) + + +def test_parquet_writer_chunked_metadata(tmp_path, simple_pdf, simple_gdf): + gdf_fname = tmp_path / "gdf.parquet" + test_path = "test/path" + + writer = ParquetWriter(gdf_fname) + writer.write_table(simple_gdf) + writer.write_table(simple_gdf) + meta_byte_array = writer.close(metadata_file_path=test_path) + fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata + + assert fmd.num_rows == 2 * len(simple_gdf) + assert fmd.num_row_groups == 2 + + for r in range(fmd.num_row_groups): + for c in range(fmd.num_columns): + assert fmd.row_group(r).column(c).file_path == test_path + + +def test_write_read_cudf(tmp_path, pdf): + file_path = tmp_path / "cudf.parquet" + if "col_category" in pdf.columns: + pdf = pdf.drop(columns=["col_category"]) + + gdf = cudf.from_pandas(pdf) + gdf.to_parquet(file_path) + gdf = cudf.read_parquet(file_path) + + assert_eq(gdf, pdf, check_index_type=not pdf.empty) + + +def test_write_cudf_read_pandas_pyarrow(tmp_path, pdf): + cudf_path = tmp_path / "cudf.parquet" + pandas_path = tmp_path / "pandas.parquet" + + if "col_category" in pdf.columns: + pdf = pdf.drop(columns=["col_category"]) + + df = cudf.from_pandas(pdf) + + df.to_parquet(cudf_path) + pdf.to_parquet(pandas_path) + + cudf_res = pd.read_parquet(cudf_path) + pd_res = pd.read_parquet(pandas_path) + + assert_eq(pd_res, cudf_res, check_index_type=not pdf.empty) + + cudf_res = pa.parquet.read_table( + cudf_path, use_pandas_metadata=True + ).to_pandas() + pd_res = pa.parquet.read_table( + pandas_path, use_pandas_metadata=True + ).to_pandas() + + assert_eq(cudf_res, pd_res, check_index_type=not pdf.empty) + + +def test_parquet_writer_criteo(tmp_path): + # To run this test, download the day 0 of criteo dataset from + # http://labs.criteo.com/2013/12/download-terabyte-click-logs/ + # and place the uncompressed dataset in the home directory + fname = os.path.expanduser("~/day_0") + if not os.path.isfile(fname): + pytest.skip("Local criteo day 0 tsv file is not found") + + cudf_path = tmp_path / "cudf.parquet" + + cont_names = ["I" + str(x) for x in range(1, 14)] + cat_names = ["C" + str(x) for x in range(1, 27)] + cols = ["label", *cont_names, *cat_names] + + df = cudf.read_csv(fname, sep="\t", names=cols, byte_range=(0, 1000000000)) + df = df.drop(columns=cont_names) + + df.to_parquet(cudf_path) + + +def test_trailing_nans(datadir, tmp_path): + fname = "trailing_nans.parquet" + file_path = datadir / fname + cu_df = cudf.read_parquet(file_path) + + tmp_file_path = tmp_path / fname + cu_df.to_parquet(tmp_file_path) + + pd.read_parquet(tmp_file_path) + + +def test_parquet_writer_sliced(tmp_path): + cudf_path = tmp_path / "cudf.parquet" + + df = pd.DataFrame() + df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) + df = cudf.from_pandas(df) + + df_select = df.iloc[1:3] + + df_select.to_parquet(cudf_path) + assert_eq(cudf.read_parquet(cudf_path), df_select) + + +def test_parquet_writer_list_basic(tmp_path): + expect = pd.DataFrame({"a": [[[1, 2], [3, 4]], None, [[5, 6], None]]}) + fname = tmp_path / 
"test_parquet_writer_list_basic.parquet" + + gdf = cudf.from_pandas(expect) + + gdf.to_parquet(fname) + assert os.path.exists(fname) + + got = pd.read_parquet(fname) + assert_eq(expect, got) + + +def test_parquet_writer_list_large(tmp_path): + gdf = cudf.DataFrame({"a": list_gen(int_gen, 128, 40, 25)}) + fname = tmp_path / "test_parquet_writer_list_large.parquet" + + gdf.to_parquet(fname) + assert os.path.exists(fname) + + got = pd.read_parquet(fname) + assert gdf.to_arrow().equals(pa.Table.from_pandas(got)) + + +def test_parquet_writer_list_large_mixed(tmp_path): + expect = pd.DataFrame( + { + "a": list_gen(string_gen, 64, 40, 25), + "b": list_gen(int_gen, 64, 40, 25), + "c": list_gen(int_gen, 64, 40, 25, include_validity=True), + "d": list_gen(string_gen, 64, 40, 25, include_validity=True), + } + ) + fname = tmp_path / "test_parquet_writer_list_large_mixed.parquet" + gdf = cudf.from_pandas(expect) + + gdf.to_parquet(fname) + assert os.path.exists(fname) + + got = pd.read_parquet(fname) + assert_eq(expect, got) + + +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_list_chunked(tmp_path, store_schema): + if store_schema and version.parse(pa.__version__) < version.parse( + "15.0.0" + ): + pytest.skip("https://github.com/apache/arrow/pull/37792") + table1 = cudf.DataFrame( + { + "a": list_gen(string_gen, 64, 40, 25), + "b": list_gen(int_gen, 64, 40, 25), + "c": list_gen(int_gen, 64, 40, 25, include_validity=True), + "d": list_gen(string_gen, 64, 40, 25, include_validity=True), + } + ) + table2 = cudf.DataFrame( + { + "a": list_gen(string_gen, 64, 40, 25), + "b": list_gen(int_gen, 64, 40, 25), + "c": list_gen(int_gen, 64, 40, 25, include_validity=True), + "d": list_gen(string_gen, 64, 40, 25, include_validity=True), + } + ) + fname = tmp_path / "test_parquet_writer_list_chunked.parquet" + expect = cudf.concat([table1, table2]) + expect = expect.reset_index(drop=True) + + with ParquetWriter(fname, store_schema=store_schema) as writer: + writer.write_table(table1) + writer.write_table(table2) + + assert os.path.exists(fname) + got = pq.read_table(fname) + # compare with pyarrow since pandas doesn't + # have a list or struct dtype + assert expect.to_arrow().equals(got) + + +def test_parquet_nullable_boolean(tmp_path, engine): + pandas_path = tmp_path / "pandas_bools.parquet" + + pdf = pd.DataFrame( + { + "a": pd.Series( + [True, False, None, True, False], dtype=pd.BooleanDtype() + ) + } + ) + expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]}) + + pdf.to_parquet(pandas_path) + with _hide_pyarrow_parquet_cpu_warnings(engine): + actual_gdf = cudf.read_parquet(pandas_path, engine=engine) + + assert_eq(actual_gdf, expected_gdf) + + +def run_parquet_index(pdf, index): + pandas_buffer = BytesIO() + cudf_buffer = BytesIO() + + gdf = cudf.from_pandas(pdf) + + pdf.to_parquet(pandas_buffer, index=index) + gdf.to_parquet(cudf_buffer, index=index) + + expected = pd.read_parquet(cudf_buffer) + actual = cudf.read_parquet(pandas_buffer) + + if expected.empty and actual.empty: + # We return RangeIndex columns compared + # to pandas' Index[object] columns + actual.columns = expected.columns + + assert_eq(expected, actual, check_index_type=True) + + expected = pd.read_parquet(pandas_buffer) + actual = cudf.read_parquet(cudf_buffer) + + if expected.empty and actual.empty: + # We return RangeIndex columns compared + # to pandas' Index[object] columns + actual.columns = expected.columns + + assert_eq( + expected, + actual, + check_index_type=True, + ) + + 
+@pytest.mark.parametrize(
+    "pdf",
+    [
+        pd.DataFrame(index=[1, 2, 3]),
+        pd.DataFrame({"a": [1, 2, 3]}, index=[0.43534, 345, 0.34534]),
+        pd.DataFrame(
+            {"b": [11, 22, 33], "c": ["a", "b", "c"]},
+            index=pd.Index(["a", "b", "c"], name="custom name"),
+        ),
+        pd.DataFrame(
+            {"a": [10, 11, 12], "b": [99, 88, 77]},
+            index=pd.RangeIndex(12, 17, 2),
+        ),
+        pd.DataFrame(
+            {"b": [99, 88, 77]},
+            index=pd.RangeIndex(22, 27, 2, name="hello index"),
+        ),
+        pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")),
+        pd.DataFrame(
+            {"a": ["a", "bb", "cc"], "b": [10, 21, 32]},
+            index=pd.MultiIndex.from_tuples([[1, 2], [10, 11], [15, 16]]),
+        ),
+        pd.DataFrame(
+            {"a": ["a", "bb", "cc"], "b": [10, 21, 32]},
+            index=pd.MultiIndex.from_tuples(
+                [[1, 2], [10, 11], [15, 16]], names=["first", "second"]
+            ),
+        ),
+    ],
+)
+@pytest.mark.parametrize("index", [None, True, False])
+def test_parquet_index(pdf, index):
+    run_parquet_index(pdf, index)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        pytest.param(
+            None,
+            marks=pytest.mark.xfail(
+                reason="https://github.com/apache/arrow/issues/40743"
+            ),
+        ),
+        True,
+    ],
+)
+def test_parquet_index_empty(index):
+    pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1))
+    run_parquet_index(pdf, index)
+
+
+def test_parquet_no_index_empty():
+    pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1))
+    run_parquet_index(pdf, index=False)
+
+
+def test_parquet_allnull_str(tmp_path, engine):
+    pandas_path = tmp_path / "pandas_allnulls.parquet"
+
+    pdf = pd.DataFrame(
+        {"a": pd.Series([None, None, None, None, None], dtype="str")}
+    )
+    expected_gdf = cudf.DataFrame(
+        {"a": cudf.Series([None, None, None, None, None], dtype="str")}
+    )
+
+    pdf.to_parquet(pandas_path)
+    with _hide_pyarrow_parquet_cpu_warnings(engine):
+        actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
+
+    assert_eq(actual_gdf, expected_gdf)
+
+
+def normalized_equals(value1, value2):
+    # Normalize missing values, datetime-likes, and timedeltas to
+    # comparable Python objects before comparing.
+    if value1 is pd.NA or value1 is pd.NaT:
+        value1 = None
+    if value2 is pd.NA or value2 is pd.NaT:
+        value2 = None
+    if isinstance(value1, np.datetime64):
+        value1 = pd.Timestamp(value1).to_pydatetime()
+    if isinstance(value2, np.datetime64):
+        value2 = pd.Timestamp(value2).to_pydatetime()
+    if isinstance(value1, pd.Timestamp):
+        value1 = value1.to_pydatetime()
+    if isinstance(value2, pd.Timestamp):
+        value2 = value2.to_pydatetime()
+    if isinstance(value1, datetime.datetime):
+        value1 = value1.replace(tzinfo=None)
+    if isinstance(value2, datetime.datetime):
+        value2 = value2.replace(tzinfo=None)
+    if isinstance(value1, pd.Timedelta):
+        unit = "ms" if value1.unit == "s" else value1.unit
+        value2 = pd.Timedelta(value2, unit=unit)
+
+    # if one is datetime then both values are datetimes now
+    if isinstance(value1, datetime.datetime):
+        return value1 == value2
+
+    # Compare integers with floats now
+    if isinstance(value1, float) or isinstance(value2, float):
+        return math.isclose(value1, value2)
+
+    return value1 == value2
+
+
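+# For instance (illustrative):
+#     normalized_equals(np.datetime64("2020-01-01"),
+#                       pd.Timestamp("2020-01-01"))  # -> True
+# both sides normalize to the same naive datetime.datetime, which is what
+# lets pyarrow row-group statistics be compared against pandas aggregates
+# in the statistics tests below.
+
+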
+@pytest.mark.parametrize("add_nulls", [True, False])
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_writer_statistics(tmp_path, pdf, add_nulls, store_schema):
+    if store_schema and version.parse(pa.__version__) < version.parse(
+        "15.0.0"
+    ):
+        pytest.skip("https://github.com/apache/arrow/pull/37792")
+    file_path = tmp_path / "cudf.parquet"
+    if "col_category" in pdf.columns:
+        pdf = pdf.drop(columns=["col_category", "col_bool"])
+
+    if not add_nulls:
+        # Timedelta types convert NaT to None when reading from parquet into
+        # pandas, which interferes with series.max()/min()
+        for t in TIMEDELTA_TYPES:
+            pdf["col_" + t] = pd.Series(np.arange(len(pdf.index))).astype(t)
+        # pyarrow can't read values with non-zero nanoseconds
+        pdf["col_timedelta64[ns]"] = pdf["col_timedelta64[ns]"] * 1000
+
+    gdf = cudf.from_pandas(pdf)
+    if add_nulls:
+        for col in gdf:
+            set_random_null_mask_inplace(gdf[col])
+    gdf.to_parquet(file_path, index=False, store_schema=store_schema)
+
+    # Read back from pyarrow
+    pq_file = pq.ParquetFile(file_path)
+    # verify each row group's statistics
+    for rg in range(0, pq_file.num_row_groups):
+        pd_slice = pq_file.read_row_group(rg).to_pandas()
+
+        # Statistics are per-column, so verify each column independently
+        for i, col in enumerate(pd_slice):
+            stats = pq_file.metadata.row_group(rg).column(i).statistics
+
+            actual_min = pd_slice[col].min()
+            stats_min = stats.min
+            assert normalized_equals(actual_min, stats_min)
+
+            actual_max = pd_slice[col].max()
+            stats_max = stats.max
+            assert normalized_equals(actual_max, stats_max)
+
+            assert stats.null_count == pd_slice[col].isna().sum()
+            assert stats.num_values == pd_slice[col].count()
+
+
+def test_parquet_writer_list_statistics(tmp_path):
+    df = pd.DataFrame(
+        {
+            "a": list_gen(string_gen, 64, 40, 25),
+            "b": list_gen(int_gen, 64, 40, 25),
+            "c": list_gen(int_gen, 64, 40, 25, include_validity=True),
+            "d": list_gen(string_gen, 64, 40, 25, include_validity=True),
+        }
+    )
+    fname = tmp_path / "test_parquet_writer_list_statistics.parquet"
+    gdf = cudf.from_pandas(df)
+
+    gdf.to_parquet(fname)
+    assert os.path.exists(fname)
+
+    # Read back from pyarrow
+    pq_file = pq.ParquetFile(fname)
+    # verify each row group's statistics
+    for rg in range(0, pq_file.num_row_groups):
+        pd_slice = pq_file.read_row_group(rg).to_pandas()
+
+        # Statistics are per-column, so verify each column independently
+        for i, col in enumerate(pd_slice):
+            stats = pq_file.metadata.row_group(rg).column(i).statistics
+
+            actual_min = pd_slice[col].explode().explode().dropna().min()
+            stats_min = stats.min
+            assert normalized_equals(actual_min, stats_min)
+
+            actual_max = pd_slice[col].explode().explode().dropna().max()
+            stats_max = stats.max
+            assert normalized_equals(actual_max, stats_max)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # Structs
+        {
+            "being": [
+                None,
+                {"human?": True, "Deets": {"Name": "Carrot", "Age": 27}},
+                {"human?": None, "Deets": {"Name": "Angua", "Age": 25}},
+                {"human?": False, "Deets": {"Name": "Cheery", "Age": 31}},
+                {"human?": False, "Deets": None},
+                {"human?": None, "Deets": {"Name": "Mr", "Age": None}},
+            ]
+        },
+        # List of Structs
+        {
+            "family": [
+                [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}],
+                [
+                    {"human?": None, "deets": {"weight": 5.3, "age": 25}},
+                    {"human?": False, "deets": {"weight": 8.0, "age": 31}},
+                    {"human?": False, "deets": None},
+                ],
+                [],
+                [{"human?": None, "deets": {"weight": 6.9, "age": None}}],
+            ]
+        },
+        # Struct of Lists
+        {
+            "Real estate records": [
+                None,
+                {
+                    "Status": "NRI",
+                    "Ownerships": {
+                        "land_unit": [None, 2, None],
+                        "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]],
+                    },
+                },
+                {
+                    "Status": None,
+                    "Ownerships": {
+                        "land_unit": [4, 5],
+                        "flats": [[7, 8], []],
+                    },
+                },
+                {
+                    "Status": "RI",
+                    "Ownerships": {"land_unit": None, "flats": [[]]},
+                },
+                {"Status": "RI", "Ownerships": None},
+                {
+                    "Status": None,
+                    "Ownerships": {
+                        "land_unit": [7, 8, 9],
+                        "flats": [[], [], []],
+                    },
+                },
+            ]
+        },
+    ],
+)
+def test_parquet_writer_nested(tmp_path, data):
+    expect = pd.DataFrame(data)
+    gdf = cudf.from_pandas(expect)
+
+    fname = tmp_path / "test_parquet_writer_nested.parquet"
+    gdf.to_parquet(fname)
+    assert os.path.exists(fname)
+
+    got = pd.read_parquet(fname)
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "decimal_type",
+    [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype],
+)
+@pytest.mark.parametrize("data", [[1, 2, 3], [0.00, 0.01, None, 0.5]])
+def test_parquet_writer_decimal(decimal_type, data):
+    gdf = cudf.DataFrame({"val": data})
+
+    gdf["dec_val"] = gdf["val"].astype(decimal_type(7, 2))
+
+    buff = BytesIO()
+    gdf.to_parquet(buff)
+
+    got = pd.read_parquet(buff, dtype_backend="numpy_nullable")
+    assert_eq(gdf["val"].to_pandas(nullable=True), got["val"])
+    assert_eq(gdf["dec_val"].to_pandas(), got["dec_val"])
+
+
+def test_parquet_writer_column_validation():
+    cudf_parquet = BytesIO()
+    pandas_parquet = BytesIO()
+    df = cudf.DataFrame({1: [1, 2, 3], "a": ["a", "b", "c"]})
+    pdf = df.to_pandas()
+
+    with cudf.option_context("mode.pandas_compatible", True):
+        with pytest.warns(UserWarning):
+            df.to_parquet(cudf_parquet)
+
+    with pytest.warns(UserWarning):
+        pdf.to_parquet(pandas_parquet)
+
+    assert_eq(
+        pd.read_parquet(cudf_parquet),
+        cudf.read_parquet(pandas_parquet),
+    )
+    assert_eq(
+        cudf.read_parquet(cudf_parquet),
+        pd.read_parquet(pandas_parquet),
+    )
+
+    with cudf.option_context("mode.pandas_compatible", False):
+        with pytest.raises(ValueError):
+            df.to_parquet(cudf_parquet)
+
+
+def test_parquet_writer_nulls_pandas_read(tmp_path, pdf):
+    if "col_bool" in pdf.columns:
+        pdf.drop(columns="col_bool", inplace=True)
+    if "col_category" in pdf.columns:
+        pdf.drop(columns="col_category", inplace=True)
+    gdf = cudf.from_pandas(pdf)
+
+    num_rows = len(gdf)
+
+    if num_rows > 0:
+        for col in gdf.columns:
+            gdf[col][random.randint(0, num_rows - 1)] = None
+
+    fname = tmp_path / "test_parquet_writer_nulls_pandas_read.parquet"
+    gdf.to_parquet(fname)
+    assert os.path.exists(fname)
+
+    got = pd.read_parquet(fname)
+    nullable = num_rows > 0
+
+    if nullable:
+        gdf = gdf.drop(columns="col_datetime64[ms]")
+        gdf = gdf.drop(columns="col_datetime64[us]")
+        got = got.drop(columns="col_datetime64[ms]")
+        got = got.drop(columns="col_datetime64[us]")
+
+    assert_eq(gdf.to_pandas(nullable=nullable), got)
+
+
+@pytest.mark.parametrize(
+    "decimal_type",
+    [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype],
+)
+def test_parquet_decimal_precision(tmp_path, decimal_type):
+    df = cudf.DataFrame({"val": ["3.5", "4.2"]}).astype(decimal_type(5, 2))
+    assert df.val.dtype.precision == 5
+
+    fname = tmp_path / "decimal_test.parquet"
+    df.to_parquet(fname)
+    df = cudf.read_parquet(fname)
+    assert df.val.dtype.precision == 5
+
+
+def test_parquet_decimal_precision_empty(tmp_path):
+    df = (
+        cudf.DataFrame({"val": ["3.5", "4.2"]})
+        .astype(cudf.Decimal64Dtype(5, 2))
+        .iloc[:0]
+    )
+    assert df.val.dtype.precision == 5
+
+    fname = tmp_path / "decimal_test.parquet"
+    df.to_parquet(fname)
+    df = cudf.read_parquet(fname)
+    assert df.val.dtype.precision == 5
+
+
+def test_parquet_reader_brotli(datadir):
+    fname = datadir / "brotli_int16.parquet"
+
+    expect = pd.read_parquet(fname)
+    got = cudf.read_parquet(fname).to_pandas(nullable=True)
+
+    assert_eq(expect, got)
+
+
+def test_parquet_reader_one_level_list(datadir):
+    fname = datadir / "one_level_list.parquet"
+
+    expect = pd.read_parquet(fname)
+    got = cudf.read_parquet(fname)
+
+    assert_eq(expect, got)
+
+
+def test_parquet_reader_binary_decimal(datadir):
+    fname = datadir / "binary_decimal.parquet"
+
+    expect = pd.read_parquet(fname)
+    got = cudf.read_parquet(fname).to_pandas()
+
+    assert_eq(expect, got)
+
+
+def test_parquet_reader_fixed_bin(datadir):
+    fname = datadir / "fixed_len_byte_array.parquet"
+
+    expect = pd.read_parquet(fname)
+    got = cudf.read_parquet(fname)
+
+    assert_eq(expect, got)
+
+
+def test_parquet_reader_fixed_len_with_dict(tmp_path):
+    def flba(i):
+        hasher = hashlib.sha256()
+        hasher.update(i.to_bytes(4, "little"))
+        return hasher.digest()
+
+    # use pyarrow to write table of fixed_len_byte_array
+    num_rows = 200
+    data = pa.array([flba(i) for i in range(num_rows)], type=pa.binary(32))
+    padf = pa.Table.from_arrays([data], names=["flba"])
+    padf_fname = tmp_path / "padf.parquet"
+    pq.write_table(padf, padf_fname, use_dictionary=True)
+
+    expect = pd.read_parquet(padf_fname)
+    got = cudf.read_parquet(padf_fname)
+    assert_eq(expect, got)
+
+
+def test_parquet_flba_round_trip(tmp_path):
+    def flba(i):
+        hasher = hashlib.sha256()
+        hasher.update(i.to_bytes(4, "little"))
+        return hasher.digest()
+
+    # use pyarrow to write table of fixed_len_byte_array
+    num_rows = 200
+    data = pa.array([flba(i) for i in range(num_rows)], type=pa.binary(32))
+    padf = pa.Table.from_arrays([data], names=["flba"])
+    padf_fname = tmp_path / "padf.parquet"
+    pq.write_table(padf, padf_fname)
+
+    # round trip data with cudf
+    cdf = cudf.read_parquet(padf_fname)
+    cdf_fname = tmp_path / "cdf.parquet"
+    cdf.to_parquet(cdf_fname, column_type_length={"flba": 32})
+
+    # now read back in with pyarrow to test it was written properly by cudf
+    padf2 = pq.read_table(padf_fname)
+    padf3 = pq.read_table(cdf_fname)
+    assert_eq(padf2, padf3)
+    assert_eq(padf2.schema[0].type, padf3.schema[0].type)
+
+
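+# Note (illustrative): cudf reads FIXED_LEN_BYTE_ARRAY data back as a
+# plain variable-width binary/string column, so the
+# column_type_length={"flba": 32} hint in the round trip above is what
+# asks the writer to emit FIXED_LEN_BYTE_ARRAY(32) again; the schema
+# assertion then confirms the fixed-width type survived the round trip.
+
+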
"USE_DEFAULT", + ], +) +def test_per_column_encoding_option(encoding): + pdf = pd.DataFrame({"ilist": [[1, 2, 3, 1, 2, 3]], "i1": [1]}) + cdf = cudf.from_pandas(pdf) + buffer = BytesIO() + cdf.to_parquet( + buffer, + column_encoding={"ilist.list.element": encoding}, + ) + # DICTIONARY and USE_DEFAULT should both result in a PLAIN_DICTIONARY encoding in parquet + encoding_name = ( + "PLAIN_DICTIONARY" + if encoding == "DICTIONARY" or encoding == "USE_DEFAULT" + else encoding + ) + pf = pq.ParquetFile(buffer) + fmd = pf.metadata + assert encoding_name in fmd.row_group(0).column(0).encodings + + +@pytest.mark.parametrize("compression", ["SNAPPY", "ZSTD"]) +def test_per_column_compression_option(set_decomp_env_vars, compression): + pdf = pd.DataFrame( + {"ilist": [[1, 2, 3, 1, 2, 3]], "i1": [[1, 2, 3, 1, 2, 3]]} + ) + cdf = cudf.from_pandas(pdf) + buffer = BytesIO() + cdf.to_parquet( + buffer, + compression=compression, + skip_compression={"ilist.list.element"}, + use_dictionary=False, # to make sure that data is compressible + ) + + pf = pq.ParquetFile(buffer) + fmd = pf.metadata + assert fmd.row_group(0).column(0).compression == "UNCOMPRESSED" + assert fmd.row_group(0).column(1).compression == compression + + +@pytest.mark.parametrize( + "encoding", + ["DELTA_LENGTH_BYTE_ARRAY", "DELTA_BYTE_ARRAY"], +) +def test_per_column_options_string_col(tmp_path, encoding): + pdf = pd.DataFrame({"s": ["a string"], "i1": [1]}) + cdf = cudf.from_pandas(pdf) + fname = tmp_path / "strcol.parquet" + cdf.to_parquet( + fname, + column_encoding={"s": encoding}, + compression="SNAPPY", + ) + pf = pq.ParquetFile(fname) + fmd = pf.metadata + assert encoding in fmd.row_group(0).column(0).encodings + + +@pytest.mark.skipif( + version.parse(pa.__version__) < version.parse("16.0.0"), + reason="https://github.com/apache/arrow/pull/39748", +) +def test_parquet_bss_round_trip(tmp_path): + num_rows = 200 + + def flba(i): + hasher = hashlib.sha256() + hasher.update(i.to_bytes(4, "little")) + return hasher.digest() + + # use pyarrow to write table of types that support BYTE_STREAM_SPLIT encoding + rows_per_rowgroup = 5000 + fixed_data = pa.array( + [flba(i) for i in range(num_rows)], type=pa.binary(32) + ) + i32_data = pa.array(list(range(num_rows)), type=pa.int32()) + i64_data = pa.array(list(range(num_rows)), type=pa.int64()) + f32_data = pa.array([float(i) for i in range(num_rows)], type=pa.float32()) + f64_data = pa.array([float(i) for i in range(num_rows)], type=pa.float64()) + padf = pa.Table.from_arrays( + [fixed_data, i32_data, i64_data, f32_data, f64_data], + names=["flba", "i32", "i64", "f32", "f64"], + ) + padf_fname = tmp_path / "padf.parquet" + pq.write_table( + padf, + padf_fname, + column_encoding="BYTE_STREAM_SPLIT", + use_dictionary=False, + row_group_size=rows_per_rowgroup, + ) + + # round trip data with cudf + cdf = cudf.read_parquet(padf_fname) + cdf_fname = tmp_path / "cdf.parquet" + cdf.to_parquet( + cdf_fname, + column_type_length={"flba": 32}, + column_encoding={ + "flba": "BYTE_STREAM_SPLIT", + "i32": "BYTE_STREAM_SPLIT", + "i64": "BYTE_STREAM_SPLIT", + "f32": "BYTE_STREAM_SPLIT", + "f64": "BYTE_STREAM_SPLIT", + }, + row_group_size_rows=rows_per_rowgroup, + ) + + # now read back in with pyarrow to test it was written properly by cudf + padf2 = pq.read_table(padf_fname) + padf3 = pq.read_table(cdf_fname) + assert_eq(padf2, padf3) + assert_eq(padf2.schema[0].type, padf3.schema[0].type) + + +def test_parquet_reader_rle_boolean(datadir): + fname = datadir / "rle_boolean_encoding.parquet" + + expect = 
pd.read_parquet(fname) + got = cudf.read_parquet(fname) + + assert_eq(expect, got) + + +# testing a specific bug-fix/edge case. +# specifically: in a parquet file containing a particular way of representing +# a list column in a schema, the cudf reader was confusing +# nesting information between a list column and a subsequent +# string column, ultimately causing a crash. +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Older versions of pandas do not have DataFrame.map()", +) +def test_parquet_reader_one_level_list2(datadir): + # we are reading in a file containing binary types, but cudf returns + # those as strings. so we have to massage the pandas data to get + # them to compare correctly. + def postprocess(val): + if isinstance(val, bytes): + return val.decode() + elif isinstance(val, np.ndarray): + return np.array([v.decode() for v in val]) + else: + return val + + fname = datadir / "one_level_list2.parquet" + + expect = pd.read_parquet(fname) + expect = expect.map(postprocess) + got = cudf.read_parquet(fname) + + assert_eq(expect, got, check_dtype=False) + + +# testing a specific bug-fix/edge case. +# specifically: in a parquet file containing a particular way of representing +# a list column in a schema, the cudf reader was confusing +# nesting information and building a list of list of int instead +# of a list of int +def test_parquet_reader_one_level_list3(datadir): + fname = datadir / "one_level_list3.parquet" + + expect = pd.read_parquet(fname) + got = cudf.read_parquet(fname) + + assert_eq(expect, got, check_dtype=True) + + +@pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000]) +@pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000]) +def test_to_parquet_row_group_size( + tmp_path, large_int64_gdf, size_bytes, size_rows +): + fname = tmp_path / "row_group_size.parquet" + large_int64_gdf.to_parquet( + fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows + ) + + num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata( + fname + ) + # 8 bytes per row, as the column is int64 + expected_num_row_groups = max( + math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes) + ) + assert expected_num_row_groups == row_groups + + +@pytest.mark.parametrize("size_rows", [500_000, 100_000, 10_000]) +def test_parquet_row_group_metadata(tmp_path, large_int64_gdf, size_rows): + fname = tmp_path / "row_group_size.parquet" + large_int64_gdf.to_parquet(fname, row_group_size_rows=size_rows) + + # read file metadata from parquet + ( + num_rows, + row_groups, + _, # col_names + _, # num_columns + row_group_metadata, + ) = cudf.io.read_parquet_metadata(fname) + + # length(RowGroupsMetaData) == number of row groups + assert len(row_group_metadata) == row_groups + # sum of rows in row groups == total rows + assert num_rows == sum( + [row_group["num_rows"] for row_group in row_group_metadata] + ) + + +def test_parquet_reader_decimal_columns(): + df = cudf.DataFrame( + { + "col1": cudf.Series([1, 2, 3], dtype=cudf.Decimal64Dtype(10, 2)), + "col2": [10, 11, 12], + "col3": [12, 13, 14], + "col4": ["a", "b", "c"], + } + ) + buffer = BytesIO() + df.to_parquet(buffer) + + actual = cudf.read_parquet(buffer, columns=["col3", "col2", "col1"]) + expected = pd.read_parquet(buffer, columns=["col3", "col2", "col1"]) + + assert_eq(actual, expected) + + +def test_parquet_reader_zstd_compression(datadir): + fname = datadir / "spark_zstd.parquet" + try: + df = cudf.read_parquet(fname) + pdf = pd.read_parquet(fname) +
assert_eq(df, pdf) + except RuntimeError: + pytest.skip(reason="zstd support is not enabled") + + +def test_read_parquet_multiple_files(tmp_path): + df_1_path = tmp_path / "df_1.parquet" + df_2_path = tmp_path / "df_2.parquet" + df_1 = cudf.DataFrame({"id": range(100), "a": [1] * 100}) + df_1.to_parquet(df_1_path) + + df_2 = cudf.DataFrame({"id": range(200, 2200), "a": [2] * 2000}) + df_2.to_parquet(df_2_path) + + expected = pd.read_parquet([df_1_path, df_2_path]) + actual = cudf.read_parquet([df_1_path, df_2_path]) + assert_eq(expected, actual) + + expected = pd.read_parquet([df_2_path, df_1_path]) + actual = cudf.read_parquet([df_2_path, df_1_path]) + assert_eq(expected, actual) + + +@pytest.mark.parametrize("index", [True, False, None]) +@pytest.mark.parametrize("columns", [None, [], ["b", "a"]]) +def test_parquet_columns_and_index_param(index, columns): + buffer = BytesIO() + df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + df.to_parquet(buffer, index=index) + + expected = pd.read_parquet(buffer, columns=columns) + got = cudf.read_parquet(buffer, columns=columns) + if columns == [] and index in {False, None}: + # cuDF returns RangeIndex columns compared + # to pandas' Index[object] columns + got.columns = expected.columns + + assert_eq(expected, got, check_index_type=True) + + +@pytest.mark.parametrize("columns", [None, ["b", "a"]]) +def test_parquet_columns_and_range_index(columns): + buffer = BytesIO() + df = cudf.DataFrame( + {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=pd.RangeIndex(2, 5) + ) + df.to_parquet(buffer) + + expected = pd.read_parquet(buffer, columns=columns) + got = cudf.read_parquet(buffer, columns=columns) + + assert_eq(expected, got, check_index_type=True) + + +def test_parquet_nested_struct_list(): + buffer = BytesIO() + data = { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + }, + "StreamId": "12345678", + "Duration": 10, + "Offset": 12, + "Resource": [{"Name": "ZoneName", "Value": "RAPIDS"}], + } + } + df = cudf.DataFrame({"a": cudf.Series(data)}) + + df.to_parquet(buffer) + expected = pd.read_parquet(buffer) + actual = cudf.read_parquet(buffer) + assert_eq(expected, actual) + assert_eq(actual.a.dtype, df.a.dtype) + + +def test_parquet_writer_zstd(): + size = 12345 + rng = np.random.default_rng(seed=0) + expected = cudf.DataFrame( + { + "a": np.arange(0, stop=size, dtype="float64"), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), + } + ) + + buff = BytesIO() + try: + expected.to_parquet(buff, compression="ZSTD") + except RuntimeError: + pytest.xfail(reason="Newer nvCOMP version is required") + else: + got = pd.read_parquet(buff) + assert_eq(expected, got) + + +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_time_delta_physical_type(store_schema): + df = cudf.DataFrame( + { + "s": cudf.Series([1], dtype="timedelta64[s]"), + "ms": cudf.Series([2], dtype="timedelta64[ms]"), + "us": cudf.Series([3], dtype="timedelta64[us]"), + # 4K because Pandas/pyarrow don't support non-zero nanoseconds + # in Parquet files + "ns": cudf.Series([4000], dtype="timedelta64[ns]"), + } + ) + buffer = BytesIO() + df.to_parquet(buffer, store_schema=store_schema) + + got = pd.read_parquet(buffer) + + if store_schema: + expected = pd.DataFrame( + { + "s": ["0 days 00:00:01"], + "ms": ["0 days 00:00:00.002000"], + "us": ["0 days 00:00:00.000003"], + "ns": ["0 days 00:00:00.000004"], + }, + dtype="str", + ) + else: + expected = pd.DataFrame( + { + "s":
["00:00:01"], + "ms": ["00:00:00.002000"], + "us": ["00:00:00.000003"], + "ns": ["00:00:00.000004"], + }, + dtype="str", + ) + assert_eq(got.astype("str"), expected) + + +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_roundtrip_time_delta(store_schema): + num_rows = 12345 + df = cudf.DataFrame( + { + "s": cudf.Series( + random.sample(range(0, 200000), num_rows), + dtype="timedelta64[s]", + ), + "ms": cudf.Series( + random.sample(range(0, 200000), num_rows), + dtype="timedelta64[ms]", + ), + "us": cudf.Series( + random.sample(range(0, 200000), num_rows), + dtype="timedelta64[us]", + ), + "ns": cudf.Series( + random.sample(range(0, 200000), num_rows), + dtype="timedelta64[ns]", + ), + } + ) + buffer = BytesIO() + df.to_parquet(buffer, store_schema=store_schema) + # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` + assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) + if store_schema: + assert_eq(df, pd.read_parquet(buffer)) + + +def test_parquet_reader_malformed_file(datadir): + fname = datadir / "nested-unsigned-malformed.parquet" + + # expect a failure when reading the whole file + with pytest.raises(RuntimeError): + cudf.read_parquet(fname) + + +def test_parquet_reader_unsupported_page_encoding(datadir): + fname = datadir / "delta_encoding.parquet" + + # expect a failure when reading the whole file + with pytest.raises(RuntimeError): + cudf.read_parquet(fname) + + +def test_parquet_reader_detect_bad_dictionary(datadir): + fname = datadir / "bad_dict.parquet" + + # expect a failure when reading the whole file + with pytest.raises(RuntimeError): + cudf.read_parquet(fname) + + +@pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) +@pytest.mark.parametrize("force_nullable_schema", [True, False]) +def test_parquet_writer_schema_nullability(data, force_nullable_schema): + df = cudf.DataFrame(data) + file_obj = BytesIO() + + df.to_parquet(file_obj, force_nullable_schema=force_nullable_schema) + + assert pa.parquet.read_schema(file_obj).field(0).nullable == ( + force_nullable_schema or df.isnull().any().any() + ) + + +def test_parquet_read_filter_and_project(): + # Filter on columns that are not included + # in the current column projection + + with BytesIO() as buffer: + # Write parquet data + df = cudf.DataFrame( + { + "a": [1, 2, 3, 4, 5] * 10, + "b": [0, 1, 2, 3, 4] * 10, + "c": range(50), + "d": [6, 7] * 25, + "e": [8, 9] * 25, + } + ) + df.to_parquet(buffer) + + # Read back with filter and projection + columns = ["b"] + filters = [[("a", "==", 5), ("c", ">", 20)]] + got = cudf.read_parquet(buffer, columns=columns, filters=filters) + + # Check result + expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True) + assert_eq(got, expected) + + +def test_parquet_reader_multiindex(): + expected = pd.DataFrame( + {"A": [1, 2, 3]}, + index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]), + ) + file_obj = BytesIO() + expected.to_parquet(file_obj, engine="pyarrow") + with pytest.warns(UserWarning): + actual = cudf.read_parquet(file_obj, engine="pyarrow") + assert_eq(actual, expected) + + +def test_parquet_reader_engine_error(): + with pytest.raises(ValueError): + cudf.read_parquet(BytesIO(), engine="abc") + + +def test_reader_lz4(): + pdf = pd.DataFrame({"ints": [1, 2] * 5001}) + + buffer = BytesIO() + pdf.to_parquet(buffer, compression="LZ4") + + got = cudf.read_parquet(buffer) + assert_eq(pdf, got) + + +def test_writer_lz4(): + gdf = cudf.DataFrame({"ints": [1, 2] * 5001}) + + 
buffer = BytesIO() + gdf.to_parquet(buffer, compression="LZ4") + + got = pd.read_parquet(buffer) + assert_eq(gdf, got) + + +def test_parquet_reader_zstd_huff_tables(datadir): + # Ensure that this zstd-compressed file does not overrun buffers. The + # problem was fixed in nvcomp 3.0.6. + # See https://github.com/rapidsai/cudf/issues/15096 + fname = datadir / "zstd_huff_tables_bug.parquet" + + expected = pa.parquet.read_table(fname).to_pandas() + actual = cudf.read_parquet(fname) + assert_eq(actual, expected) + + +def test_parquet_reader_roundtrip_with_arrow_schema(): + # Ensure that nested types are faithfully roundtripped across Parquet + # with the arrow schema, which is what allows duration types + # (timedelta64) to round trip through Parquet reads and writes. + pdf = pd.DataFrame( + { + "s": pd.Series([None, None, None], dtype="timedelta64[s]"), + "ms": pd.Series([1234, None, 32442], dtype="timedelta64[ms]"), + "us": pd.Series([None, 3456, None], dtype="timedelta64[us]"), + "ns": pd.Series([1234, 3456, 32442], dtype="timedelta64[ns]"), + "duration_list": list( + [ + [ + datetime.timedelta(minutes=7, seconds=4), + datetime.timedelta(minutes=7), + ], + [ + None, + None, + ], + [ + datetime.timedelta(minutes=7, seconds=4), + None, + ], + ] + ), + "int64": pd.Series([1234, 123, 4123], dtype="int64"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "datetime": pd.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "map": pd.Series(["cat", "dog", "lion"]).map( + {"cat": "kitten", "dog": "puppy", "lion": "cub"} + ), + } + ) + + # Write parquet with arrow for now (to write arrow:schema) + buffer = BytesIO() + pdf.to_parquet(buffer, engine="pyarrow") + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + # Check results for reader with schema + assert_eq(expected, got) + + # Reset buffer + buffer = BytesIO() + + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + # Check results for the cudf-written roundtrip as well + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration":
datetime.timedelta(seconds=1), + } + ], + } + } + } + ], + ], +) +def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmp_path, data): + # Ensure that the structs with duration types are faithfully being + # roundtripped across Parquet with arrow schema + pdf = pd.DataFrame({"struct": pd.Series(data)}) + + buffer = BytesIO() + pdf.to_parquet(buffer, engine="pyarrow") + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + # Check results + assert_eq(expected, got) + + # Reset buffer + buffer = BytesIO() + + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + # Check results + assert_eq(expected, got) + + +@pytest.mark.parametrize("index", [None, True, False]) +@pytest.mark.skipif( + version.parse(pa.__version__) < version.parse("15.0.0"), + reason="https://github.com/apache/arrow/pull/37792", +) +def test_parquet_writer_roundtrip_with_arrow_schema(index): + # Ensure that the concrete and nested types are faithfully being roundtripped + # across Parquet with arrow schema + expected = cudf.DataFrame( + { + "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), + "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), + "duration_list": list( + [ + [ + datetime.timedelta(minutes=7, seconds=4), + datetime.timedelta(minutes=7), + ], + [ + None, + None, + ], + [ + datetime.timedelta(minutes=7, seconds=4), + None, + ], + ] + ), + "int64": cudf.Series([-1234, 123, 4123], dtype="int64"), + "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "bool": cudf.Series([True, None, False], dtype=bool), + "fixed32": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal32Dtype(7, 2) + ), + "fixed64": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal64Dtype(7, 2) + ), + "fixed128": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal128Dtype(7, 2) + ), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "map": cudf.Series(["cat", "dog", "lion"]).map( + {"cat": "kitten", "dog": "puppy", "lion": "cub"} + ), + } + ) + + # Convert decimals32/64 to decimal128 if pyarrow version is < 19.0.0 + if version.parse(pa.__version__) < version.parse("19.0.0"): + expected = expected.astype({"fixed32": cudf.Decimal128Dtype(9, 2)}) + expected = expected.astype({"fixed64": cudf.Decimal128Dtype(18, 2)}) + + # Write to Parquet with arrow schema for faithful roundtrip + buffer = BytesIO() + expected.to_parquet(buffer, store_schema=True, index=index) + + # Read parquet with pyarrow, pandas and cudf readers + got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) + got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) + got3 = cudf.read_parquet(buffer) + + # drop the index column for comparison: __index_level_0__ + if index: + got.drop(columns="__index_level_0__", inplace=True) + got2.drop(columns="__index_level_0__", inplace=True) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + assert_eq(expected, got3) + + +def test_parquet_writer_int96_timestamps_and_arrow_schema(): + df = cudf.DataFrame( + { + "timestamp": cudf.Series( + [1234, 123, 4123], dtype="datetime64[ms]" + ), + } + ) + + # Output buffer + buffer = BytesIO() + + # Writing out parquet with both INT96 timestamps and arrow_schema + # enabled 
should throw an exception. + with pytest.raises(RuntimeError): + df.to_parquet(buffer, int96_timestamps=True, store_schema=True) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } + } + } + ], + ], +) +@pytest.mark.parametrize("index", [None, True, False]) +@pytest.mark.skipif( + version.parse(pa.__version__) < version.parse("15.0.0"), + reason="https://github.com/apache/arrow/pull/37792", +) +def test_parquet_writer_roundtrip_structs_with_arrow_schema(data, index): + # Ensure that the structs are faithfully being roundtripped across + # Parquet with arrow schema + pa_expected = pa.Table.from_pydict({"struct": data}) + + expected = cudf.DataFrame.from_arrow(pa_expected) + + # Write expected data frame to Parquet with arrow schema + buffer = BytesIO() + expected.to_parquet(buffer, store_schema=True, index=index) + + # Read Parquet with pyarrow + pa_got = pq.read_table(buffer) + + # drop the index column for comparison: __index_level_0__ + if index: + pa_got = pa_got.drop(columns="__index_level_0__") + + # Check results + assert_eq(pa_expected, pa_got) + + # Convert to cuDF table and also read Parquet with cuDF reader + got = cudf.DataFrame.from_arrow(pa_got) + got2 = cudf.read_parquet(buffer) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + + +@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) +@pytest.mark.parametrize("use_pandas_metadata", [True, False]) +@pytest.mark.parametrize("row_groups", [[[0]], None, [[0, 1]]]) +def test_parquet_chunked_reader( + chunk_read_limit, pass_read_limit, use_pandas_metadata, row_groups +): + df = pd.DataFrame( + {"a": [1, 2, 3, None] * 1000, "b": ["av", "qw", None, "xyz"] * 1000} + ) + buffer = BytesIO() + df.to_parquet(buffer, row_group_size=1000) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + use_pandas_metadata=use_pandas_metadata, + row_groups=row_groups, + ) + expected = cudf.read_parquet( + buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups + ) + assert_eq(expected, actual) 
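+ +# A minimal usage sketch (for illustration; not an upstream test) of the +# low-memory reader exercised above. `_chunk_read_limit` and +# `_pass_read_limit` are internal knobs used only by these tests; roughly, +# they cap the size of each output chunk and the intermediate memory used +# per decoding pass. A typical caller (the file path here is hypothetical) +# would just enable the option: +# +# with cudf.option_context("io.parquet.low_memory", True): +# df = cudf.read_parquet("large_file.parquet") +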
+ + +@pytest.mark.parametrize("chunk_read_limit", [256, 2560]) +@pytest.mark.parametrize("pass_read_limit", [256, 2560]) +@pytest.mark.parametrize("num_rows", [49, 291]) +@pytest.mark.parametrize("skip_rows", [412, 601]) +@pytest.mark.parametrize("data_size", [100, 200]) +def test_parquet_chunked_reader_structs( + chunk_read_limit, pass_read_limit, num_rows, skip_rows, data_size +): + data = [ + { + "a": "g", + "b": { + "b_a": 10, + "b_b": {"b_b_b": None, "b_b_a": 2}, + }, + "c": None, + }, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, + {"a": "j", "b": None, "c": [8, 10]}, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, + None, + { + "a": None, + "b": {"b_a": None, "b_b": {"b_b_b": 1}}, + "c": [18, 19], + }, + {"a": None, "b": None, "c": None}, + ] * data_size + + pa_struct = pa.Table.from_pydict({"struct": data}) + df = cudf.DataFrame.from_arrow(pa_struct) + buffer = BytesIO() + df.to_parquet(buffer, row_group_size_rows=7000, max_page_size_rows=100) + + # Number of rows to read + nrows = num_rows if skip_rows + num_rows < len(df) else len(df) - skip_rows + + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + skip_rows=skip_rows, + ).reset_index(drop=True) + expected = cudf.read_parquet( + buffer, nrows=nrows, skip_rows=skip_rows + ).reset_index(drop=True) + assert_eq(expected, actual) + + +@pytest.mark.parametrize("chunk_read_limit", [0, 24, 10240000]) +@pytest.mark.parametrize("pass_read_limit", [0, 24, 10240000]) +@pytest.mark.parametrize("num_rows", [47, 97, None]) +@pytest.mark.parametrize( + "str_encoding", + [ + "PLAIN", + "DELTA_BYTE_ARRAY", + "DELTA_LENGTH_BYTE_ARRAY", + ], +) +def test_parquet_chunked_reader_string_decoders( + chunk_read_limit, + pass_read_limit, + num_rows, + str_encoding, +): + df = pd.DataFrame( + { + "i64": [1, 2, 3, None] * 100, + "str": ["av", "qw", "asd", "xyz"] * 100, + "list": list( + [["ad", "cd"], ["asd", "fd"], None, ["asd", None]] * 100 + ), + } + ) + buffer = BytesIO() + # Write 4 Parquet row groups with string column encoded + df.to_parquet( + buffer, + row_group_size=100, + use_dictionary=False, + column_encoding={"str": str_encoding}, + ) + + # Number of rows to read + nrows = num_rows if num_rows is not None else len(df) + + # Check with num_rows specified + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + ) + expected = cudf.read_parquet( + buffer, + nrows=nrows, + ) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "nrows, skip_rows", + [ + (0, 0), + (99, 101), + (988, 61), + (99, 1011), + (101, 1601), + (99, 1901), + ], +) +@pytest.mark.parametrize( + "row_group_size_rows, page_size_rows", + [ + (1000, 1000), # 1 RG, 1 page per RG + (1000, 100), # 1 RG, multiple pages per RG + (100, 100), # multiple RGs, 1 page per RG + (100, 10), # multiple RGs, multiple pages per RG + ], +) +@pytest.mark.parametrize( + "chunk_read_limit, pass_read_limit", + [ + (256, 256), # small chunk and pass read limits + (0, 1024), # zero chunk and small pass read limit + (256, 0), # small chunk and zero pass read limit + (256000, 256000), # large chunk and pass read limits + ], +) +def test_chunked_parquet_reader_nrows_skiprows( + nrows, + skip_rows, + row_group_size_rows, + page_size_rows, + chunk_read_limit, + pass_read_limit, +): + df = 
cudf.DataFrame( + { + "a": list( + [ + ["cat", "lion", "deer"], + ["bear", "ibex", None], + ["tiger", None, "bull"], + [None, "wolf", "fox"], + ] + ) + * 500, + "b": ["av", "qw", None, "xyz"] * 500, + } + ) + expected = df[skip_rows : skip_rows + nrows] + buffer = BytesIO() + df.to_parquet( + buffer, + row_group_size_rows=row_group_size_rows, + max_page_size_rows=page_size_rows, + ) + got = cudf.read_parquet(buffer, nrows=nrows, skip_rows=skip_rows) + assert_eq(expected, got) + + # Check for chunked parquet reader + with cudf.option_context("io.parquet.low_memory", True): + got = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + skip_rows=skip_rows, + ).reset_index(drop=True) + # Reset index for comparison + expected = expected.reset_index(drop=True) + assert_eq(expected, got) + + +def test_parquet_reader_pandas_compatibility(): + df = pd.DataFrame( + {"a": [1, 2, 3, 4] * 10000, "b": ["av", "qw", "hi", "xyz"] * 10000} + ) + buffer = BytesIO() + df.to_parquet(buffer) + with cudf.option_context("io.parquet.low_memory", True): + expected = cudf.read_parquet(buffer) + assert_eq(expected, df) + + +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_reader_with_mismatched_tables(store_schema): + # cuDF tables with mixed types + df1 = cudf.DataFrame( + { + "i32": cudf.Series([None, None, None], dtype="int32"), + "i64": cudf.Series([1234, 467, 123], dtype="int64"), + "list": list([[1, 2], None, [None, 6]]), + "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "str": ["vfd", None, "ghu"], + "d_list": list( + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [None, pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), None], + ] + ), + } + ) + + df2 = cudf.DataFrame( + { + "str": ["abc", "def", "ghi"], + "i64": cudf.Series([None, 65, 98], dtype="int64"), + "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"), + "list": list([[7, 8], [9, 10], [11, 12]]), + "d_list": list( + [ + [pd.Timedelta(minutes=4), None], + None, + [pd.Timedelta(minutes=6), None], + ] + ), + } + ) + + # IO buffers + buf1 = BytesIO() + buf2 = BytesIO() + + # Write Parquet with and without arrow schema + df1.to_parquet(buf1, store_schema=store_schema) + df2.to_parquet(buf2, store_schema=store_schema) + + # Read mismatched Parquet files + got = cudf.read_parquet( + [buf1, buf2], + columns=["list", "d_list", "str"], + filters=[("i64", ">", 20)], + allow_mismatched_pq_schemas=True, + ) + + # Construct the expected table + expected = cudf.concat( + [ + df1[df1["i64"] > 20][["list", "d_list", "str"]], + df2[df2["i64"] > 20][["list", "d_list", "str"]], + ] + ).reset_index(drop=True) + + # Read with chunked reader (filter columns not supported) + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["list", "d_list", "str"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) + + # Construct the expected table without filter columns + expected_chunked = cudf.concat( + [df1[["list", "d_list", "str"]], df2[["list", "d_list", "str"]]] + ).reset_index(drop=True) + + # Check results + assert_eq(expected, got) + assert_eq(expected_chunked, got_chunked) + + +def test_parquet_reader_with_mismatched_structs(): + data1 = [ + { + "a": 1, + "b": { + "a_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 2}, + }, + "c": 2, + }, + { + "a": 3, + "b": {"b_a": 30, "b_b": {"b_b_a": 210}}, + "c": 4, + }, + {"a": 5, "b": {"b_a": 50, "b_b": 
None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None}, + ] + + data2 = [ + {"a": 1, "b": {"b_b": {"b_b_a": None}}}, + {"a": 5, "b": {"b_b": None}}, + {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}}, + {"a": None, "b": {"b_b": None}}, + None, + ] + + # cuDF tables from struct data + df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1})) + df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2})) + + # Buffers + buf1 = BytesIO() + buf2 = BytesIO() + + # Write to parquet + df1.to_parquet(buf1) + df2.to_parquet(buf2) + + # Read the struct.b.inner_b.inner_inner_a column from parquet + got = cudf.read_parquet( + [buf1, buf2], + columns=["struct.b.b_b.b_b_a"], + allow_mismatched_pq_schemas=True, + ) + got = ( + cudf.Series(got["struct"]) + .struct.field("b") + .struct.field("b_b") + .struct.field("b_b_a") + ) + + # Read with chunked reader + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["struct.b.b_b.b_b_a"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) + got_chunked = ( + cudf.Series(got_chunked["struct"]) + .struct.field("b") + .struct.field("b_b") + .struct.field("b_b_a") + ) + + # Construct the expected series + expected = cudf.concat( + [ + cudf.Series(df1["struct"]) + .struct.field("b") + .struct.field("b_b") + .struct.field("b_b_a"), + cudf.Series(df2["struct"]) + .struct.field("b") + .struct.field("b_b") + .struct.field("b_b_a"), + ] + ).reset_index(drop=True) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got_chunked) + + +def test_parquet_reader_with_mismatched_schemas_error(): + df1 = cudf.DataFrame( + { + "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"), + "i64": cudf.Series([123, 3454, 123], dtype="int64"), + "i32": cudf.Series([123, 3454, 123], dtype="int32"), + } + ) + df2 = cudf.DataFrame( + { + "i64": cudf.Series([123, 3454, 123], dtype="int64"), + "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"), + } + ) + + buf1 = BytesIO() + buf2 = BytesIO() + + df1.to_parquet(buf1, store_schema=True) + df2.to_parquet(buf2, store_schema=False) + + with pytest.raises( + ValueError, + match="Encountered mismatching SchemaElement properties for a column in the selected path", + ): + cudf.read_parquet( + [buf1, buf2], columns=["millis"], allow_mismatched_pq_schemas=True + ) + + data1 = [ + {"a": 1, "b": {"b_a": 1, "b_b": 6}}, + {"a": 3, "b": {"b_a": None, "b_b": 2}}, + ] + data2 = [ + {"b": {"b_a": 1}, "c": "str"}, + {"b": {"b_a": None}, "c": None}, + ] + + # cuDF tables from struct data + df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1})) + df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2})) + + # Buffers + buf1 = BytesIO() + buf2 = BytesIO() + + # Write to parquet + df1.to_parquet(buf1) + df2.to_parquet(buf2) + + with pytest.raises( + IndexError, + match="Encountered mismatching number of children for a column in the selected path", + ): + cudf.read_parquet( + [buf1, buf2], + columns=["struct.b"], + allow_mismatched_pq_schemas=True, + ) + + with pytest.raises( + IndexError, + match="Encountered mismatching schema tree depths across data sources", + ): + cudf.read_parquet( + [buf1, buf2], + columns=["struct.b.b_b"], + allow_mismatched_pq_schemas=True, + ) + + +def test_parquet_roundtrip_zero_rows_no_column_mask(): + expected = cudf.DataFrame._from_data( + { + "int": cudf.core.column.column_empty(0, 
np.dtype(np.int64)), + "float": cudf.core.column.column_empty(0, np.dtype(np.float64)), + "datetime": cudf.core.column.column_empty( + 0, np.dtype("datetime64[ns]") + ), + "timedelta": cudf.core.column.column_empty( + 0, np.dtype("timedelta64[ns]") + ), + "bool": cudf.core.column.column_empty(0, np.dtype(np.bool_)), + "decimal": cudf.core.column.column_empty( + 0, cudf.Decimal64Dtype(1) + ), + "struct": cudf.core.column.column_empty( + 0, cudf.StructDtype({"a": "int64"}) + ), + "list": cudf.core.column.column_empty( + 0, cudf.ListDtype("float64") + ), + } + ) + with BytesIO() as bio: + expected.to_parquet(bio) + result = cudf.read_parquet(bio) + assert_eq(result, expected) + + +def test_parquet_reader_mismatched_nullability(): + # Ensure that we can faithfully read the tables with mismatched nullabilities + df1 = cudf.DataFrame( + { + "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + None, + [pd.Timedelta(minutes=8), None], + ], + None, + ], + None, + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, None, 4123], dtype="int64"), + "int32": cudf.Series([1234, 123, 4123], dtype="int32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "string": cudf.Series(["kitten", "puppy", "cub"]), + } + ) + + df2 = cudf.DataFrame( + { + "timedelta": cudf.Series( + [None, None, None], dtype="timedelta64[ms]" + ), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)], + ], + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, 123, 4123], dtype="int64"), + "int32": cudf.Series([1234, None, 4123], dtype="int32"), + "list": list([[1, 2], None, [1, 2]]), + "datetime": cudf.Series( + [1234, None, 4123], dtype="datetime64[ms]" + ), + "string": cudf.Series(["kitten", None, "cub"]), + } + ) + + # Write tables to parquet with arrow schema for compatibility for duration column(s) + fname1 = BytesIO() + df1.to_parquet(fname1, store_schema=True) + fname2 = BytesIO() + df2.to_parquet(fname2, store_schema=True) + + # Read tables back with cudf and arrow in either order and compare + assert_eq( + cudf.read_parquet([fname1, fname2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([fname2, fname1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) + + +def test_parquet_reader_mismatched_nullability_structs(tmp_path): + data1 = [ + { + "a": "a", + "b": { + "b_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 12}, + }, + "c": [1, 2], + }, + { + "a": "b", + "b": { + "b_a": 30, + "b_b": {"b_b_b": 2, "b_b_a": 2}, + }, + "c": [3, 4], + }, + { + "a": "c", + "b": { + "b_a": 50, + "b_b": {"b_b_b": 4, "b_b_a": 5}, + }, + "c": [5, 6], + }, + { + "a": "d", + "b": { + "b_a": 135, + "b_b": {"b_b_b": 12, "b_b_a": 32}, + }, + "c": [7, 8], + }, + { + "a": "e", + "b": { + "b_a": 1, + "b_b": {"b_b_b": 1, "b_b_a": 5}, + }, + "c": [9, 10], + }, 
+ { + "a": "f", + "b": { + "b_a": 32, + "b_b": {"b_b_b": 1, "b_b_a": 6}, + }, + "c": [11, 12], + }, + ] + + data2 = [ + { + "a": "g", + "b": { + "b_a": 10, + "b_b": {"b_b_b": None, "b_b_a": 2}, + }, + "c": None, + }, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, + {"a": "j", "b": None, "c": [8, 10]}, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, + None, + { + "a": None, + "b": {"b_a": None, "b_b": {"b_b_b": 1}}, + "c": [18, 19], + }, + {"a": None, "b": None, "c": None}, + ] + + pa_table1 = pa.Table.from_pydict({"struct": data1}) + df1 = cudf.DataFrame.from_arrow(pa_table1) + + pa_table2 = pa.Table.from_pydict({"struct": data2}) + df2 = cudf.DataFrame.from_arrow(pa_table2) + + # Write tables to parquet + buf1 = BytesIO() + df1.to_parquet(buf1) + buf2 = BytesIO() + df2.to_parquet(buf2) + + # Read tables back with cudf and compare with expected. + assert_eq( + cudf.read_parquet([buf1, buf2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([buf2, buf1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) + + +@pytest.mark.skipif( + pa.__version__ == "19.0.0", + reason="https://github.com/rapidsai/cudf/issues/17806", +) +@pytest.mark.parametrize( + "stats_fname,bloom_filter_fname", + [ + ( + "mixed_card_ndv_100_chunk_stats.snappy.parquet", + "mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet", + ), + ( + "mixed_card_ndv_500_chunk_stats.snappy.parquet", + "mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet", + ), + ], +) +@pytest.mark.parametrize( + "predicate,expected_len", + [ + ([[("str", "==", "FINDME")], [("fp64", "==", float(500))]], 2), + ([("fixed_pt", "==", decimal.Decimal(float(500)))], 2), + ([[("ui32", "==", np.uint32(500)), ("str", "==", "FINDME")]], 2), + ([[("str", "==", "FINDME")], [("ui32", ">=", np.uint32(0))]], 1000), + ( + [ + ("str", "!=", "FINDME"), + ("fixed_pt", "==", decimal.Decimal(float(500))), + ], + 0, + ), + ], +) +def test_parquet_bloom_filters( + datadir, stats_fname, bloom_filter_fname, predicate, expected_len +): + fname_stats = datadir / stats_fname + fname_bf = datadir / bloom_filter_fname + df_stats = cudf.read_parquet(fname_stats, filters=predicate).reset_index( + drop=True + ) + df_bf = cudf.read_parquet(fname_bf, filters=predicate).reset_index( + drop=True + ) + + # Check if tables equal + assert_eq( + df_stats, + df_bf, + ) + + # Check for table length + assert_eq( + len(df_stats), + expected_len, + ) + + +@pytest.fixture(params=["cuda", "pool", "cuda_async"]) +def memory_resource(request): + import rmm + + current_mr = rmm.mr.get_current_device_resource() + + kind = request.param + if kind == "cuda": + mr = rmm.mr.CudaMemoryResource() + elif kind == "pool": + base = rmm.mr.CudaMemoryResource() + free, _ = rmm.mr.available_device_memory() + size = int(round(free * 0.5 / 256) * 256) + mr = rmm.mr.PoolMemoryResource(base, size, size) + elif kind == "cuda_async": + mr = rmm.mr.CudaAsyncMemoryResource() + + rmm.mr.set_current_device_resource(mr) + + try: + yield mr + finally: + rmm.mr.set_current_device_resource(current_mr) + + +@pytest.mark.parametrize("columns", [["r_reason_desc"], None]) +def test_parquet_bloom_filters_alignment(datadir, columns, memory_resource): + fname = datadir / "bloom_filter_alignment.parquet" + filters = [("r_reason_desc", "==", "Did not like the color")] + + # Read expected table using pyarrow + expected = pq.read_table(fname, columns=columns, filters=filters) + + # Read with cudf using the memory resource from fixture + read = cudf.read_parquet( + fname, 
columns=columns, filters=filters + ).to_arrow() + + assert_eq(expected, read) + + +def test_parquet_reader_unsupported_compression(datadir): + fname = datadir / "hadoop_lz4_compressed.parquet" + + with pytest.raises( + RuntimeError, + match="Unsupported Parquet compression type: LZ4", + ): + cudf.read_parquet(fname) + + +def test_parquet_reader_empty_compressed_page(datadir): + fname = datadir / "empty_datapage_v2.parquet" + + df = cudf.DataFrame({"value": cudf.Series([None], dtype="float32")}) + assert_eq(cudf.read_parquet(fname), df) + + +@pytest.mark.parametrize("compression", ["brotli", "gzip", "snappy", "zstd"]) +def test_parquet_decompression( + set_decomp_env_vars, pdf_day_timestamps, compression +): + # PANDAS returns category objects whereas cuDF returns hashes + expect = pdf_day_timestamps.drop(columns=["col_category"]) + + # Write the DataFrame to a Parquet file + buffer = BytesIO() + expect.to_parquet(buffer, compression=compression) + + # Read the Parquet file back into a DataFrame + got = cudf.read_parquet(buffer) + + assert_eq(expect, got) def test_parquet_long_list(tmp_path): @@ -63,7 +4621,7 @@ def test_parquet_long_list(tmp_path): @pytest.mark.parametrize( "index", - [range(1, 11), list(range(1, 11)), range(1, 11)[::2]], + [range(1, 11), list(range(1, 11)), range(1, 11, 2)], ids=["RangeIndex", "IntIndex", "StridedRange"], ) @pytest.mark.parametrize("write_index", [False, True, None]) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/input_output/test_s3.py similarity index 100% rename from python/cudf/cudf/tests/test_s3.py rename to python/cudf/cudf/tests/input_output/test_s3.py diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py deleted file mode 100644 index b9b0161e6c0..00000000000 --- a/python/cudf/cudf/tests/test_gcs.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - -import io -import os - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - -gcsfs = pytest.importorskip("gcsfs") - -TEST_BUCKET = "cudf-gcs-test-bucket" - - -@pytest.fixture -def pdf(scope="module"): - df = pd.DataFrame() - df["Integer"] = np.array([2345, 11987, 9027, 9027]) - df["Float"] = np.array([9.001, 8.343, 6, 2.781]) - df["Integer2"] = np.array([2345, 106, 2088, 789277]) - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df["Boolean"] = np.array([True, False, True, False]) - return df - - -def test_read_csv(pdf, monkeypatch): - # Write to buffer - fpath = TEST_BUCKET + "test_csv_reader.csv" - buffer = pdf.to_csv(index=False) - - def mock_open(*args, **kwargs): - return io.BytesIO(buffer.encode()) - - def mock_size(*args): - return len(buffer.encode()) - - monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open) - monkeypatch.setattr(gcsfs.core.GCSFileSystem, "size", mock_size) - - # Test read from explicit path. 
- with pytest.warns(FutureWarning): - got = cudf.read_csv(f"gcs://{fpath}") - assert_eq(pdf, got) - - # AbstractBufferedFile -> PythonFile conversion - # will work fine with the monkey-patched FS if we - # pass in an fsspec file object - fs = gcsfs.core.GCSFileSystem() - with fs.open(f"gcs://{fpath}") as f: - got = cudf.read_csv(f) - assert_eq(pdf, got) - - -def test_write_orc(pdf, monkeypatch, tmpdir): - gcs_fname = TEST_BUCKET + "test_orc_writer.orc" - local_filepath = os.path.join(tmpdir, "test_orc.orc") - gdf = cudf.from_pandas(pdf) - - def mock_open(*args, **kwargs): - return open(local_filepath, "wb") - - monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open) - gdf.to_orc(f"gcs://{gcs_fname}") - - got = pd.read_orc(local_filepath) - assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py deleted file mode 100644 index 4921b7b51fc..00000000000 --- a/python/cudf/cudf/tests/test_hdf.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import os -from string import ascii_letters - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq -from cudf.testing._utils import NUMERIC_TYPES, UNSIGNED_TYPES - -pytest.importorskip("tables") - - -@pytest.fixture(params=[0, 1, 10, 100]) -def pdf(request): - types = set([*NUMERIC_TYPES, "datetime64[ns]", "bool"]) - set( - UNSIGNED_TYPES - ) - typer = {"col_" + val: val for val in types} - ncols = len(types) - nrows = request.param - - rng = np.random.default_rng(1) - # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - rng.integers(0, 50, size=(nrows, ncols)), - columns=pd.Index([f"col_{typ}" for typ in types]), - index=pd.RangeIndex(nrows, name="test_index"), - ) - # Cast all the column dtypes to objects, rename them, and then cast to - # appropriate types - test_pdf = test_pdf.astype(typer).rename( - {"col_datetime64[ns]": "col_datetime64"}, axis=1 - ) - - # Create non-numeric categorical data otherwise may be typecasted - data = rng.choice(list(ascii_letters), size=nrows) - test_pdf["col_category"] = pd.Series(data, dtype="category") - - return (test_pdf, nrows) - - -@pytest.fixture -def gdf(pdf): - pdf, nrows = pdf - return (cudf.DataFrame.from_pandas(pdf), nrows) - - -@pytest.fixture(params=["fixed", "table"]) -def hdf_files(request, tmp_path_factory, pdf): - pdf, nrows = pdf - if request.param == "fixed": - pdf = pdf.drop("col_category", axis=1) - - fname_df = tmp_path_factory.mktemp("hdf") / "test_df.hdf" - pdf.to_hdf(fname_df, key="hdf_df_tests", format=request.param) - - fname_series = {} - for column in pdf.columns: - fname_series[column] = ( - tmp_path_factory.mktemp("hdf") / "test_series.hdf" - ) - pdf[column].to_hdf( - fname_series[column], key="hdf_series_tests", format=request.param - ) - return (fname_df, fname_series, request.param, nrows) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.filterwarnings("ignore:Strings are not yet supported") -@pytest.mark.parametrize( - "columns", - [["col_int8"], ["col_category"], ["col_int32", "col_float32"], None], -) -def test_hdf_reader(hdf_files, columns): - hdf_df_file, hdf_series, format, nrows = hdf_files - if format == "fixed" and columns is not None: - pytest.skip("Can't use columns with format 'fixed'") - if format == "table" and nrows == 0: - pytest.skip("Can't read 0 row table with format 'table'") - expect_df = pd.read_hdf(hdf_df_file, columns=columns) - got_df = cudf.read_hdf(hdf_df_file, 
columns=columns) - - assert_eq( - expect_df, got_df, check_categorical=False, check_index_type=False - ) - - for column in hdf_series.keys(): - expect_series = pd.read_hdf(hdf_series[column]) - got_series = cudf.read_hdf(hdf_series[column]) - - assert_eq(expect_series, got_series, check_index_type=False) - - -@pytest.mark.parametrize("format", ["fixed", "table"]) -@pytest.mark.parametrize("complib", ["zlib", "bzip2", "lzo", "blosc"]) -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_hdf_writer(tmpdir, pdf, gdf, complib, format): - pdf, nrows = pdf - if format == "table" and nrows == 0: - pytest.skip("Can't read 0 row table with format 'table'") - gdf, _ = gdf - - if format == "fixed": - pdf = pdf.drop("col_category", axis=1) - gdf = gdf.drop("col_category", axis=1) - - pdf_df_fname = tmpdir.join("pdf_df.hdf") - gdf_df_fname = tmpdir.join("gdf_df.hdf") - - pdf.to_hdf(pdf_df_fname, key="hdf_tests", format=format, complib=complib) - gdf.to_hdf(gdf_df_fname, key="hdf_tests", format=format, complib=complib) - - assert os.path.exists(pdf_df_fname) - assert os.path.exists(gdf_df_fname) - - expect = pd.read_hdf(pdf_df_fname) - got = pd.read_hdf(gdf_df_fname) - - assert_eq(expect, got, check_index_type=False) - - for column in pdf.columns: - pdf_series_fname = tmpdir.join(column + "_" + "pdf_series.hdf") - gdf_series_fname = tmpdir.join(column + "_" + "gdf_series.hdf") - - pdf[column].to_hdf( - pdf_series_fname, key="hdf_tests", format=format, complib=complib - ) - gdf[column].to_hdf( - gdf_series_fname, key="hdf_tests", format=format, complib=complib - ) - - assert os.path.exists(pdf_series_fname) - assert os.path.exists(gdf_series_fname) - - expect_series = pd.read_hdf(pdf_series_fname) - got_series = pd.read_hdf(gdf_series_fname) - - assert_eq(expect_series, got_series, check_index_type=False) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py deleted file mode 100644 index fd5dc4c914c..00000000000 --- a/python/cudf/cudf/tests/test_orc.py +++ /dev/null @@ -1,2054 +0,0 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. - -import datetime -import decimal -import os -import random -from io import BytesIO -from string import ascii_letters, ascii_lowercase - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest -from pyarrow import orc - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.io.orc import ORCWriter -from cudf.testing import assert_eq, assert_frame_equal -from cudf.testing._utils import ( - expect_warning_if, - gen_rand_series, - supported_numpy_dtypes, -) - -# Removal of these deprecated features is no longer imminent. They will not be -# removed until a suitable alternative has been implemented. As a result, we -# also do not want to stop testing them yet. -# https://github.com/rapidsai/cudf/issues/11519 -pytestmark = pytest.mark.filterwarnings( - "ignore:(num_rows|skiprows) is deprecated and will be removed." 
-) - - -@pytest.fixture(scope="module") -def datadir(datadir): - return datadir / "orc" - - -@pytest.fixture -def path_or_buf(datadir): - fname = datadir / "TestOrcFile.test1.orc" - try: - with open(fname, "rb") as f: - buffer = BytesIO(f.read()) - except Exception as excpr: - if type(excpr).__name__ == "FileNotFoundError": - pytest.skip(".parquet file is not found") - raise excpr - - def _make_path_or_buf(src): - if src == "filepath": - return str(fname) - if src == "pathobj": - return fname - if src == "bytes_io": - return buffer - if src == "bytes": - return buffer.getvalue() - if src == "url": - return fname.as_uri() - - raise ValueError("Invalid source type") - - yield _make_path_or_buf - - -@pytest.fixture(scope="module") -def non_nested_pdf(): - rng = np.random.default_rng(seed=0) - types = [ - "bool", - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "datetime64[ns]", - "str", - ] - nrows = 12345 - - # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - { - f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) - for typ in types - }, - ) - - for t in [ - { - "name": "datetime64[ns]", - "nsDivisor": 1000, - "dayModulus": 86400000000, - }, - ]: - data = [ - rng.integers(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) - for i in range(nrows) - ] - - test_pdf["col_" + t["name"]] = pd.Series( - np.asarray(data, dtype=t["name"]) - ) - - # Create non-numeric str data - data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] - test_pdf["col_str"] = pd.Series(data, dtype="str") - - return test_pdf - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["pyarrow", "cudf"]) -@pytest.mark.parametrize("use_index", [False, True]) -@pytest.mark.parametrize( - "inputfile, columns", - [ - ("TestOrcFile.emptyFile.orc", ["boolean1"]), - ( - "TestOrcFile.test1.orc", - [ - "boolean1", - "byte1", - "short1", - "int1", - "long1", - "float1", - "double1", - ], - ), - ("TestOrcFile.RLEv2.orc", ["x", "y"]), - ("TestOrcFile.testSnappy.orc", None), - ("TestOrcFile.demo-12-zlib.orc", ["_col2", "_col3", "_col4", "_col5"]), - ], -) -def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): - path = datadir / inputfile - - expect = pd.read_orc(path, columns=columns) - got = cudf.read_orc( - path, engine=engine, columns=columns, use_index=use_index - ) - - assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False) - - -def test_orc_reader_filenotfound(tmpdir): - with pytest.raises(FileNotFoundError): - cudf.read_orc("TestMissingFile.orc") - - with pytest.raises(FileNotFoundError): - cudf.read_orc(tmpdir.mkdir("cudf_orc")) - - -def test_orc_reader_local_filepath(): - path = "~/TestLocalFile.orc" - if not os.path.isfile(path): - pytest.skip("Local .orc file is not found") - - cudf.read_orc(path) - - -@pytest.mark.parametrize( - "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] -) -def test_orc_reader_filepath_or_buffer(path_or_buf, src): - cols = ["int1", "long1", "float1", "double1"] - - expect = pd.read_orc(path_or_buf("filepath"), columns=cols) - got = cudf.read_orc(path_or_buf(src), columns=cols) - - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Bug in older version of pandas", -) -def test_orc_reader_trailing_nulls(datadir): - path = datadir / "TestOrcFile.nulls-at-end-snappy.orc" - expect = pd.read_orc(path) - got = cudf.read_orc(path) - - assert_eq(expect, got, check_categorical=True) - - 
-@pytest.mark.parametrize("use_index", [False, True]) -@pytest.mark.parametrize( - "inputfile", - ["TestOrcFile.testDate1900.orc", "TestOrcFile.testDate2038.orc"], -) -def test_orc_reader_datetimestamp(datadir, inputfile, use_index): - from pyarrow import orc - - path = datadir / inputfile - try: - orcfile = orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - - pdf = orcfile.read().to_pandas(date_as_object=False) - gdf = cudf.read_orc(path, use_index=use_index) - - assert_eq(pdf, gdf, check_categorical=False, check_exact=False) - - -def test_orc_reader_strings(datadir): - path = datadir / "TestOrcFile.testStringAndBinaryStatistics.orc" - - expect = pd.read_orc(path, columns=["string1"]) - got = cudf.read_orc(path, columns=["string1"]) - - assert_eq(expect, got, check_categorical=False) - - -def test_orc_read_statistics(datadir): - # Read in file containing 2 columns ("int1" and "string1") and 3 stripes - # (sizes 5000, 5000 and 1000 respectively). Each stripe has the same value - # in every one of its rows. The values the stripes have are 1, 2, and 3 in - # "int1" and "one", "two", and "three" in "string1". - path = datadir / "TestOrcFile.testStripeLevelStats.orc" - try: - ( - file_statistics, - stripes_statistics, - ) = cudf.io.orc.read_orc_statistics([path, path]) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - - # Check numberOfValues - assert_eq(file_statistics[0]["int1"].number_of_values, 11_000) - assert_eq( - file_statistics[0]["int1"].number_of_values, - sum( - [ - stripes_statistics[0]["int1"].number_of_values, - stripes_statistics[1]["int1"].number_of_values, - stripes_statistics[2]["int1"].number_of_values, - ] - ), - ) - assert_eq( - stripes_statistics[1]["int1"].number_of_values, - stripes_statistics[1]["string1"].number_of_values, - ) - assert_eq(stripes_statistics[2]["string1"].number_of_values, 1_000) - - # Check other statistics - assert_eq(stripes_statistics[2]["string1"].has_null, False) - assert_eq( - file_statistics[0]["int1"]["minimum"], - min( - stripes_statistics[0]["int1"]["minimum"], - stripes_statistics[1]["int1"]["minimum"], - stripes_statistics[2]["int1"]["minimum"], - ), - ) - assert_eq(file_statistics[0]["int1"]["minimum"], 1) - assert_eq(file_statistics[0]["string1"]["minimum"], "one") - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) -@pytest.mark.parametrize( - "predicate,expected_len", - [ - ([[("int1", "==", 1)]], 5000), - ([[("int1", "<=", 2)]], 10000), - ([[("int1", "==", -1)]], 0), - ([[("int1", "in", range(3))]], 10000), - ([[("int1", "in", {1, 3})]], 6000), - ([[("int1", "not in", {1, 3})]], 5000), - ], -) -def test_orc_read_filtered(datadir, engine, predicate, expected_len): - path = datadir / "TestOrcFile.testStripeLevelStats.orc" - try: - df_filtered = cudf.read_orc(path, engine=engine, filters=predicate) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - - # Assert # of rows after filtering - assert len(df_filtered) == expected_len - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) -def test_orc_read_stripes(datadir, engine): - path = datadir / "TestOrcFile.testDate1900.orc" - try: - pdf = cudf.read_orc(path, engine=engine) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - - num_rows, stripes, col_names = cudf.io.read_orc_metadata(path) - - # Read stripes one at a time - gdf = [ - 
cudf.read_orc(path, engine=engine, stripes=[[i]]) - for i in range(stripes) - ] - gdf = cudf.concat(gdf).reset_index(drop=True) - assert_eq(pdf, gdf, check_categorical=False, check_index_type=True) - - # Read stripes all at once - gdf = cudf.read_orc( - path, engine=engine, stripes=[[int(x) for x in range(stripes)]] - ) - assert_eq(pdf, gdf, check_categorical=False) - - # Read only some stripes - gdf = cudf.read_orc(path, engine=engine, stripes=[[0, 1]]) - assert_eq(gdf, pdf.head(25000)) - gdf = cudf.read_orc(path, engine=engine, stripes=[[0, stripes - 1]]) - assert_eq( - gdf, - cudf.concat([pdf.head(15000), pdf.tail(10000)], ignore_index=True), - check_index_type=True, - ) - - -@pytest.mark.parametrize("num_rows", [1, 100, 3000]) -@pytest.mark.parametrize("skiprows", [0, 1, 3000]) -def test_orc_read_rows(datadir, skiprows, num_rows): - path = datadir / "TestOrcFile.decimal.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path, skiprows=skiprows, num_rows=num_rows) - - # Slice rows out of the whole dataframe for comparison as PyArrow doesn't - # have an API to read a subsection of rows from the file - pdf = pdf[skiprows : skiprows + num_rows] - pdf = pdf.reset_index(drop=True) - - assert_eq(pdf, gdf) - - -def test_orc_read_skiprows(): - buff = BytesIO() - df = pd.DataFrame( - { - "a": [ - True, - False, - True, - False, - None, - True, - True, - True, - False, - None, - False, - False, - True, - True, - True, - True, - ] - } - ) - df.to_orc(buff) - # testing 10 skiprows due to a boolean specific bug fix that didn't - # repro for other sizes of data - skiprows = 10 - - expected = ( - pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") - ) - got = cudf.read_orc(buff, skiprows=skiprows) - assert_eq(expected, got) - - -def test_orc_reader_uncompressed_block(datadir): - path = datadir / "uncompressed_snappy.orc" - - expect = pd.read_orc(path) - got = cudf.read_orc(path) - - assert_eq(expect, got, check_categorical=False) - - -def test_orc_reader_nodata_block(datadir): - path = datadir / "nodata.orc" - - expect = pd.read_orc(path) - got = cudf.read_orc(path, num_rows=1) - - assert_eq(expect, got, check_categorical=False) - - -@pytest.mark.parametrize("compression", [None, "snappy"]) -@pytest.mark.parametrize( - "reference_file, columns", - [ - ( - "TestOrcFile.test1.orc", - [ - "boolean1", - "byte1", - "short1", - "int1", - "long1", - "float1", - "double1", - ], - ), - ("TestOrcFile.demo-12-zlib.orc", ["_col1", "_col3", "_col5"]), - ], -) -def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): - pdf_fname = datadir / reference_file - gdf_fname = tmpdir.join("gdf.orc") - - expect = cudf.from_pandas(pd.read_orc(pdf_fname, columns=columns)) - expect.to_orc(gdf_fname.strpath, compression=compression) - got = cudf.from_pandas(pd.read_orc(gdf_fname, columns=columns)) - - assert_frame_equal(expect, got) - - -@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) -def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): - reference_file = "TestOrcFile.demo-12-zlib.orc" - pdf_fname = datadir / reference_file - gdf_fname = tmpdir.join("gdf.orc") - - expect = cudf.from_pandas(pd.read_orc(pdf_fname)) - expect.to_orc(gdf_fname.strpath, statistics=stats_freq) - got = cudf.from_pandas(pd.read_orc(gdf_fname)) - - assert_frame_equal(expect, got) - - -@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) -def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): - reference_file = "TestOrcFile.test1.orc" 
- pdf_fname = datadir / reference_file - gdf_fname = tmpdir.join("chunked_gdf.orc") - - columns = [ - "boolean1", - "byte1", - "short1", - "int1", - "long1", - "float1", - "double1", - ] - pdf = pd.read_orc(pdf_fname, columns=columns) - gdf = cudf.from_pandas(pdf) - expect = pd.concat([pdf, pdf]).reset_index(drop=True) - - writer = ORCWriter(gdf_fname, statistics=stats_freq) - writer.write_table(gdf) - writer.write_table(gdf) - writer.close() - - got = pd.read_orc(gdf_fname) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("compression", [None, "snappy"]) -@pytest.mark.parametrize( - "reference_file, columns", - [ - ( - "TestOrcFile.test1.orc", - [ - "boolean1", - "byte1", - "short1", - "int1", - "long1", - "float1", - "double1", - ], - ), - ("TestOrcFile.demo-12-zlib.orc", ["_col1", "_col3", "_col5"]), - ], -) -def test_chunked_orc_writer( - datadir, tmpdir, reference_file, columns, compression -): - pdf_fname = datadir / reference_file - gdf_fname = tmpdir.join("chunked_gdf.orc") - - pdf = pd.read_orc(pdf_fname, columns=columns) - gdf = cudf.from_pandas(pdf) - expect = pd.concat([pdf, pdf]).reset_index(drop=True) - - writer = ORCWriter(gdf_fname, compression=compression) - writer.write_table(gdf) - writer.write_table(gdf) - writer.close() - - got = pd.read_orc(gdf_fname, columns=columns) - assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) - - -@pytest.mark.parametrize( - "dtypes", - [ - {"c": str, "a": int}, - {"c": int, "a": str}, - {"c": int, "a": str, "b": float}, - {"c": str, "a": object}, - ], -) -def test_orc_writer_strings(tmpdir, dtypes): - gdf_fname = tmpdir.join("gdf_strings.orc") - - expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) - expect.to_orc(gdf_fname) - got = pd.read_orc(gdf_fname) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "dtypes", - [ - {"c": str, "a": int}, - {"c": int, "a": str}, - {"c": int, "a": str, "b": float}, - {"c": str, "a": object}, - ], -) -def test_chunked_orc_writer_strings(tmpdir, dtypes): - gdf_fname = tmpdir.join("chunked_gdf_strings.orc") - - gdf = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) - pdf = gdf.to_pandas() - expect = pd.concat([pdf, pdf]).reset_index(drop=True) - writer = ORCWriter(gdf_fname) - writer.write_table(gdf) - writer.write_table(gdf) - writer.close() - - got = pd.read_orc(gdf_fname) - - assert_eq(expect, got) - - -def test_orc_writer_sliced(tmpdir): - cudf_path = tmpdir.join("cudf.orc") - - df = pd.DataFrame() - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df = cudf.from_pandas(df) - - df_select = df.iloc[1:3] - - df_select.to_orc(cudf_path) - assert_eq(cudf.read_orc(cudf_path), df_select) - - -@pytest.mark.parametrize( - "orc_file", - [ - "TestOrcFile.decimal.orc", - "TestOrcFile.decimal.same.values.orc", - "TestOrcFile.decimal.multiple.values.orc", - # For additional information take a look at PR 7034 - "TestOrcFile.decimal.runpos.issue.orc", - ], -) -def test_orc_reader_decimal_type(datadir, orc_file): - file_path = datadir / orc_file - - pdf = pd.read_orc(file_path) - df = cudf.read_orc(file_path) - - assert_eq(pdf, df) - - -def test_orc_decimal_precision_fail(datadir): - file_path = datadir / "TestOrcFile.int_decimal.precision_19.orc" - - # Shouldn't cause failure if decimal column is not chosen to be read.
- pdf = pd.read_orc(file_path, columns=["int"]) - gdf = cudf.read_orc(file_path, columns=["int"]) - - assert_eq(pdf, gdf) - - -# For additional information take a look at PRs 6636 and 6702 -@pytest.mark.parametrize( - "orc_file", - [ - "TestOrcFile.boolean_corruption_PR_6636.orc", - "TestOrcFile.boolean_corruption_PR_6702.orc", - ], -) -def test_orc_reader_boolean_type(datadir, orc_file): - file_path = datadir / orc_file - - pdf = pd.read_orc(file_path) - df = cudf.read_orc(file_path).to_pandas() - - assert_eq(pdf, df) - - -def test_orc_reader_tzif_timestamps(datadir): - # Contains timestamps in the range covered by the TZif file - # Other datetime tests only cover "future" times - path = datadir / "TestOrcFile.lima_timezone.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - - assert_eq(pdf, gdf) - - -def test_int_overflow(tmpdir): - file_path = tmpdir.join("gdf_overflow.orc") - - # The number of rows and the large element trigger delta encoding - num_rows = 513 - df = cudf.DataFrame({"a": [None] * num_rows}, dtype="int64") - df["a"][0] = 1024 * 1024 * 1024 - df["a"][num_rows - 1] = 1 - df.to_orc(file_path) - - assert_eq(cudf.read_orc(file_path), df) - - -def normalized_equals(value1, value2): - # need naive time object for numpy to convert to datetime64 - if isinstance(value1, datetime.datetime): - value1 = value1.replace(tzinfo=None) - if isinstance(value2, datetime.datetime): - value2 = value2.replace(tzinfo=None) - - if isinstance(value1, (datetime.datetime, np.datetime64)): - value1 = np.datetime64(value1, "ms") - if isinstance(value2, (datetime.datetime, np.datetime64)): - value2 = np.datetime64(value2, "ms") - - # Compare integers with floats now - if isinstance(value1, float) or isinstance(value2, float): - return np.isclose(value1, value2) - - return value1 == value2 - - -@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) -@pytest.mark.parametrize("nrows", [1, 100, 100000]) -def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): - from pyarrow import orc - - supported_stat_types = [*supported_numpy_dtypes, "str"] - # Writing bool columns to multiple row groups is disabled - # until #6763 is fixed - if nrows == 100000: - supported_stat_types.remove("bool") - - # Make a dataframe - gdf = cudf.DataFrame( - { - "col_" + str(dtype): gen_rand_series(dtype, nrows, has_nulls=True) - for dtype in supported_stat_types - } - ) - fname = tmpdir.join("gdf.orc") - - # Write said dataframe to ORC with cuDF - gdf.to_orc(fname.strpath, statistics=stats_freq, stripe_size_rows=30000) - - # Read back written ORC's statistics - orc_file = orc.ORCFile(fname) - ( - file_stats, - stripes_stats, - ) = cudf.io.orc.read_orc_statistics([fname]) - - # check file stats - for col in gdf: - if "minimum" in file_stats[0][col]: - stats_min = file_stats[0][col]["minimum"] - if stats_min is not None: - actual_min = gdf[col].min() - assert normalized_equals(actual_min, stats_min) - if "maximum" in file_stats[0][col]: - stats_max = file_stats[0][col]["maximum"] - if stats_max is not None: - actual_max = gdf[col].max() - assert normalized_equals(actual_max, stats_max) - if "number_of_values" in file_stats[0][col]: - stats_num_vals = file_stats[0][col]["number_of_values"] - if stats_num_vals is not None: - actual_num_vals = gdf[col].count() - assert stats_num_vals == actual_num_vals - - # compare stripe statistics with actual min/max - for stripe_idx in range(0, orc_file.nstripes): - stripe = orc_file.read_stripe(stripe_idx) - # pandas is unable to handle min/max of string col with nulls
- stripe_df = cudf.DataFrame(stripe.to_pandas()) - for col in stripe_df: - if "minimum" in stripes_stats[stripe_idx][col]: - stats_min = stripes_stats[stripe_idx][col]["minimum"] - if stats_min is not None: - actual_min = stripe_df[col].min() - assert normalized_equals(actual_min, stats_min) - - if "maximum" in stripes_stats[stripe_idx][col]: - stats_max = stripes_stats[stripe_idx][col]["maximum"] - if stats_max is not None: - actual_max = stripe_df[col].max() - assert normalized_equals(actual_max, stats_max) - - if "number_of_values" in stripes_stats[stripe_idx][col]: - stats_num_vals = stripes_stats[stripe_idx][col][ - "number_of_values" - ] - if stats_num_vals is not None: - actual_num_vals = stripe_df[col].count() - assert stats_num_vals == actual_num_vals - - -@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) -@pytest.mark.parametrize("nrows", [2, 100, 1024]) -def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): - from pyarrow import orc - - supported_stat_types = [*supported_numpy_dtypes, "str"] - # Writing bool columns to multiple row groups is disabled - # until #6763 is fixed - if nrows == 1024: - supported_stat_types.remove("bool") - - gdf_fname = tmpdir.join("chunked_stats.orc") - writer = ORCWriter(gdf_fname, statistics=stats_freq, stripe_size_rows=512) - - max_char_length = 100 if nrows < 1000 else 10 - - # Make a dataframe - gdf = cudf.DataFrame( - { - "col_" + str(dtype): gen_rand_series( - dtype, - nrows // 2, - has_nulls=True, - low=0, - high=max_char_length, - seed=0, - ) - for dtype in supported_stat_types - } - ) - - pdf1 = gdf.to_pandas() - writer.write_table(gdf) - # gdf is deliberately reassigned here so its data is destroyed before - # the next write_table call; this verifies that the writer persists the - # data internally and saves no pointers into the original table - gdf = cudf.DataFrame( - { - "col_" + str(dtype): gen_rand_series( - dtype, - nrows // 2, - has_nulls=True, - low=0, - high=max_char_length, - ) - for dtype in supported_stat_types - } - ) - pdf2 = gdf.to_pandas() - writer.write_table(gdf) - writer.close() - - # pandas is unable to handle min/max of string col with nulls - expect = cudf.DataFrame(pd.concat([pdf1, pdf2]).reset_index(drop=True)) - - # Read back written ORC's statistics - orc_file = orc.ORCFile(gdf_fname) - ( - file_stats, - stripes_stats, - ) = cudf.io.orc.read_orc_statistics([gdf_fname]) - - # check file stats - for col in expect: - if "minimum" in file_stats[0][col]: - stats_min = file_stats[0][col]["minimum"] - if stats_min is not None: - actual_min = expect[col].min() - assert normalized_equals(actual_min, stats_min) - if "maximum" in file_stats[0][col]: - stats_max = file_stats[0][col]["maximum"] - if stats_max is not None: - actual_max = expect[col].max() - assert normalized_equals(actual_max, stats_max) - if "number_of_values" in file_stats[0][col]: - stats_num_vals = file_stats[0][col]["number_of_values"] - if stats_num_vals is not None: - actual_num_vals = expect[col].count() - assert stats_num_vals == actual_num_vals - - # compare stripe statistics with actual min/max - for stripe_idx in range(0, orc_file.nstripes): - stripe = orc_file.read_stripe(stripe_idx) - # pandas is unable to handle min/max of string col with nulls - stripe_df = cudf.DataFrame(stripe.to_pandas()) - for col in stripe_df: - if "minimum" in stripes_stats[stripe_idx][col]: - stats_min = stripes_stats[stripe_idx][col]["minimum"] - if stats_min is not None: - actual_min = stripe_df[col].min() - assert
normalized_equals(actual_min, stats_min) - - if "maximum" in stripes_stats[stripe_idx][col]: - stats_max = stripes_stats[stripe_idx][col]["maximum"] - if stats_max is not None: - actual_max = stripe_df[col].max() - assert normalized_equals(actual_max, stats_max) - - if "number_of_values" in stripes_stats[stripe_idx][col]: - stats_num_vals = stripes_stats[stripe_idx][col][ - "number_of_values" - ] - if stats_num_vals is not None: - actual_num_vals = stripe_df[col].count() - assert stats_num_vals == actual_num_vals - - -@pytest.mark.parametrize("nrows", [1, 100, 100000]) -def test_orc_write_bool_statistics(tmpdir, datadir, nrows): - from pyarrow import orc - - # Make a dataframe - gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)}) - fname = tmpdir.join("gdf.orc") - - # Write said dataframe to ORC with cuDF - gdf.to_orc(fname.strpath, stripe_size_rows=30000) - - # Read back written ORC's statistics - orc_file = orc.ORCFile(fname) - ( - file_stats, - stripes_stats, - ) = cudf.io.orc.read_orc_statistics([fname]) - - # check file stats - col = "col_bool" - if "true_count" in file_stats[0][col]: - stats_true_count = file_stats[0][col]["true_count"] - actual_true_count = gdf[col].sum() - assert normalized_equals(actual_true_count, stats_true_count) - - if "number_of_values" in file_stats[0][col]: - stats_valid_count = file_stats[0][col]["number_of_values"] - actual_valid_count = len(gdf[col]) - gdf[col].null_count - assert normalized_equals(actual_valid_count, stats_valid_count) - - # compare stripe statistics with actual min/max - for stripe_idx in range(0, orc_file.nstripes): - stripe = orc_file.read_stripe(stripe_idx) - # pandas is unable to handle min/max of string col with nulls - stripe_df = cudf.DataFrame(stripe.to_pandas()) - - if "true_count" in stripes_stats[stripe_idx][col]: - actual_true_count = stripe_df[col].sum() - stats_true_count = stripes_stats[stripe_idx][col]["true_count"] - assert normalized_equals(actual_true_count, stats_true_count) - - if "number_of_values" in stripes_stats[stripe_idx][col]: - actual_valid_count = ( - len(stripe_df[col]) - stripe_df[col].null_count - ) - stats_valid_count = stripes_stats[stripe_idx][col][ - "number_of_values" - ] - assert normalized_equals(actual_valid_count, stats_valid_count) - - -def test_orc_reader_gmt_timestamps(datadir): - path = datadir / "TestOrcFile.gmt.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - assert_eq(pdf, gdf) - - -def test_orc_bool_encode_fail(): - buffer = BytesIO() - - # Generate a boolean column longer than a single row group - fail_df = cudf.DataFrame({"col": gen_rand_series("bool", 20000)}) - # Invalidate a row in the first row group - fail_df["col"][5000] = None - - # Should throw instead of generating a file that is incompatible - # with other readers (see issue #6763) - with pytest.raises(RuntimeError): - fail_df.to_orc(buffer) - - # Generate a boolean column longer than a single row group - okay_df = cudf.DataFrame({"col": gen_rand_series("bool", 20000)}) - okay_df["col"][15000] = None - # Invalid row is in the last row group; encoding is assumed to be correct - okay_df.to_orc(buffer) - - # Also validate data - pdf = pd.read_orc(buffer) - - assert_eq(okay_df.to_pandas(nullable=True), pdf) - - -def test_nanoseconds_overflow(): - buffer = BytesIO() - # Use nanosecond values that take more than 32 bits to encode - s = cudf.Series([710424008, -1338482640], dtype="datetime64[ns]") - expected = cudf.DataFrame({"s": s}) - expected.to_orc(buffer) - - cudf_got = cudf.read_orc(buffer) - 
assert_eq(expected, cudf_got) - - pandas_got = pd.read_orc(buffer) - assert_eq(expected, pandas_got) - - -def test_empty_dataframe(): - buffer = BytesIO() - expected = cudf.DataFrame() - expected.to_orc(buffer) - - # Raise error if column name is mentioned, but it doesn't exist. - with pytest.raises(RuntimeError): - cudf.read_orc(buffer, columns=["a"]) - - got_df = cudf.read_orc(buffer) - expected_pdf = pd.read_orc(buffer) - - assert_eq(expected, got_df) - assert_eq(expected_pdf, got_df) - - -@pytest.mark.parametrize( - "data", [[None, ""], ["", None], [None, None], ["", ""]] -) -def test_empty_string_columns(data): - buffer = BytesIO() - - expected = cudf.DataFrame({"string": data}, dtype="str") - expected.to_orc(buffer) - - expected_pdf = pd.read_orc(buffer) - got_df = cudf.read_orc(buffer) - - assert_eq(expected, got_df) - assert_eq( - expected_pdf, - got_df.to_pandas(nullable=True) - if expected_pdf["string"].dtype == pd.StringDtype() - else got_df, - ) - - -@pytest.mark.parametrize("scale", [-3, 0, 3]) -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_orc_writer_decimal(tmpdir, scale, decimal_type): - fname = tmpdir / "decimal.orc" - - expected = cudf.DataFrame({"dec_val": gen_rand_series("i", 100)}) - expected["dec_val"] = expected["dec_val"].astype(decimal_type(7, scale)) - - expected.to_orc(fname) - - got = pd.read_orc(fname) - assert_eq(expected.to_pandas()["dec_val"], got["dec_val"]) - - -@pytest.mark.parametrize("num_rows", [1, 100, 3000]) -def test_orc_reader_multiple_files(datadir, num_rows): - path = datadir / "TestOrcFile.testSnappy.orc" - - df_1 = pd.read_orc(path) - df_2 = pd.read_orc(path) - df = pd.concat([df_1, df_2], ignore_index=True) - - gdf = cudf.read_orc([path, path], num_rows=num_rows).to_pandas() - - # Slice rows out of the whole dataframe for comparison as PyArrow doesn't - # have an API to read a subsection of rows from the file - df = df[:num_rows] - df = df.reset_index(drop=True) - - assert_eq(df, gdf) - - -def test_orc_reader_multi_file_single_stripe(datadir): - path = datadir / "TestOrcFile.testSnappy.orc" - - # should raise an exception - with pytest.raises(ValueError): - cudf.read_orc([path, path], stripes=[0]) - - -def test_orc_reader_multi_file_multi_stripe(datadir): - path = datadir / "TestOrcFile.testStripeLevelStats.orc" - gdf = cudf.read_orc([path, path], stripes=[[0, 1], [2]]) - pdf = pd.read_orc(path) - assert_eq(pdf, gdf) - - -def test_orc_string_stream_offset_issue(): - size = 30000 - vals = { - str(x): [decimal.Decimal(1)] * size if x != 0 else ["XYZ"] * size - for x in range(0, 5) - } - df = cudf.DataFrame(vals) - - buffer = BytesIO() - df.to_orc(buffer) - - assert_eq(df, cudf.read_orc(buffer)) - - -def generate_list_struct_buff(size=10_000): - rd = random.Random(1) - rng = np.random.default_rng(seed=1) - - buff = BytesIO() - - lvl3_list = [ - rd.choice( - [ - None, - [ - [ - [ - rd.choice([None, rng.integers(1, 3)]) - for _ in range(rng.integers(1, 3)) - ] - for _ in range(rng.integers(0, 3)) - ] - for _ in range(rng.integers(0, 3)) - ], - ] - ) - for _ in range(size) - ] - lvl1_list = [ - [ - rd.choice([None, rng.integers(0, 3)]) - for _ in range(rng.integers(1, 4)) - ] - for _ in range(size) - ] - lvl1_struct = [ - rd.choice( - [ - None, - {"a": rng.integers(0, 3), "b": rng.integers(0, 3)}, - ] - ) - for _ in range(size) - ] - lvl2_struct = [ - rd.choice( - [ - None, - {"a": rd.choice([None, rng.integers(0, 3)])}, - { - "lvl1_struct": { - "c": rd.choice([None, 
rng.integers(0, 3)]), - "d": rng.integers(0, 3), - }, - }, - ] - ) - for _ in range(size) - ] - list_nests_struct = [ - [ - {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} - for _ in range(rng.integers(1, 4)) - ] - for _ in range(size) - ] - struct_nests_list = [ - {"struct": lvl1_struct[x], "list": lvl1_list[x]} for x in range(size) - ] - - pa_table = pa.table( - { - "lvl3_list": lvl3_list, - "lvl1_list": lvl1_list, - "lvl1_struct": lvl1_struct, - "lvl2_struct": lvl2_struct, - "list_nests_struct": list_nests_struct, - "struct_nests_list": struct_nests_list, - } - ) - with orc.ORCWriter(buff, stripe_size=1024) as writer: - writer.write(pa_table) - return buff - - -@pytest.fixture(scope="module") -def list_struct_buff(): - return generate_list_struct_buff() - - -@pytest.mark.parametrize( - "columns", - [ - None, - ["lvl3_list", "list_nests_struct", "lvl2_struct", "struct_nests_list"], - ["lvl2_struct", "lvl1_struct"], - ], -) -@pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 10_000]) -@pytest.mark.parametrize("use_index", [True, False]) -def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): - from pyarrow import orc - - gdf = cudf.read_orc( - list_struct_buff, - columns=columns, - num_rows=num_rows, - use_index=use_index, - ) - - pyarrow_tbl = orc.ORCFile(list_struct_buff).read() - - pyarrow_tbl = ( - pyarrow_tbl[:num_rows] - if columns is None - else pyarrow_tbl.select(columns)[:num_rows] - ) - - if num_rows > 0: - assert pyarrow_tbl.equals(gdf.to_arrow()) - else: - assert_eq(pyarrow_tbl.to_pandas(), gdf) - - -@pytest.mark.parametrize("columns", [None, ["lvl1_struct"], ["lvl1_list"]]) -def test_skip_rows_for_nested_types(columns, list_struct_buff): - with pytest.raises( - RuntimeError, match="skip_rows is not supported by nested column" - ): - cudf.read_orc( - list_struct_buff, - columns=columns, - use_index=True, - skiprows=5, - ) - - -def test_pyspark_struct(datadir): - path = datadir / "TestOrcFile.testPySparkStruct.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - - assert_eq(pdf, gdf) - - -def gen_map_buff(size): - from string import ascii_letters as al - - from pyarrow import orc - - rd = random.Random(1) - rng = np.random.default_rng(seed=1) - - buff = BytesIO() - - lvl1_map = pa.array( - [ - rd.choice( - [ - None, - { - rd.choice(al): rd.choice( - [None, rng.integers(1, 1500)] - ), - }, - ] - ) - for _ in range(size) - ], - type=pa.map_(pa.string(), pa.int64()), - ) - lvl2_map = pa.array( - [ - rd.choice( - [ - None, - *( - { - rd.choice(al): rd.choice( - [ - None, - [ - rd.choice( - [None, rng.integers(1, 1500)] - ) - for _ in range(5) - ], - ] - ) - } - for _ in range(2) - ), - ] - ) - for _ in range(size) - ], - type=pa.map_(pa.string(), pa.list_(pa.int64())), - ) - lvl2_struct_map = pa.array( - [ - rd.choice( - [ - None, - *( - { - rd.choice(al): rd.choice( - [ - None, - { - "a": rd.choice( - [None, rng.integers(1, 1500)] - ), - "b": rd.choice( - [None, rng.integers(1, 1500)] - ), - }, - ] - ) - } - for _ in range(2) - ), - ] - ) - for _ in range(size) - ], - type=pa.map_( - pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) - ), - ) - - pa_table = pa.Table.from_arrays( - [lvl1_map, lvl2_map, lvl2_struct_map], - ["lvl1_map", "lvl2_map", "lvl2_struct_map"], - ) - - orc.write_table( - pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" - ) - - return buff - - -map_buff = gen_map_buff(size=100000) - - -@pytest.mark.parametrize( - "columns", - [None, ["lvl1_map", "lvl2_struct_map"], ["lvl2_struct_map", 
"lvl2_map"]], -) -@pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100000]) -@pytest.mark.parametrize("use_index", [True, False]) -def test_map_type_read(columns, num_rows, use_index): - from pyarrow import orc - - tbl = orc.read_table(map_buff) - - lvl1_map = ( - tbl["lvl1_map"] - .combine_chunks() - .view(pa.list_(pa.struct({"key": pa.string(), "value": pa.int64()}))) - ) - lvl2_map = ( - tbl["lvl2_map"] - .combine_chunks() - .view( - pa.list_( - pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())}) - ) - ) - ) - lvl2_struct_map = ( - tbl["lvl2_struct_map"] - .combine_chunks() - .view( - pa.list_( - pa.struct( - { - "key": pa.string(), - "value": pa.struct({"a": pa.int64(), "b": pa.int64()}), - } - ) - ) - ) - ) - - expected_tbl = pa.table( - { - "lvl1_map": lvl1_map, - "lvl2_map": lvl2_map, - "lvl2_struct_map": lvl2_struct_map, - } - ) - gdf = cudf.read_orc( - map_buff, columns=columns, num_rows=num_rows, use_index=use_index - ) - - expected_tbl = ( - expected_tbl[:num_rows] - if columns is None - else expected_tbl.select(columns)[:num_rows] - ) - - if num_rows > 0: - assert expected_tbl.equals(gdf.to_arrow()) - else: - assert_eq(expected_tbl.to_pandas(), gdf) - - -def test_orc_reader_decimal(datadir): - path = datadir / "TestOrcFile.decimal.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - - assert_eq(pdf, gdf) - - -# This test case validates the issue raised in #8665, -# please check the issue for more details. -def test_orc_timestamp_read(datadir): - path = datadir / "TestOrcFile.timestamp.issue.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - - assert_eq(pdf, gdf) - - -def dec(num): - return decimal.Decimal(str(num)) - - -@pytest.mark.parametrize( - "data", - [ - # basic + nested strings - { - "lls": [[["a"], ["bb"]] * 5 for i in range(12345)], - "lls2": [[["ccc", "dddd"]] * 6 for i in range(12345)], - "ls_dict": [["X"] * 7 for i in range(12345)], - "ls_direct": [[str(i)] * 9 for i in range(12345)], - "li": [[i] * 11 for i in range(12345)], - "lf": [[i * 0.5] * 13 for i in range(12345)], - "ld": [[dec(i / 2)] * 15 for i in range(12345)], - }, - # with nulls - { - "ls": [ - [str(i) if i % 5 else None, str(2 * i)] if i % 2 else None - for i in range(12345) - ], - "li": [[i, i * i, i % 2] if i % 3 else None for i in range(12345)], - "ld": [ - [dec(i), dec(i / 2) if i % 7 else None] if i % 5 else None - for i in range(12345) - ], - }, - # with empty elements - { - "ls": [ - [str(i), str(2 * i)] if i % 2 else [] for i in range(12345) - ], - "lls": [ - [[str(i), str(2 * i)]] if i % 2 else [[], []] - for i in range(12345) - ], - "li": [[i, i * i, i % 2] if i % 3 else [] for i in range(12345)], - "lli": [ - [[i], [i * i], [i % 2]] if i % 3 else [[]] - for i in range(12345) - ], - "ld": [ - [dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345) - ], - }, - # variable list lengths - { - "ls": [[str(i)] * i for i in range(123)], - "li": [[i, i * i] * i for i in range(123)], - "ld": [[dec(i), dec(i / 2)] * i for i in range(123)], - }, - # many child elements (more that max_stripe_rows) - {"li": [[i] * 1100 for i in range(11000)]}, - ], -) -def test_orc_writer_lists(data): - buffer = BytesIO() - cudf.DataFrame(data).to_orc( - buffer, stripe_size_rows=2048, row_index_stride=512 - ) - # Read in as pandas but compare with pyarrow - # since pandas doesn't have a list type - pa_out = pa.Table.from_pandas(pd.read_orc(buffer)) - pa_in = pa.table(data) - assert pa_out.equals(pa_in) - - -def test_chunked_orc_writer_lists(): - num_rows = 12345 - pdf_in = 
pd.DataFrame( - { - "ls": [[str(i), str(2 * i)] for i in range(num_rows)], - "ld": [[dec(i / 2)] * 5 for i in range(num_rows)], - } - ) - - gdf = cudf.from_pandas(pdf_in) - expect = pd.concat([pdf_in, pdf_in]).reset_index(drop=True) - - buffer = BytesIO() - writer = ORCWriter(buffer) - writer.write_table(gdf) - writer.write_table(gdf) - writer.close() - - got = pd.read_orc(buffer) - assert_eq(expect, got) - - -def test_writer_timestamp_stream_size(datadir, tmpdir): - pdf_fname = datadir / "TestOrcFile.largeTimestamps.orc" - gdf_fname = tmpdir.join("gdf.orc") - - expect = pd.read_orc(pdf_fname) - cudf.from_pandas(expect).to_orc(gdf_fname.strpath) - got = pd.read_orc(gdf_fname) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "fname", - [ - "TestOrcFile.NoIndStrm.StructWithNoNulls.orc", - "TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc", - "TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc", - "TestOrcFile.NoIndStrm.IntWithNulls.orc", - ], -) -def test_no_row_group_index_orc_read(datadir, fname): - from pyarrow import orc - - fpath = datadir / fname - - expect = orc.ORCFile(fpath).read() - got = cudf.read_orc(fpath) - - assert expect.equals(got.to_arrow()) - - -def test_names_in_struct_dtype_nesting(datadir): - from pyarrow import orc - - fname = datadir / "TestOrcFile.NestedStructDataFrame.orc" - - expect = orc.ORCFile(fname).read() - got = cudf.read_orc(fname) - - # test dataframes - assert expect.equals(got.to_arrow()) - - edf = cudf.DataFrame(expect.to_pandas()) - # test schema - assert edf.dtypes.equals(got.dtypes) - - -def test_writer_lists_structs(list_struct_buff): - from pyarrow import orc - - df_in = cudf.read_orc(list_struct_buff) - - buff = BytesIO() - df_in.to_orc(buff) - - pyarrow_tbl = orc.ORCFile(buff).read() - - assert pyarrow_tbl.equals(df_in.to_arrow()) - - -@pytest.mark.parametrize( - "data", - [ - { - "with_pd": [ - [i if i % 3 else None] if i < 9999 or i > 20001 else None - for i in range(21000) - ], - "no_pd": [ - [i if i % 3 else None] if i < 9999 or i > 20001 else [] - for i in range(21000) - ], - }, - ], -) -def test_orc_writer_lists_empty_rg(data): - pdf_in = pd.DataFrame(data) - buffer = BytesIO() - cudf_in = cudf.from_pandas(pdf_in) - - cudf_in.to_orc(buffer) - - df = cudf.read_orc(buffer) - assert_eq(df, cudf_in) - - pdf_out = pd.read_orc(buffer) - assert_eq(pdf_in, pdf_out) - - -def test_statistics_sum_overflow(): - maxint64 = np.iinfo(np.int64).max - minint64 = np.iinfo(np.int64).min - - buff = BytesIO() - df = pd.DataFrame( - {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} - ) - df.to_orc(buff) - - file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) - assert file_stats[0]["a"].get("sum") is None - assert file_stats[0]["b"].get("sum") is None - assert file_stats[0]["c"].get("sum") == minint64 + 1 - - assert stripe_stats[0]["a"].get("sum") is None - assert stripe_stats[0]["b"].get("sum") is None - assert stripe_stats[0]["c"].get("sum") == minint64 + 1 - - -def test_empty_statistics(): - from pyarrow import orc - - buff = BytesIO() - pa_table = pa.Table.from_arrays( - [ - pa.array([None], type=pa.int64()), - pa.array([None], type=pa.float64()), - pa.array([None], type=pa.string()), - pa.array([None], type=pa.decimal128(11, 2)), - pa.array([None], type=pa.timestamp("ns")), - pa.array([None], type=pa.date64()), - pa.array([None], type=pa.bool_()), - pa.array([None], type=pa.binary()), - pa.array([1], type=pa.int64()), - ], - ["a", "b", "c", "d", "e", "f", "g", "h", "i"], - ) - orc.write_table(pa_table, buff) - - 
got = cudf.io.orc.read_orc_statistics([buff]) - - # Check for both file and stripe stats - for stats in got: - # Similar expected stats for the first 6 columns in this case - for col_name in ascii_lowercase[:6]: - assert stats[0][col_name].number_of_values == 0 - assert stats[0][col_name].has_null is True - assert stats[0][col_name].get("minimum") is None - assert stats[0][col_name].get("maximum") is None - for col_name in ascii_lowercase[:3]: - assert stats[0][col_name].get("sum") == 0 - # Sum for decimal column is a string - assert stats[0]["d"].get("sum") == "0" - - assert stats[0]["g"].number_of_values == 0 - assert stats[0]["g"].has_null is True - assert stats[0]["g"].get("true_count") == 0 - assert stats[0]["g"].get("false_count") == 0 - - assert stats[0]["h"].number_of_values == 0 - assert stats[0]["h"].has_null is True - assert stats[0]["h"].get("sum") == 0 - - assert stats[0]["i"].number_of_values == 1 - assert stats[0]["i"].has_null is False - assert stats[0]["i"].get("minimum") == 1 - assert stats[0]["i"].get("maximum") == 1 - assert stats[0]["i"].get("sum") == 1 - - -@pytest.mark.parametrize( - "equivalent_columns", - [ - (["lvl1_struct.a", "lvl1_struct.b"], ["lvl1_struct"]), - (["lvl1_struct", "lvl1_struct.a"], ["lvl1_struct"]), - (["lvl1_struct.a", "lvl1_struct"], ["lvl1_struct"]), - (["lvl1_struct.b", "lvl1_struct.a"], ["lvl1_struct.b", "lvl1_struct"]), - (["lvl2_struct.lvl1_struct", "lvl2_struct"], ["lvl2_struct"]), - ( - ["lvl2_struct.a", "lvl2_struct.lvl1_struct.c", "lvl2_struct"], - ["lvl2_struct"], - ), - ], -) -def test_select_nested(list_struct_buff, equivalent_columns): - # The two column selections should be equivalent - df_cols1 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[0]) - df_cols2 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[1]) - assert_eq(df_cols1, df_cols2) - - -def test_orc_writer_rle_stream_size(datadir, tmpdir): - from pyarrow import orc - - original = datadir / "TestOrcFile.int16.rle.size.orc" - reencoded = tmpdir.join("int16_map.orc") - - df = cudf.read_orc(original) - df.to_orc(reencoded) - - # Segfaults when RLE stream sizes don't account for varint length - pa_out = orc.ORCFile(reencoded).read() - assert df.to_arrow().equals(pa_out) - - -def test_empty_columns(): - buffer = BytesIO() - # string and decimal columns have additional steps that need to be skipped - expected = cudf.DataFrame( - { - "string": cudf.Series([], dtype="str"), - "decimal": cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1)), - } - ) - expected.to_orc(buffer, compression="snappy") - - got_df = cudf.read_orc(buffer) - assert_eq(expected, got_df) - - -def test_orc_reader_zstd_compression(list_struct_buff): - from pyarrow import orc - - expected = cudf.read_orc(list_struct_buff) - # save with ZSTD compression - buffer = BytesIO() - pyarrow_tbl = orc.ORCFile(list_struct_buff).read() - with orc.ORCWriter(buffer, compression="zstd") as writer: - writer.write(pyarrow_tbl) - got = cudf.read_orc(buffer) - # compare with pyarrow since pandas doesn't - # have a list or struct - assert expected.to_arrow().equals(got.to_arrow()) - - -def test_writer_protobuf_large_rowindexentry(): - s = [ - "Length of the two strings needs to add up to at least ~120", - "So that the encoded statistics are larger than 128 bytes", - ] * 5001 # generate more than 10K rows to have two row groups - df = cudf.DataFrame({"s1": s}) - - buff = BytesIO() - df.to_orc(buff) - - got = cudf.read_orc(buff) - assert_frame_equal(df, got) - - -@pytest.mark.parametrize("compression", ["ZLIB", 
"ZSTD"]) -def test_orc_writer_nvcomp(compression): - expected = cudf.datasets.randomdata( - nrows=12345, dtypes={"a": int, "b": str, "c": float}, seed=1 - ) - - buff = BytesIO() - try: - expected.to_orc(buff, compression=compression) - except RuntimeError: - pytest.mark.xfail(reason="Newer nvCOMP version is required") - else: - got = pd.read_orc(buff) - assert_eq(expected, got) - - -def run_orc_columns_and_index_param(index_obj, index, columns): - buffer = BytesIO() - df = cudf.DataFrame( - {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj - ) - df.to_orc(buffer, index=index) - - expected = pd.read_orc(buffer, columns=columns) - got = cudf.read_orc(buffer, columns=columns) - - assert_eq(expected, got, check_index_type=True) - - -@pytest.mark.parametrize("index_obj", [None, [10, 11, 12], ["x", "y", "z"]]) -@pytest.mark.parametrize("index", [True, False, None]) -@pytest.mark.parametrize( - "columns", - [ - None, - pytest.param( - [], - marks=pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Bug in older version of pandas", - ), - ), - ], -) -def test_orc_columns_and_index_param(index_obj, index, columns): - run_orc_columns_and_index_param(index_obj, index, columns) - - -@pytest.mark.parametrize( - "columns,index,index_obj", - [ - ( - ["a", "b"], - True, - None, - ), - ( - ["a", "b"], - True, - [10, 11, 12], - ), - ( - ["a", "b"], - True, - ["x", "y", "z"], - ), - ( - ["a", "b"], - None, - [10, 11, 12], - ), - ( - ["a", "b"], - None, - ["x", "y", "z"], - ), - ], -) -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12026") -def test_orc_columns_and_index_param_read_index(index_obj, index, columns): - run_orc_columns_and_index_param(index_obj, index, columns) - - -@pytest.mark.parametrize( - "columns,index,index_obj", - [ - (["a", "b"], False, None), - (["a", "b"], False, [10, 11, 12]), - (["a", "b"], False, ["x", "y", "z"]), - (["a", "b"], None, None), - ], -) -def test_orc_columns_and_index_param_no_read_index(index_obj, index, columns): - run_orc_columns_and_index_param(index_obj, index, columns) - - -@pytest.mark.parametrize( - "df_data,cols_as_map_type,expected_data", - [ - ( - {"a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]]}, - ["a"], - {"a": [[(10, 20)], [(1, 21)]]}, - ), - ( - { - "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "b": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - }, - ["b"], - { - "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "b": [[(10, 20)], [(1, 21)]], - }, - ), - ( - { - "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "b": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "c": [ - [{"a": {"a": 10}, "b": 20}], - [{"a": {"a": 12}, "b": 21}], - ], - }, - ["b", "c"], - { - "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "b": [[(10, 20)], [(1, 21)]], - "c": [[({"a": 10}, 20)], [({"a": 12}, 21)]], - }, - ), - ], -) -def test_orc_writer_cols_as_map_type(df_data, cols_as_map_type, expected_data): - df = cudf.DataFrame(df_data) - buffer = BytesIO() - df.to_orc(buffer, cols_as_map_type=cols_as_map_type) - - got = pd.read_orc(buffer) - expected = pd.DataFrame(expected_data) - - assert_eq(got, expected) - - -def test_orc_writer_cols_as_map_type_error(): - df = cudf.DataFrame( - {"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])} - ) - buffer = BytesIO() - with pytest.raises( - TypeError, match="cols_as_map_type must be a list of column names." 
- ): - df.to_orc(buffer, cols_as_map_type=1) - - -@pytest.fixture -def negative_timestamp_df(): - return cudf.DataFrame( - { - "a": [ - pd.Timestamp("1969-12-31 23:59:59.000123"), - pd.Timestamp("1969-12-31 23:59:58.000999"), - pd.Timestamp("1969-12-31 23:59:58.001001"), - pd.Timestamp("1839-12-24 03:58:56.000826"), - ] - } - ) - - -@pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) -def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): - buffer = BytesIO() - negative_timestamp_df.to_orc(buffer) - - # We warn the user that this function will fall back to the CPU for reading - # when the engine is pyarrow. - with expect_warning_if(engine == "pyarrow", UserWarning): - got = cudf.read_orc(buffer, engine=engine) - - assert_eq(negative_timestamp_df, got, check_dtype=False) - - -def test_orc_writer_negative_timestamp(negative_timestamp_df): - from pyarrow import orc - - buffer = BytesIO() - negative_timestamp_df.to_orc(buffer) - - assert_eq(negative_timestamp_df, pd.read_orc(buffer), check_dtype=False) - assert_eq( - negative_timestamp_df, orc.ORCFile(buffer).read(), check_dtype=False - ) - - -@pytest.mark.skip( - reason="Bug specific to rockylinux8: https://github.com/rapidsai/cudf/issues/15802", -) -def test_orc_reader_apache_negative_timestamp(datadir): - path = datadir / "TestOrcFile.apache_timestamp.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - - assert_eq(pdf, gdf) - - -def test_statistics_string_sum(): - strings = ["a string", "another string!"] - buff = BytesIO() - df = cudf.DataFrame({"str": strings}) - df.to_orc(buff) - - file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) - assert_eq(file_stats[0]["str"].get("sum"), sum(len(s) for s in strings)) - - -@pytest.mark.parametrize( - "fname", - [ - "TestOrcFile.Hive.OneEmptyMap.orc", - "TestOrcFile.Hive.OneEmptyList.orc", - "TestOrcFile.Hive.OneNullStruct.orc", - "TestOrcFile.Hive.EmptyListStripe.orc", - "TestOrcFile.Hive.NullStructStripe.orc", - "TestOrcFile.Hive.AllNulls.orc", - ], -) -def test_reader_empty_stripe(datadir, fname): - path = datadir / fname - - expected = pd.read_orc(path) - got = cudf.read_orc(path) - assert_eq(expected, got) - - -# needs enough data for multiple row groups -@pytest.mark.parametrize("data", [["*"] * 10001, ["**", None] * 5001]) -def test_reader_row_index_order(data): - expected = cudf.DataFrame({"str": data}, dtype="string") - - buffer = BytesIO() - expected.to_pandas().to_orc(buffer) - got = cudf.read_orc(buffer) - assert_eq(expected, got) - - -# Test the corner case where empty blocks are compressed -# Decompressed data size is zero, even though compressed data size is non-zero -# For more information see https://github.com/rapidsai/cudf/issues/13608 -def test_orc_reader_empty_decomp_data(datadir): - path = datadir / "TestOrcFile.Spark.EmptyDecompData.orc" - - expect = pd.read_orc(path) - got = cudf.read_orc(path) - - assert_eq(expect, got) - - -def test_orc_reader_empty_deeply_nested_level(datadir): - # Test the case where top level struct has nulls, but the nested struct is - # not nullable. In this case there is no data in the second level, but we - # still need to pass the parent null mask to the third level. 
- path = datadir / "TestOrcFile.Spark.NestedNotNullableStruct.orc" - - expect = pd.read_orc(path) - got = cudf.read_orc(path) - - assert_eq(expect, got) - - -def test_orc_chunked_writer_stripe_size(datadir): - from pyarrow import orc - - df = cudf.DataFrame({"col": gen_rand_series("int", 100000)}) - - buffer = BytesIO() - writer = ORCWriter(buffer, stripe_size_bytes=64 * 1024) - writer.write_table(df) - writer.close() - - orc_file = orc.ORCFile(buffer) - assert_eq(orc_file.nstripes, 10) - - buffer = BytesIO() - writer = ORCWriter(buffer, stripe_size_rows=20000) - writer.write_table(df) - writer.close() - - orc_file = orc.ORCFile(buffer) - assert_eq(orc_file.nstripes, 5) - - -def test_reader_lz4(): - from pyarrow import orc - - pdf = pd.DataFrame({"ints": [1, 2] * 5001}) - pa_table = pa.Table.from_pandas(pdf) - - buffer = BytesIO() - writer = orc.ORCWriter(buffer, compression="LZ4") - writer.write(pa_table) - writer.close() - - got = cudf.read_orc(buffer) - assert_eq(pdf, got) - - -def test_writer_lz4(): - gdf = cudf.DataFrame({"ints": [1, 2] * 5001}) - - buffer = BytesIO() - gdf.to_orc(buffer, compression="LZ4") - - got = pd.read_orc(buffer) - assert_eq(gdf, got) - - -def test_row_group_alignment(datadir): - path = datadir / "TestOrcFile.MapManyNulls.parquet" - - expected = cudf.read_parquet(path) - - buffer = BytesIO() - expected.to_orc(buffer) - - got = cudf.read_orc(buffer) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "inputfile", - [ - # These sample data have a single column my_timestamp of the TIMESTAMP type, - # 2660 rows, and 1536 rows per row group. - "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc", - "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc", - # These two data are the same with the above, except that every 100 rows start - # with a null value. - "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc", - "TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc", - ], -) -def test_orc_reader_desynced_timestamp(datadir, inputfile): - # Test a special case where the DATA stream (second) in a TIMESTAMP column - # is progressed faster than the SECONDARY stream (nanosecond) at the start of a row - # group. In this case, the "run cache manager" in the decoder kernel is used to - # orchestrate the dual-stream processing. - # For more information, see https://github.com/rapidsai/cudf/issues/17155. - - path = datadir / inputfile - - expect = pd.read_orc(path) - got = cudf.read_orc(path) - - assert_frame_equal(cudf.from_pandas(expect), got) - - -@pytest.mark.parametrize("compression", ["LZ4", "SNAPPY", "ZLIB", "ZSTD"]) -def test_orc_decompression(set_decomp_env_vars, compression, non_nested_pdf): - # Write the DataFrame to a Parquet file - buffer = BytesIO() - non_nested_pdf.to_orc(buffer, engine_kwargs={"compression": compression}) - - # Read the Parquet file back into a DataFrame - got = cudf.read_orc(buffer) - - assert_eq(non_nested_pdf, got) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py deleted file mode 100644 index 2ab48f7ecce..00000000000 --- a/python/cudf/cudf/tests/test_parquet.py +++ /dev/null @@ -1,4601 +0,0 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
- -import datetime -import decimal -import glob -import hashlib -import math -import os -import pathlib -import random -import string -from contextlib import contextmanager -from io import BytesIO -from string import ascii_letters - -import cupy -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest -from fsspec.core import get_fs_token_paths -from packaging import version -from pyarrow import parquet as pq - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.io.parquet import ( - ParquetDatasetWriter, - ParquetWriter, - merge_parquet_filemetadata, -) -from cudf.testing import assert_eq, dataset_generator as dg -from cudf.testing._utils import TIMEDELTA_TYPES, set_random_null_mask_inplace - - -@contextmanager -def _hide_pyarrow_parquet_cpu_warnings(engine): - if engine == "pyarrow": - with pytest.warns( - UserWarning, - match="Using CPU via PyArrow to read Parquet dataset. This option " - "is both inefficient and unstable!", - ): - yield - else: - yield - - -@pytest.fixture(scope="module") -def datadir(datadir): - return datadir / "parquet" - - -@pytest.fixture(params=[1, 5, 10, 100]) -def simple_pdf(request): - rng = np.random.default_rng(seed=0) - types = [ - "bool", - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - # "uint32", pandas promotes uint32 to int64 - # https://issues.apache.org/jira/browse/ARROW-9215 - "uint64", - "float32", - "float64", - ] - nrows = request.param - - # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - { - f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) - for typ in types - }, - # Need to ensure that this index is not a RangeIndex to get the - # expected round-tripping behavior from Parquet reader/writer. - index=pd.Index(list(range(nrows))), - ) - # Delete the name of the column index, and rename the row index - test_pdf.columns.name = None - test_pdf.index.name = "test_index" - - return test_pdf - - -@pytest.fixture -def simple_gdf(simple_pdf): - return cudf.DataFrame.from_pandas(simple_pdf) - - -def build_pdf(num_columns, day_resolution_timestamps): - rng = np.random.default_rng(seed=0) - types = [ - "bool", - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - # "uint32", pandas promotes uint32 to int64 - # https://issues.apache.org/jira/browse/ARROW-9215 - "uint64", - "float32", - "float64", - "datetime64[ms]", - "datetime64[us]", - "str", - ] - nrows = num_columns.param - - # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - { - f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) - for typ in types - }, - # Need to ensure that this index is not a RangeIndex to get the - # expected round-tripping behavior from Parquet reader/writer. - index=pd.Index(list(range(nrows))), - ) - # Delete the name of the column index, and rename the row index - test_pdf.columns.name = None - test_pdf.index.name = "test_index" - - # make datetime64s a little more interesting by increasing the range of - # dates. Note that pandas will convert these to ns timestamps, so care is - # taken to avoid overflowing a ns timestamp. There is also the ability to - # request timestamps be whole days only via `day_resolution_timestamps`.
- for t in [ - { - "name": "datetime64[ms]", - "nsDivisor": 1000000, - "dayModulus": 86400000, - }, - { - "name": "datetime64[us]", - "nsDivisor": 1000, - "dayModulus": 86400000000, - }, - ]: - data = [ - rng.integers(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) - for i in range(nrows) - ] - if day_resolution_timestamps: - data = [int(d / t["dayModulus"]) * t["dayModulus"] for d in data] - test_pdf["col_" + t["name"]] = pd.Series( - np.asarray(data, dtype=t["name"]) - ) - - # Create non-numeric categorical data otherwise parquet may typecast it - data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] - test_pdf["col_category"] = pd.Series(data, dtype="category") - - # Create non-numeric str data - data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] - test_pdf["col_str"] = pd.Series(data, dtype="str") - - return test_pdf - - -@pytest.fixture(params=[0, 1, 10, 100]) -def pdf(request): - return build_pdf(request, False) - - -@pytest.fixture(params=[0, 1, 10, 100]) -def pdf_day_timestamps(request): - return build_pdf(request, True) - - -@pytest.fixture -def gdf(pdf): - return cudf.DataFrame.from_pandas(pdf) - - -@pytest.fixture -def gdf_day_timestamps(pdf_day_timestamps): - return cudf.DataFrame.from_pandas(pdf_day_timestamps) - - -@pytest.fixture(params=["snappy", "gzip", "brotli", None, np.str_("snappy")]) -def parquet_file(request, tmp_path_factory, pdf): - fname = tmp_path_factory.mktemp("parquet") / ( - str(request.param) + "_test.parquet" - ) - pdf.to_parquet(fname, engine="pyarrow", compression=request.param) - return fname - - -def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): - test_pdf = pd.DataFrame( - [list(range(ncolumns * i, ncolumns * (i + 1))) for i in range(nrows)], - columns=pd.Index(["foo"], name="bar"), - # Need to ensure that this index is not a RangeIndex to get the - # expected round-tripping behavior from Parquet reader/writer. 
- index=pd.Index(list(range(nrows))), - ) - test_pdf.columns.name = None - - if nvalids: - # Randomly but reproducibly mark subset of rows as invalid - random.seed(1337) - mask = random.sample(range(nrows), nvalids) - test_pdf[test_pdf.index.isin(mask)] = np.nan - if dtype: - test_pdf = test_pdf.astype(dtype) - - return test_pdf - - -@pytest.fixture -def parquet_path_or_buf(datadir): - fname = datadir / "spark_timestamp.snappy.parquet" - try: - with open(fname, "rb") as f: - buffer = BytesIO(f.read()) - except Exception as excpr: - if type(excpr).__name__ == "FileNotFoundError": - pytest.skip(".parquet file is not found") - raise excpr - - def _make_parquet_path_or_buf(src): - if src == "filepath": - return str(fname) - if src == "pathobj": - return fname - if src == "bytes_io": - return buffer - if src == "bytes": - return buffer.getvalue() - if src == "url": - return fname.as_uri() - - raise ValueError("Invalid source type") - - yield _make_parquet_path_or_buf - - -@pytest.fixture(scope="module") -def large_int64_gdf(): - return cudf.DataFrame.from_pandas(pd.DataFrame({"col": range(0, 1 << 20)})) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["pyarrow", "cudf"]) -@pytest.mark.parametrize( - "columns", - [ - ["col_int8"], - ["col_category"], - ["col_int32", "col_float32"], - ["col_int16", "col_float64", "col_int8"], - None, - ], -) -def test_parquet_reader_basic(parquet_file, columns, engine): - expect = pd.read_parquet(parquet_file, columns=columns) - got = cudf.read_parquet(parquet_file, engine=engine, columns=columns) - - # PANDAS returns category objects whereas cuDF returns hashes - if engine == "cudf": - if "col_category" in expect.columns: - expect = expect.drop(columns=["col_category"]) - if "col_category" in got.columns: - got = got.drop(columns=["col_category"]) - - assert_eq(expect, got) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["cudf"]) -def test_parquet_reader_empty_pandas_dataframe(tmpdir, engine): - df = pd.DataFrame() - fname = tmpdir.join("test_pq_reader_empty_pandas_dataframe.parquet") - df.to_parquet(fname) - assert os.path.exists(fname) - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname, engine=engine) - expect = expect.reset_index(drop=True) - got = got.reset_index(drop=True) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("has_null", [False, True]) -def test_parquet_reader_strings(tmpdir, has_null): - df = pd.DataFrame( - [(1, "aaa", 9.0), (2, "bbb", 8.0), (3, "ccc", 7.0)], - columns=pd.Index(list("abc")), - ) - if has_null: - df.at[1, "b"] = None - fname = tmpdir.join("test_pq_reader_strings.parquet") - df.to_parquet(fname) - assert os.path.exists(fname) - - gdf = cudf.read_parquet(fname, engine="cudf") - - assert gdf["b"].dtype == np.dtype("object") - assert_eq(gdf["b"], df["b"]) - - -@pytest.mark.parametrize("columns", [None, ["b"]]) -@pytest.mark.parametrize("index_col", ["b", "Nameless", None]) -def test_parquet_reader_index_col(tmpdir, index_col, columns): - df = pd.DataFrame({"a": range(3), "b": range(3, 6), "c": range(6, 9)}) - - if index_col is None: - # No index column - df.reset_index(drop=True, inplace=True) - elif index_col == "Nameless": - # Index column but no name - df.set_index("a", inplace=True) - df.index.name = None - else: - # Index column as normal - df.set_index(index_col, inplace=True) - - fname = tmpdir.join("test_pq_reader_index_col.parquet") - - # PANDAS' PyArrow backend always writes the index unless disabled - 
df.to_parquet(fname, index=(index_col is not None)) - assert os.path.exists(fname) - - pdf = pd.read_parquet(fname, columns=columns) - gdf = cudf.read_parquet(fname, engine="cudf", columns=columns) - - assert_eq(pdf, gdf, check_categorical=False) - - -@pytest.mark.parametrize("pandas_compat", [True, False]) -@pytest.mark.parametrize( - "columns", [["a"], ["d"], ["a", "b"], ["a", "d"], None] -) -def test_parquet_reader_pandas_metadata(tmpdir, columns, pandas_compat): - df = pd.DataFrame( - { - "a": range(6, 9), - "b": range(3, 6), - "c": range(6, 9), - "d": ["abc", "def", "xyz"], - } - ) - df.set_index("b", inplace=True) - - fname = tmpdir.join("test_pq_reader_pandas_metadata.parquet") - df.to_parquet(fname) - assert os.path.exists(fname) - - # PANDAS `read_parquet()` and PyArrow `read_pandas()` always includes index - # Instead, directly use PyArrow to optionally omit the index - expect = pa.parquet.read_table( - fname, columns=columns, use_pandas_metadata=pandas_compat - ).to_pandas() - got = cudf.read_parquet( - fname, columns=columns, use_pandas_metadata=pandas_compat - ) - - if pandas_compat or columns is None or "b" in columns: - assert got.index.name == "b" - else: - assert got.index.name is None - assert_eq(expect, got, check_categorical=False) - - -@pytest.mark.parametrize("pandas_compat", [True, False]) -@pytest.mark.parametrize("as_bytes", [True, False]) -def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes): - df = pd.DataFrame( - {"a": range(6, 9), "b": ["abc", "def", "xyz"]}, - index=pd.RangeIndex(3, 6, 1, name="c"), - ) - - fname = tmpdir.join("test_parquet_range_index_pandas_metadata") - df.to_parquet(fname) - assert os.path.exists(fname) - - # PANDAS `read_parquet()` and PyArrow `read_pandas()` always includes index - # Instead, directly use PyArrow to optionally omit the index - expect = pa.parquet.read_table( - fname, use_pandas_metadata=pandas_compat - ).to_pandas() - if as_bytes: - # Make sure we can handle RangeIndex parsing - # in pandas when the input is `bytes` - with open(fname, "rb") as f: - got = cudf.read_parquet( - f.read(), use_pandas_metadata=pandas_compat - ) - else: - got = cudf.read_parquet(fname, use_pandas_metadata=pandas_compat) - - assert_eq(expect, got) - - -def test_parquet_read_metadata(tmp_path, pdf): - if len(pdf) > 100: - pytest.skip("Skipping long setup test") - - def num_row_groups(rows, group_size): - return max(1, (rows + (group_size - 1)) // group_size) - - fname = tmp_path / "metadata.parquet" - row_group_size = 5 - pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) - - ( - num_rows, - row_groups, - col_names, - num_columns, - _, # rowgroup_metadata - ) = cudf.io.read_parquet_metadata(fname) - - assert num_columns == len(pdf.columns) - assert num_rows == len(pdf.index) - assert row_groups == num_row_groups(num_rows, row_group_size) - for a, b in zip(col_names, pdf.columns, strict=True): - assert a == b - - -def test_parquet_read_filtered(set_decomp_env_vars, tmpdir): - # Generate data - fname = tmpdir.join("filtered.parquet") - dg.generate( - fname, - dg.Parameters( - num_rows=100, - column_parameters=[ - dg.ColumnParameters( - cardinality=40, - null_frequency=0.05, - generator=lambda: [ - "".join( - random.sample( - string.ascii_letters, random.randint(4, 8) - ) - ) - for _ in range(10) - ], - is_sorted=False, - ), - dg.ColumnParameters( - 40, - 0.2, - lambda: np.random.default_rng(seed=0).integers( - 0, 100, size=10 - ), - True, - ), - ], - seed=42, - ), - format={"name": "parquet", 
"row_group_size": 10}, - use_threads=False, - ) - - # Get dataframes to compare - df = cudf.read_parquet(fname) - df_filtered = cudf.read_parquet(fname, filters=[("1", ">", 60)]) - # PyArrow's read_table function does row-group-level filtering in addition - # to applying given filters once the table has been read into memory. - # Because of this, we aren't using PyArrow as a reference for testing our - # row-group selection method since the only way to only select row groups - # with PyArrow is with the method we use and intend to test. - tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)]) - - assert_eq(cudf.io.read_parquet_metadata(fname)[1], 10) - assert len(df_filtered) < len(df) - assert len(tbl_filtered) <= len(df_filtered) - - -def test_parquet_read_filtered_everything(tmpdir): - # Generate data - fname = tmpdir.join("filtered_everything.parquet") - df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) - df.to_parquet(fname, row_group_size=2) - - # Check filter - df_filtered = cudf.read_parquet(fname, filters=[("x", "==", 12)]) - assert_eq(len(df_filtered), 0) - assert_eq(df_filtered["x"].dtype, "int64") - assert_eq(df_filtered["y"].dtype, "object") - - -def test_parquet_read_filtered_multiple_files(tmpdir): - # Generate data - fname_0 = tmpdir.join("filtered_multiple_files_0.parquet") - df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) - df.to_parquet(fname_0, row_group_size=2) - fname_1 = tmpdir.join("filtered_multiple_files_1.parquet") - df = pd.DataFrame({"x": range(10), "y": list("aaccccddee")}) - df.to_parquet(fname_1, row_group_size=2) - fname_2 = tmpdir.join("filtered_multiple_files_2.parquet") - df = pd.DataFrame( - {"x": [0, 1, 9, 9, 4, 5, 6, 7, 8, 9], "y": list("aabbzzddee")} - ) - df.to_parquet(fname_2, row_group_size=2) - - # Check filter - filtered_df = cudf.read_parquet( - [fname_0, fname_1, fname_2], filters=[("x", "==", 2)] - ) - assert_eq( - filtered_df, - cudf.DataFrame({"x": [2, 2], "y": list("bc")}, index=[2, 2]), - ) - - -@pytest.mark.parametrize( - "predicate,expected_len", - [ - ([[("x", "==", 0)], [("z", "==", 0)]], 2), - ([("x", "==", 0), ("z", "==", 0)], 0), - ([("x", "==", 0), ("z", "!=", 0)], 1), - ([("y", "==", "c"), ("x", ">", 8)], 0), - ([("y", "==", "c"), ("x", ">=", 5)], 1), - ([[("y", "==", "c")], [("x", "<", 3)]], 5), - ([[("x", "not in", (0, 9)), ("z", "not in", (4, 5))]], 6), - ([[("y", "==", "c")], [("x", "in", (0, 9)), ("z", "in", (0, 9))]], 4), - ([[("x", "==", 0)], [("x", "==", 1)], [("x", "==", 2)]], 3), - ([[("x", "==", 0), ("z", "==", 9), ("y", "==", "a")]], 1), - ], -) -def test_parquet_read_filtered_complex_predicate( - tmpdir, predicate, expected_len -): - # Generate data - fname = tmpdir.join("filtered_complex_predicate.parquet") - df = pd.DataFrame( - { - "x": range(10), - "y": list("aabbccddee"), - "z": reversed(range(10)), - } - ) - df.to_parquet(fname, row_group_size=2) - - # Check filters - df_filtered = cudf.read_parquet(fname, filters=predicate) - assert_eq(cudf.io.read_parquet_metadata(fname)[1], 10 / 2) - assert_eq(len(df_filtered), expected_len) - - -@pytest.mark.parametrize("row_group_size", [1, 5, 100]) -def test_parquet_read_row_groups(tmpdir, pdf, row_group_size): - if len(pdf) > 100: - pytest.skip("Skipping long setup test") - - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category"]) - fname = tmpdir.join("row_group.parquet") - pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size) - - num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata( 
-
-
-@pytest.mark.parametrize("row_group_size", [1, 5, 100])
-def test_parquet_read_row_groups(tmpdir, pdf, row_group_size):
-    if len(pdf) > 100:
-        pytest.skip("Skipping long setup test")
-
-    if "col_category" in pdf.columns:
-        pdf = pdf.drop(columns=["col_category"])
-    fname = tmpdir.join("row_group.parquet")
-    pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size)
-
-    num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata(
-        fname
-    )
-
-    gdf = [cudf.read_parquet(fname, row_groups=[i]) for i in range(row_groups)]
-    gdf = cudf.concat(gdf)
-    assert_eq(pdf.reset_index(drop=True), gdf.reset_index(drop=True))
-
-    # the first half of the rows come from the first source, the rest from the second
-    gdf = cudf.read_parquet(
-        [fname, fname],
-        row_groups=[
-            list(range(row_groups // 2)),
-            list(range(row_groups // 2, row_groups)),
-        ],
-    )
-    assert_eq(pdf.reset_index(drop=True), gdf.reset_index(drop=True))
-
-
-@pytest.mark.parametrize("row_group_size", [1, 5, 100])
-def test_parquet_read_row_groups_non_contiguous(tmpdir, pdf, row_group_size):
-    if len(pdf) > 100:
-        pytest.skip("Skipping long setup test")
-
-    fname = tmpdir.join("row_group.parquet")
-    pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size)
-
-    num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata(
-        fname
-    )
-
-    # alternate rows between the two sources
-    gdf = cudf.read_parquet(
-        [fname, fname],
-        row_groups=[
-            list(range(0, row_groups, 2)),
-            list(range(1, row_groups, 2)),
-        ],
-    )
-
-    ref_df = [
-        cudf.read_parquet(fname, row_groups=i)
-        for i in list(range(0, row_groups, 2)) + list(range(1, row_groups, 2))
-    ]
-    ref_df = cudf.concat(ref_df)
-
-    assert_eq(ref_df, gdf)
-
-
-def test_parquet_reader_spark_timestamps(datadir):
-    fname = datadir / "spark_timestamp.snappy.parquet"
-
-    expect = pd.read_parquet(fname)
-    got = cudf.read_parquet(fname)
-
-    assert_eq(expect, got)
-
-
-def test_parquet_reader_spark_decimals(datadir):
-    fname = datadir / "spark_decimal.parquet"
-
-    expect = pd.read_parquet(fname)
-    got = cudf.read_parquet(fname)
-
-    assert_eq(expect, got)
-
-
-@pytest.mark.parametrize("columns", [["a"], ["b", "a"], None])
-def test_parquet_reader_decimal128(datadir, columns):
-    fname = datadir / "nested_decimal128_file.parquet"
-    got = cudf.read_parquet(fname, columns=columns)
-    expect = cudf.read_parquet(fname, columns=columns)
-
-    assert_eq(expect, got)
-
-
-def test_parquet_reader_microsecond_timestamps(datadir):
-    fname = datadir / "usec_timestamp.parquet"
-
-    expect = pd.read_parquet(fname)
-    got = cudf.read_parquet(fname)
-
-    assert_eq(expect, got)
-
-
-def test_parquet_reader_mixedcompression(datadir):
-    fname = datadir / "mixed_compression.parquet"
-
-    expect = pd.read_parquet(fname)
-    got = cudf.read_parquet(fname)
-
-    assert_eq(expect, got)
-
-
-def test_parquet_reader_select_columns(datadir):
-    fname = datadir / "nested_column_map.parquet"
-
-    expect = cudf.read_parquet(fname).to_pandas()[["value"]]
-    got = cudf.read_parquet(fname, columns=["value"])
-
-    assert_eq(expect, got)
-
-
-def test_parquet_reader_invalids(tmpdir):
-    test_pdf = make_pdf(nrows=1000, nvalids=1000 // 4, dtype="Int64")
-
-    fname = tmpdir.join("invalids.parquet")
-    test_pdf.to_parquet(fname, engine="pyarrow")
-
-    expect = pd.read_parquet(fname)
-    got = cudf.read_parquet(fname)
-
-    assert_eq(expect, got.to_pandas(nullable=True))
-
-
-def test_parquet_reader_filenotfound(tmpdir):
-    with pytest.raises(FileNotFoundError):
-        cudf.read_parquet("TestMissingFile.parquet")
-
-    with pytest.raises(FileNotFoundError):
-        cudf.read_parquet(tmpdir.mkdir("cudf_parquet"))
-
-
-def test_parquet_reader_local_filepath():
-    fname = "~/TestLocalFile.parquet"
-    if not os.path.isfile(fname):
-        pytest.skip("Local .parquet file is not found")
-
-    cudf.read_parquet(fname)
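
As the row-group tests above rely on, `cudf.read_parquet` accepts one list of row-group ids per input source, and `cudf.io.read_parquet_metadata` returns the row-group count as its second element. A short self-contained sketch (the file name is illustrative):

    import cudf
    import pandas as pd

    pd.DataFrame({"a": range(100)}).to_parquet("rg.parquet", row_group_size=10)
    num_rows, num_rgs = cudf.io.read_parquet_metadata("rg.parquet")[:2]

    # One list of row-group ids per source: even groups from the first
    # "source", odd groups from the second.
    gdf = cudf.read_parquet(
        ["rg.parquet", "rg.parquet"],
        row_groups=[list(range(0, num_rgs, 2)), list(range(1, num_rgs, 2))],
    )
    assert len(gdf) == num_rows
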
-
-
-@pytest.mark.parametrize(
-    "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]
-)
-def test_parquet_reader_filepath_or_buffer(parquet_path_or_buf, src):
-    expect = pd.read_parquet(parquet_path_or_buf("filepath"))
-    got = cudf.read_parquet(parquet_path_or_buf(src))
-
-    assert_eq(expect, got)
-
-
-def test_parquet_reader_file_types(parquet_path_or_buf):
-    expect = cudf.read_parquet(parquet_path_or_buf("filepath"))
-    fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath"))
-
-    # Pass open fsspec file
-    with fs.open(paths[0], mode="rb") as fil:
-        got1 = cudf.read_parquet(fil)
-    assert_eq(expect, got1)
-
-    # Pass path only
-    got2 = cudf.read_parquet(paths[0])
-    assert_eq(expect, got2)
-
-
-def create_parquet_source(df, src_type, fname):
-    if src_type == "filepath":
-        df.to_parquet(fname, engine="pyarrow")
-        return str(fname)
-    if src_type == "pathobj":
-        df.to_parquet(fname, engine="pyarrow")
-        return fname
-    if src_type == "bytes_io":
-        buffer = BytesIO()
-        df.to_parquet(buffer, engine="pyarrow")
-        return buffer
-    if src_type == "bytes":
-        buffer = BytesIO()
-        df.to_parquet(buffer, engine="pyarrow")
-        return buffer.getvalue()
-    if src_type == "url":
-        df.to_parquet(fname, engine="pyarrow")
-        return pathlib.Path(fname).as_uri()
-
-
-@pytest.mark.parametrize(
-    "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"]
-)
-def test_parquet_reader_multiple_files(tmpdir, src):
-    test_pdf1 = make_pdf(nrows=1000, nvalids=1000 // 2, dtype="float64")
-    test_pdf2 = make_pdf(nrows=500, dtype="float64")
-    expect = pd.concat([test_pdf1, test_pdf2])
-
-    src1 = create_parquet_source(test_pdf1, src, tmpdir.join("multi1.parquet"))
-    src2 = create_parquet_source(test_pdf2, src, tmpdir.join("multi2.parquet"))
-    got = cudf.read_parquet([src1, src2])
-
-    assert_eq(expect, got)
-
-
-def test_parquet_reader_reordered_columns(tmpdir):
-    src = pd.DataFrame(
-        {"name": ["cow", None, "duck", "fish", None], "id": [0, 1, 2, 3, 4]}
-    )
-    fname = tmpdir.join("test_parquet_reader_reordered_columns.parquet")
-    src.to_parquet(fname)
-    assert os.path.exists(fname)
-    expect = pd.DataFrame(
-        {"id": [0, 1, 2, 3, 4], "name": ["cow", None, "duck", "fish", None]}
-    )
-    got = cudf.read_parquet(fname, columns=["id", "name"])
-    assert_eq(expect, got, check_dtype=False)
-
-
-def test_parquet_reader_reordered_columns_mixed(tmpdir):
-    src = pd.DataFrame(
-        {
-            "name": ["cow", None, "duck", "fish", None],
-            "list0": [
-                [[1, 2], [3, 4]],
-                None,
-                [[5, 6], None],
-                [[1]],
-                [[5], [6, None, 8]],
-            ],
-            "id": [0, 1, 2, 3, 4],
-            "list1": [
-                [[1, 2], [3, 4]],
-                [[0, 0]],
-                [[5, 6], [10, 12]],
-                [[1]],
-                [[5], [6, 8]],
-            ],
-        }
-    )
-    fname = tmpdir.join("test_parquet_reader_reordered_columns.parquet")
-    src.to_parquet(fname)
-    assert os.path.exists(fname)
-    expect = pd.DataFrame(
-        {
-            "list1": [
-                [[1, 2], [3, 4]],
-                [[0, 0]],
-                [[5, 6], [10, 12]],
-                [[1]],
-                [[5], [6, 8]],
-            ],
-            "id": [0, 1, 2, 3, 4],
-            "list0": [
-                [[1, 2], [3, 4]],
-                None,
-                [[5, 6], None],
-                [[1]],
-                [[5], [6, None, 8]],
-            ],
-            "name": ["cow", None, "duck", "fish", None],
-        }
-    )
-    got = cudf.read_parquet(fname, columns=["list1", "id", "list0", "name"])
-    assert_eq(expect, got, check_dtype=False)
-
-
-def test_parquet_reader_list_basic(tmpdir):
-    expect = pd.DataFrame({"a": [[[1, 2], [3, 4]], None, [[5, 6], None]]})
-    fname = tmpdir.join("test_parquet_reader_list_basic.parquet")
-    expect.to_parquet(fname)
-    assert os.path.exists(fname)
-    got = cudf.read_parquet(fname)
-    assert_eq(expect, got)
-
-
-def test_parquet_reader_list_table(tmpdir):
-    expect = pd.DataFrame(
-        {
-            "a": [[[1, 2], [3, 4]], None, [[5, 6], None]],
-            "b": [[None, None], None, [None, None]],
-            "c": [[[1, 2, 3]], [[None]], [[], None]],
-            "d": [[[]], [[None]], [[1, 2, 3], None]],
-            "e": [[["cows"]], [["dogs"]], [["cats", "birds", "owls"], None]],
-        }
-    )
-    fname = tmpdir.join("test_parquet_reader_list_table.parquet")
-    expect.to_parquet(fname)
-    assert os.path.exists(fname)
-    got = cudf.read_parquet(fname)
-    assert pa.Table.from_pandas(expect).equals(got.to_arrow())
-
-
-def int_gen(first_val, i):
-    """
-    Returns an integer based on an absolute index and a starting value. Used
-    as input to `list_gen`.
-    """
-    return int(i + first_val)
-
-
-strings = [
-    "cats",
-    "dogs",
-    "cows",
-    "birds",
-    "fish",
-    "sheep",
-    "owls",
-    "bears",
-    "ants",
-]
-
-
-def string_gen(first_val, i):
-    """
-    Returns a string based on an absolute index and a starting value. Used as
-    input to `list_gen`.
-    """
-    return strings[int_gen(first_val, i) % len(strings)]
-
-
-def list_row_gen(
-    gen, first_val, list_size, lists_per_row, include_validity=False
-):
-    """
-    Generate a single row for a List<List<>> column based on input parameters.
-
-    Parameters
-    ----------
-    gen : A callable which generates an individual leaf element based on an
-          absolute index.
-    first_val : Generate the column as if it had started at 'first_val'
-                instead of 0.
-    list_size : Size of each generated list.
-    lists_per_row : Number of lists to generate per row.
-    include_validity : Whether or not to include nulls as part of the
-                       column. If true, it will add a selection of nulls at
-                       both the topmost row level and at the leaf level.
-
-    Returns
-    -------
-    The generated list column.
-    """
-
-    def L(list_size, first_val):
-        return [
-            (gen(first_val, i) if i % 2 == 0 else None)
-            if include_validity
-            else (gen(first_val, i))
-            for i in range(list_size)
-        ]
-
-    return [
-        (L(list_size, first_val + (list_size * i)) if i % 2 == 0 else None)
-        if include_validity
-        else L(list_size, first_val + (list_size * i))
-        for i in range(lists_per_row)
-    ]
-
-
-def list_gen(gen, num_rows, lists_per_row, list_size, include_validity=False):
-    """
-    Generate a list column based on input parameters.
-
-    Parameters
-    ----------
-    gen : A callable which generates an individual leaf element based on an
-          absolute index.
-    num_rows : Number of rows to generate.
-    lists_per_row : Number of lists to generate per row.
-    list_size : Size of each generated list.
-    include_validity : Whether or not to include nulls as part of the
-                       column. If true, it will add a selection of nulls at
-                       both the topmost row level and at the leaf level.
-
-    Returns
-    -------
-    The generated list column.
-    """
-
-    def L(list_size, first_val):
-        return [
-            (gen(first_val, i) if i % 2 == 0 else None)
-            if include_validity
-            else (gen(first_val, i))
-            for i in range(list_size)
-        ]
-
-    def R(first_val, lists_per_row, list_size):
-        return [
-            L(list_size, first_val + (list_size * i))
-            for i in range(lists_per_row)
-        ]
-
-    return [
-        (
-            R(
-                lists_per_row * list_size * i,
-                lists_per_row,
-                list_size,
-            )
-            if i % 2 == 0
-            else None
-        )
-        if include_validity
-        else R(
-            lists_per_row * list_size * i,
-            lists_per_row,
-            list_size,
-        )
-        for i in range(num_rows)
-    ]
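
For concreteness, the nesting these helpers produce, worked out by hand from the definitions above:

    # Leaf values are consecutive integers laid out row-major across
    # rows, lists, and leaf slots:
    assert list_gen(int_gen, 2, 2, 2) == [
        [[0, 1], [2, 3]],  # row 0 starts at 0
        [[4, 5], [6, 7]],  # row 1 starts at 2 * 2 * 1 = 4
    ]

    # With include_validity=True, odd leaf slots and odd rows are nulled:
    assert list_gen(int_gen, 2, 2, 2, include_validity=True) == [
        [[0, None], [2, None]],
        None,
    ]
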
- """ - - def L(list_size, first_val): - return [ - (gen(first_val, i) if i % 2 == 0 else None) - if include_validity - else (gen(first_val, i)) - for i in range(list_size) - ] - - def R(first_val, lists_per_row, list_size): - return [ - L(list_size, first_val + (list_size * i)) - for i in range(lists_per_row) - ] - - return [ - ( - R( - lists_per_row * list_size * i, - lists_per_row, - list_size, - ) - if i % 2 == 0 - else None - ) - if include_validity - else R( - lists_per_row * list_size * i, - lists_per_row, - list_size, - ) - for i in range(num_rows) - ] - - -def test_parquet_reader_list_large(tmpdir): - expect = pd.DataFrame({"a": list_gen(int_gen, 64, 40, 25)}) - fname = tmpdir.join("test_parquet_reader_list_large.parquet") - expect.to_parquet(fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert_eq(expect, got, check_dtype=False) - - -def test_parquet_reader_list_validity(tmpdir): - expect = pd.DataFrame( - {"a": list_gen(int_gen, 64, 40, 25, include_validity=True)} - ) - fname = tmpdir.join("test_parquet_reader_list_validity.parquet") - expect.to_parquet(fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert_eq(expect, got, check_dtype=False) - - -def test_parquet_reader_list_large_mixed(tmpdir): - expect = pd.DataFrame( - { - "a": list_gen(string_gen, 64, 40, 25), - "b": list_gen(int_gen, 64, 40, 25), - "c": list_gen(int_gen, 64, 40, 25, include_validity=True), - "d": list_gen(string_gen, 64, 40, 25, include_validity=True), - } - ) - fname = tmpdir.join("test_parquet_reader_list_large_mixed.parquet") - expect.to_parquet(fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert pa.Table.from_pandas(expect).equals(got.to_arrow()) - - -def test_parquet_reader_list_large_multi_rowgroup(tmpdir): - # > 40 row groups - num_rows = 10000 - num_docs = num_rows / 2 - num_categories = 100 - row_group_size = 100 - - cupy.random.seed(0) - - # generate a random pairing of doc: category - documents = cudf.DataFrame( - { - "document_id": cupy.random.randint(num_docs, size=num_rows), - "category_id": cupy.random.randint(num_categories, size=num_rows), - } - ) - - # group categories by document_id to create a list column - expect = documents.groupby("document_id").agg({"category_id": ["collect"]}) - expect.columns = expect.columns.get_level_values(0) - expect.reset_index(inplace=True) - - # round trip the dataframe to/from parquet - fname = tmpdir.join( - "test_parquet_reader_list_large_multi_rowgroup.parquet" - ) - expect.to_pandas().to_parquet(fname, row_group_size=row_group_size) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): - # 25 row groups - num_rows = 2500 - row_group_size = 100 - - expect = cudf.DataFrame( - {"a": list_gen(int_gen, num_rows, 3, 2, include_validity=True)} - ) - - # round trip the dataframe to/from parquet - fname = tmpdir.join( - "test_parquet_reader_list_large_multi_rowgroup_nulls.parquet" - ) - expect.to_pandas().to_parquet(fname, row_group_size=row_group_size) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert_eq(expect, got) - - -def struct_gen(gen, skip_rows, num_rows, include_validity=False): - """ - Generate a struct column based on input parameters. - - Parameters - ---------- - gen : A array of callables which generate an individual row based on an - absolute index. - skip_rows : Generate the column as if it had started at 'skip_rows' - instead of 0. 
-
-
-def struct_gen(gen, skip_rows, num_rows, include_validity=False):
-    """
-    Generate a struct column based on input parameters.
-
-    Parameters
-    ----------
-    gen : An array of callables which generate an individual row based on an
-          absolute index.
-    skip_rows : Generate the column as if it had started at 'skip_rows'
-                instead of 0. The intent here is to emulate the skip_rows
-                parameter of the parquet reader.
-    num_rows : Number of rows to generate.
-    include_validity : Whether or not to include nulls as part of the
-                       column. If true, it will add a selection of nulls at
-                       both the field level and at the value level.
-
-    Returns
-    -------
-    The generated struct column.
-    """
-
-    def R(first_val, num_fields):
-        return {
-            "col" + str(f): (
-                gen[f](first_val, first_val) if f % 4 != 0 else None
-            )
-            if include_validity
-            else (gen[f](first_val, first_val))
-            for f in range(len(gen))
-        }
-
-    return [
-        (R((i + skip_rows), len(gen)) if (i + skip_rows) % 4 != 0 else None)
-        if include_validity
-        else R((i + skip_rows), len(gen))
-        for i in range(num_rows)
-    ]
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        # struct
-        [
-            {"a": 1, "b": 2},
-            {"a": 10, "b": 20},
-            {"a": None, "b": 22},
-            {"a": None, "b": None},
-            {"a": 15, "b": None},
-        ],
-        # struct-of-list
-        [
-            {"a": 1, "b": 2, "c": [1, 2, 3]},
-            {"a": 10, "b": 20, "c": [4, 5]},
-            {"a": None, "b": 22, "c": [6]},
-            {"a": None, "b": None, "c": None},
-            {"a": 15, "b": None, "c": [-1, -2]},
-            None,
-            {"a": 100, "b": 200, "c": [-10, None, -20]},
-        ],
-        # list-of-struct
-        [
-            [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}],
-            None,
-            [{"a": 10, "b": 20}],
-            [{"a": 100, "b": 200}, {"a": None, "b": 300}, None],
-        ],
-        # struct-of-struct
-        [
-            {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2},
-            {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4},
-            {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6},
-            {"a": 7, "b": None, "c": 8},
-            {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None},
-            None,
-            {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10},
-        ],
-    ],
-)
-def test_parquet_reader_struct_basic(tmpdir, data):
-    expect = pa.Table.from_pydict({"struct": data})
-    fname = tmpdir.join("test_parquet_reader_struct_basic.parquet")
-    pa.parquet.write_table(expect, fname)
-    assert os.path.exists(fname)
-    got = cudf.read_parquet(fname)
-    assert expect.equals(got.to_arrow())
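
Again for concreteness, what `struct_gen` produces, computed from the definition above (field `f` of row `i` is `gen[f](i + skip_rows, i + skip_rows)`):

    assert struct_gen([int_gen, string_gen], 0, 3) == [
        {"col0": 0, "col1": "cats"},  # int_gen(0, 0) == 0, strings[0]
        {"col0": 2, "col1": "cows"},  # int_gen(1, 1) == 2, strings[2]
        {"col0": 4, "col1": "fish"},  # int_gen(2, 2) == 4, strings[4]
    ]

    # With include_validity=True, every fourth row and every fourth field
    # (here "col0", since 0 % 4 == 0) are nulled:
    assert struct_gen([int_gen, string_gen], 0, 2, include_validity=True) == [
        None,
        {"col0": None, "col1": "cows"},
    ]
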
-
-
-def select_columns_params():
-    dfs = [
-        # struct
-        (
-            [
-                {"a": 1, "b": 2},
-                {"a": 10, "b": 20},
-                {"a": None, "b": 22},
-                {"a": None, "b": None},
-                {"a": 15, "b": None},
-            ],
-            [["struct"], ["struct.a"], ["struct.b"], ["c"]],
-        ),
-        # struct-of-list
-        (
-            [
-                {"a": 1, "b": 2, "c": [1, 2, 3]},
-                {"a": 10, "b": 20, "c": [4, 5]},
-                {"a": None, "b": 22, "c": [6]},
-                {"a": None, "b": None, "c": None},
-                {"a": 15, "b": None, "c": [-1, -2]},
-                None,
-                {"a": 100, "b": 200, "c": [-10, None, -20]},
-            ],
-            [
-                ["struct"],
-                ["struct.c"],
-                ["struct.c.list"],
-                ["struct.c.list.item"],
-                ["struct.b", "struct.c"],
-                ["struct.b", "struct.d", "struct.c"],
-            ],
-        ),
-        # list-of-struct
-        (
-            [
-                [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}],
-                None,
-                [{"a": 10, "b": 20}],
-                [{"a": 100, "b": 200}, {"a": None, "b": 300}, None],
-            ],
-            [
-                ["struct"],
-                ["struct.list"],
-                ["struct.list.item"],
-                ["struct.list.item.a", "struct.list.item.b"],
-                ["struct.list.item.c"],
-            ],
-        ),
-        # struct with "." in field names
-        (
-            [
-                {"a.b": 1, "b.a": 2},
-                {"a.b": 10, "b.a": 20},
-                {"a.b": None, "b.a": 22},
-                {"a.b": None, "b.a": None},
-                {"a.b": 15, "b.a": None},
-            ],
-            [["struct"], ["struct.a"], ["struct.b.a"]],
-        ),
-    ]
-    for df_col_pair in dfs:
-        for cols in df_col_pair[1]:
-            yield df_col_pair[0], cols
-
-
-@pytest.mark.parametrize("data, columns", select_columns_params())
-def test_parquet_reader_struct_select_columns(tmpdir, data, columns):
-    table = pa.Table.from_pydict({"struct": data})
-    buff = BytesIO()
-
-    pa.parquet.write_table(table, buff)
-
-    expect = pq.ParquetFile(buff).read(columns=columns)
-    got = cudf.read_parquet(buff, columns=columns)
-    assert expect.equals(got.to_arrow())
-
-
-def test_parquet_reader_struct_los_large(tmpdir):
-    num_rows = 256
-    list_size = 64
-    data = [
-        struct_gen([string_gen, int_gen, string_gen], 0, list_size, False)
-        if i % 2 == 0
-        else None
-        for i in range(num_rows)
-    ]
-    expect = pa.Table.from_pydict({"los": data})
-    fname = tmpdir.join("test_parquet_reader_struct_los_large.parquet")
-    pa.parquet.write_table(expect, fname)
-    assert os.path.exists(fname)
-    got = cudf.read_parquet(fname)
-    assert expect.equals(got.to_arrow())
-
-
-@pytest.mark.parametrize(
-    "params", [[3, 4, 32, False], [3, 4, 32, True], [50, 10, 64, True]]
-)
-def test_parquet_reader_struct_sol_table(tmpdir, params):
-    # Struct<List<List>>
-    lists_per_row = params[0]
-    list_size = params[1]
-    num_rows = params[2]
-    include_validity = params[3]
-
-    def list_gen_wrapped(x, y):
-        return list_row_gen(
-            int_gen, x * list_size * lists_per_row, list_size, lists_per_row
-        )
-
-    def string_list_gen_wrapped(x, y):
-        return list_row_gen(
-            string_gen,
-            x * list_size * lists_per_row,
-            list_size,
-            lists_per_row,
-            include_validity,
-        )
-
-    data = struct_gen(
-        [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped],
-        0,
-        num_rows,
-        include_validity,
-    )
-    expect = pa.Table.from_pydict({"sol": data})
-    fname = tmpdir.join("test_parquet_reader_struct_sol_table.parquet")
-    pa.parquet.write_table(expect, fname)
-    assert os.path.exists(fname)
-    got = cudf.read_parquet(fname)
-    assert expect.equals(got.to_arrow())
-
-
-def test_parquet_reader_v2(tmpdir, simple_pdf):
-    pdf_fname = tmpdir.join("pdfv2.parquet")
-    simple_pdf.to_parquet(pdf_fname, data_page_version="2.0")
-    assert_eq(cudf.read_parquet(pdf_fname), simple_pdf)
-
-    cudf.from_pandas(simple_pdf).to_parquet(pdf_fname, header_version="2.0")
-    assert_eq(cudf.read_parquet(pdf_fname), simple_pdf)
-
-
-def test_parquet_delta_byte_array(datadir):
-    fname = datadir / "delta_byte_arr.parquet"
-    assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname))
-
-
-# values chosen to exercise:
-#       1 - header only, no bitpacked values
-#       2 - one bitpacked value
-#      23 - one partially filled miniblock
-#      32 - almost full miniblock
-#      33 - one full miniblock
-#      34 - one full miniblock plus one value in new miniblock
-#     128 - almost full block
-#     129 - one full block
-#     130 - one full block plus one value in new block
-# 129 * 3 - multiple blocks
-def delta_num_rows():
-    return [1, 2, 23, 32, 33, 34, 128, 129, 130, 129 * 3]
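
The boundaries listed above follow from the DELTA_BINARY_PACKED layout: the first value is stored in the page header, and the remaining n - 1 deltas are bit-packed in miniblocks of 32 values, four miniblocks (128 values) to a block. Those sizes are the parquet-format defaults; the helper below is purely illustrative:

    def delta_layout(nrows, miniblock=32, block=128):
        # (full blocks, full trailing miniblocks, leftover values)
        deltas = max(nrows - 1, 0)
        return (deltas // block, (deltas % block) // miniblock, deltas % miniblock)

    assert delta_layout(1) == (0, 0, 0)    # header only
    assert delta_layout(33) == (0, 1, 0)   # one full miniblock
    assert delta_layout(34) == (0, 1, 1)   # plus one value in a new miniblock
    assert delta_layout(129) == (1, 0, 0)  # one full block
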
- "null_frequency": null_frequency, - "cardinality": nrows, - }, - ], - rows=nrows, - seed=0, - use_threads=False, - ) - # Roundabout conversion to pandas to preserve nulls/data types - cudf_table = cudf.DataFrame.from_arrow(arrow_table) - test_pdf = cudf_table.to_pandas(nullable=True) - pdf_fname = tmpdir.join("pdfv2.parquet") - test_pdf.to_parquet( - pdf_fname, - version="2.6", - column_encoding="DELTA_BINARY_PACKED", - data_page_version="2.0", - data_page_size=64 * 1024, - engine="pyarrow", - use_dictionary=False, - ) - cdf = cudf.read_parquet(pdf_fname) - pcdf = cudf.from_pandas(test_pdf) - assert_eq(cdf, pcdf) - - # Write back out with cudf and make sure pyarrow can read it - cudf_fname = tmpdir.join("cudfv2.parquet") - pcdf.to_parquet( - cudf_fname, - compression=None, - header_version="2.0", - use_dictionary=False, - ) - - cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) - assert_eq(cdf2, cdf) - - -@pytest.mark.parametrize("nrows", delta_num_rows()) -@pytest.mark.parametrize("add_nulls", [True, False]) -@pytest.mark.parametrize("max_string_length", [12, 48, 96, 128]) -@pytest.mark.parametrize( - "str_encoding", ["DELTA_BYTE_ARRAY", "DELTA_LENGTH_BYTE_ARRAY"] -) -def test_delta_byte_array_roundtrip( - nrows, add_nulls, max_string_length, str_encoding, tmpdir -): - null_frequency = 0.25 if add_nulls else 0 - - # Create a pandas dataframe with random data of mixed lengths - test_pdf = dg.rand_dataframe( - dtypes_meta=[ - { - "dtype": "str", - "null_frequency": null_frequency, - "cardinality": nrows, - "max_string_length": max_string_length, - }, - ], - rows=nrows, - seed=0, - use_threads=False, - ).to_pandas() - - pdf_fname = tmpdir.join("pdfdeltaba.parquet") - test_pdf.to_parquet( - pdf_fname, - version="2.6", - column_encoding=str_encoding, - data_page_version="2.0", - data_page_size=64 * 1024, - engine="pyarrow", - use_dictionary=False, - ) - cdf = cudf.read_parquet(pdf_fname) - pcdf = cudf.from_pandas(test_pdf) - assert_eq(cdf, pcdf) - - # Write back out with cudf and make sure pyarrow can read it - cudf_fname = tmpdir.join("cdfdeltaba.parquet") - pcdf.to_parquet( - cudf_fname, - compression="snappy", - header_version="2.0", - use_dictionary=False, - ) - cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) - assert_eq(cdf2, cdf) - - -@pytest.mark.parametrize("nrows", delta_num_rows()) -@pytest.mark.parametrize("add_nulls", [True, False]) -@pytest.mark.parametrize( - "str_encoding", ["DELTA_BYTE_ARRAY", "DELTA_LENGTH_BYTE_ARRAY"] -) -def test_delta_struct_list(tmpdir, nrows, add_nulls, str_encoding): - # Struct> - lists_per_row = 3 - list_size = 4 - num_rows = nrows - include_validity = add_nulls - - def list_gen_wrapped(x, y): - return list_row_gen( - int_gen, x * list_size * lists_per_row, list_size, lists_per_row - ) - - def string_list_gen_wrapped(x, y): - return list_row_gen( - string_gen, - x * list_size * lists_per_row, - list_size, - lists_per_row, - include_validity, - ) - - data = struct_gen( - [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped], - 0, - num_rows, - include_validity, - ) - test_pdf = pa.Table.from_pydict({"sol": data}).to_pandas() - pdf_fname = tmpdir.join("pdfdeltaba.parquet") - test_pdf.to_parquet( - pdf_fname, - version="2.6", - column_encoding={ - "sol.col0": "DELTA_BINARY_PACKED", - "sol.col1": str_encoding, - "sol.col2.list.element.list.element": "DELTA_BINARY_PACKED", - "sol.col3.list.element.list.element": str_encoding, - }, - data_page_version="2.0", - data_page_size=64 * 1024, - engine="pyarrow", - use_dictionary=False, - ) - # 
-    # sanity check to verify file is written properly
-    assert_eq(test_pdf, pd.read_parquet(pdf_fname))
-    cdf = cudf.read_parquet(pdf_fname)
-    pcdf = cudf.from_pandas(test_pdf)
-    assert_eq(cdf, pcdf)
-
-    # Write back out with cudf and make sure pyarrow can read it
-    cudf_fname = tmpdir.join("cdfdeltaba.parquet")
-    pcdf.to_parquet(
-        cudf_fname,
-        compression="snappy",
-        header_version="2.0",
-        use_dictionary=False,
-    )
-    cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
-    assert_eq(cdf2, cdf)
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        # Structs
-        {
-            "being": [
-                None,
-                {"human?": True, "Deets": {"Name": "Carrot", "Age": 27}},
-                {"human?": None, "Deets": {"Name": "Angua", "Age": 25}},
-                {"human?": False, "Deets": {"Name": "Cheery", "Age": 31}},
-                {"human?": False, "Deets": None},
-                {"human?": None, "Deets": {"Name": "Mr", "Age": None}},
-            ]
-        },
-        # List of Structs
-        {
-            "family": [
-                [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}],
-                [
-                    {"human?": None, "deets": {"weight": 5.3, "age": 25}},
-                    {"human?": False, "deets": {"weight": 8.0, "age": 31}},
-                    {"human?": False, "deets": None},
-                ],
-                [],
-                [{"human?": None, "deets": {"weight": 6.9, "age": None}}],
-            ]
-        },
-        # Struct of Lists
-        {
-            "Real estate records": [
-                None,
-                {
-                    "Status": "NRI",
-                    "Ownerships": {
-                        "land_unit": [None, 2, None],
-                        "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]],
-                    },
-                },
-                {
-                    "Status": None,
-                    "Ownerships": {
-                        "land_unit": [4, 5],
-                        "flats": [[7, 8], []],
-                    },
-                },
-                {
-                    "Status": "RI",
-                    "Ownerships": {"land_unit": None, "flats": [[]]},
-                },
-                {"Status": "RI", "Ownerships": None},
-                {
-                    "Status": None,
-                    "Ownerships": {
-                        "land_unit": [7, 8, 9],
-                        "flats": [[], [], []],
-                    },
-                },
-            ]
-        },
-    ],
-)
-def test_parquet_reader_nested_v2(tmpdir, data):
-    expect = pd.DataFrame(data)
-    pdf_fname = tmpdir.join("pdfv2.parquet")
-    expect.to_parquet(pdf_fname, data_page_version="2.0")
-    assert_eq(cudf.read_parquet(pdf_fname), expect)
-
-
-@pytest.mark.filterwarnings("ignore:Using CPU")
-def test_parquet_writer_cpu_pyarrow(
-    tmpdir, pdf_day_timestamps, gdf_day_timestamps
-):
-    pdf_fname = tmpdir.join("pdf.parquet")
-    gdf_fname = tmpdir.join("gdf.parquet")
-
-    if len(pdf_day_timestamps) == 0:
-        pdf_day_timestamps = pdf_day_timestamps.reset_index(drop=True)
-        gdf_day_timestamps = pdf_day_timestamps.reset_index(drop=True)
-
-    pdf_day_timestamps.to_parquet(pdf_fname.strpath)
-    gdf_day_timestamps.to_parquet(gdf_fname.strpath, engine="pyarrow")
-
-    assert os.path.exists(pdf_fname)
-    assert os.path.exists(gdf_fname)
-
-    expect = pa.parquet.read_pandas(pdf_fname)
-    got = pa.parquet.read_pandas(gdf_fname)
-
-    assert_eq(expect, got)
-
-    def clone_field(table, name, datatype):
-        f = table.schema.field(name)
-        return pa.field(f.name, datatype, f.nullable, f.metadata)
-
-    # Pandas uses a datetime64[ns] while we use a datetime64[ms]
-    for t in [expect, got]:
-        for t_col in ["col_datetime64[ms]", "col_datetime64[us]"]:
-            idx = t.schema.get_field_index(t_col)
-            field = clone_field(t, t_col, pa.timestamp("ms"))
-            t = t.set_column(idx, field, t.column(idx).cast(field.type))
-            t = t.replace_schema_metadata()
-
-    assert_eq(expect, got)
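
INT96, used via `int96_timestamps=True` in the test below, is the legacy 12-byte timestamp encoding that Spark and Impala historically wrote: 8 bytes of nanoseconds-within-day plus a 4-byte Julian day. A sketch (the path is illustrative) of confirming the physical type with pyarrow:

    import cudf
    import pyarrow.parquet as pq

    gdf = cudf.DataFrame({"t": cudf.Series([0, 1], dtype="datetime64[ms]")})
    gdf.to_parquet("int96.parquet", int96_timestamps=True)

    meta = pq.ParquetFile("int96.parquet").metadata
    assert meta.row_group(0).column(0).physical_type == "INT96"
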
-
-
-@pytest.mark.filterwarnings("ignore:Using CPU")
-def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf):
-    gdf_fname = tmpdir.join("gdf.parquet")
-
-    if len(pdf) == 0:
-        pdf = pdf.reset_index(drop=True)
-        gdf = gdf.reset_index(drop=True)
-
-    if "col_category" in pdf.columns:
-        pdf = pdf.drop(columns=["col_category"])
-    if "col_category" in gdf.columns:
-        gdf = gdf.drop(columns=["col_category"])
-
-    assert_eq(pdf, gdf)
-
-    # Write out the gdf using the GPU accelerated writer with INT96 timestamps
-    gdf.to_parquet(
-        gdf_fname.strpath,
-        index=None,
-        int96_timestamps=True,
-    )
-
-    assert os.path.exists(gdf_fname)
-
-    expect = pdf
-    got = pd.read_parquet(gdf_fname)
-
-    # verify INT96 timestamps were converted back to the same data.
-    assert_eq(expect, got, check_categorical=False, check_dtype=False)
-
-
-def test_multifile_parquet_folder(tmpdir):
-    test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64")
-    test_pdf2 = make_pdf(nrows=20, dtype="float64")
-    expect = pd.concat([test_pdf1, test_pdf2])
-
-    tmpdir.mkdir("multi_part")
-
-    create_parquet_source(
-        test_pdf1, "filepath", tmpdir.join("multi_part/multi1.parquet")
-    )
-    create_parquet_source(
-        test_pdf2, "filepath", tmpdir.join("multi_part/multi2.parquet")
-    )
-
-    got1 = cudf.read_parquet(tmpdir.join("multi_part/*.parquet"))
-    assert_eq(expect, got1)
-
-    got2 = cudf.read_parquet(tmpdir.join("multi_part"))
-    assert_eq(expect, got2)
-
-
-# Validates the metadata return path of the parquet writer
-def test_parquet_writer_return_metadata(tmpdir, simple_gdf):
-    gdf_fname = tmpdir.join("data1.parquet")
-
-    # Write out the gdf using the GPU accelerated writer
-    df_metadata = simple_gdf.to_parquet(
-        gdf_fname.strpath, index=None, metadata_file_path="test/data1.parquet"
-    )
-    # Verify that we got a valid parquet signature in the initial metadata blob
-    assert df_metadata.tobytes()[0:4] == b"PAR1"
-
-    df_metadata_list1 = [df_metadata]
-    df_metadata_list2 = [df_metadata, df_metadata]
-    merged_metadata1 = merge_parquet_filemetadata(df_metadata_list1)
-    merged_metadata2 = merge_parquet_filemetadata(df_metadata_list2)
-
-    # Verify that we got a valid parquet signature in the final metadata blob
-    assert merged_metadata1.tobytes()[0:4] == b"PAR1"
-    assert merged_metadata2.tobytes()[0:4] == b"PAR1"
-
-    # Make sure aggregation is combining metadata correctly
-    fmd1 = pa.parquet.ParquetFile(BytesIO(merged_metadata1.tobytes())).metadata
-    fmd2 = pa.parquet.ParquetFile(BytesIO(merged_metadata2.tobytes())).metadata
-    assert fmd2.num_columns == fmd1.num_columns
-    assert fmd2.num_rows == 2 * fmd1.num_rows
-    assert fmd2.num_row_groups == 2 * fmd1.num_row_groups
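
The metadata return path exercised above is what makes a Dask-style `_metadata` sidecar possible: each part file's `to_parquet` call returns its footer blob, and the merged blob describes the whole dataset. A sketch under those assumptions (the import path, part file names, and `_metadata` naming convention are assumptions of this sketch, not something this module prescribes):

    import cudf
    from cudf.io.parquet import merge_parquet_filemetadata

    parts = [cudf.DataFrame({"a": [1, 2]}), cudf.DataFrame({"a": [3, 4]})]
    md = [
        part.to_parquet(f"part.{i}.parquet", metadata_file_path=f"part.{i}.parquet")
        for i, part in enumerate(parts)
    ]
    with open("_metadata", "wb") as f:
        f.write(merge_parquet_filemetadata(md).tobytes())
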
-
-
-# Validates the integrity of the GPU accelerated parquet writer.
-def test_parquet_writer_gpu_none_index(tmpdir, simple_pdf, simple_gdf):
-    gdf_fname = tmpdir.join("gdf.parquet")
-    pdf_fname = tmpdir.join("pdf.parquet")
-
-    assert_eq(simple_pdf, simple_gdf)
-
-    # Write out the gdf using the GPU accelerated writer
-    simple_gdf.to_parquet(gdf_fname.strpath, index=None)
-    simple_pdf.to_parquet(pdf_fname.strpath, index=None)
-
-    assert os.path.exists(gdf_fname)
-    assert os.path.exists(pdf_fname)
-
-    expect = pd.read_parquet(pdf_fname)
-    got = pd.read_parquet(gdf_fname)
-
-    assert_eq(expect, got, check_categorical=False)
-
-
-def test_parquet_writer_gpu_true_index(tmpdir, simple_pdf, simple_gdf):
-    gdf_fname = tmpdir.join("gdf.parquet")
-    pdf_fname = tmpdir.join("pdf.parquet")
-
-    assert_eq(simple_pdf, simple_gdf)
-
-    # Write out the gdf using the GPU accelerated writer
-    simple_gdf.to_parquet(gdf_fname.strpath, index=True)
-    simple_pdf.to_parquet(pdf_fname.strpath, index=True)
-
-    assert os.path.exists(gdf_fname)
-    assert os.path.exists(pdf_fname)
-
-    expect = pd.read_parquet(pdf_fname)
-    got = pd.read_parquet(gdf_fname)
-
-    assert_eq(expect, got, check_categorical=False)
-
-
-def test_parquet_writer_gpu_false_index(tmpdir, simple_pdf, simple_gdf):
-    gdf_fname = tmpdir.join("gdf.parquet")
-    pdf_fname = tmpdir.join("pdf.parquet")
-
-    assert_eq(simple_pdf, simple_gdf)
-
-    # Write out the gdf using the GPU accelerated writer
-    simple_gdf.to_parquet(gdf_fname.strpath, index=False)
-    simple_pdf.to_parquet(pdf_fname.strpath, index=False)
-
-    assert os.path.exists(gdf_fname)
-    assert os.path.exists(pdf_fname)
-
-    expect = pd.read_parquet(pdf_fname)
-    got = pd.read_parquet(gdf_fname)
-
-    assert_eq(expect, got, check_categorical=False)
-
-
-def test_parquet_writer_gpu_multi_index(tmpdir, simple_pdf, simple_gdf):
-    gdf_fname = tmpdir.join("gdf.parquet")
-    pdf_fname = tmpdir.join("pdf.parquet")
-
-    simple_pdf = simple_pdf.set_index(["col_bool", "col_int8"])
-    simple_gdf = simple_gdf.set_index(["col_bool", "col_int8"])
-
-    assert_eq(simple_pdf, simple_gdf)
-
-    # Write out the gdf using the GPU accelerated writer
-    simple_gdf.to_parquet(gdf_fname.strpath, index=None)
-    simple_pdf.to_parquet(pdf_fname.strpath, index=None)
-
-    assert os.path.exists(gdf_fname)
-    assert os.path.exists(pdf_fname)
-
-    expect = pd.read_parquet(pdf_fname)
-    got = pd.read_parquet(gdf_fname)
-
-    assert_eq(expect, got, check_categorical=False)
-
-
-def test_parquet_writer_gpu_chunked(tmpdir, simple_pdf, simple_gdf):
-    gdf_fname = tmpdir.join("gdf.parquet")
-
-    writer = ParquetWriter(gdf_fname)
-    writer.write_table(simple_gdf)
-    writer.write_table(simple_gdf)
-    writer.close()
-
-    assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf]))
-
-
-def test_parquet_writer_gpu_chunked_context(tmpdir, simple_pdf, simple_gdf):
-    gdf_fname = tmpdir.join("gdf.parquet")
-
-    with ParquetWriter(gdf_fname) as writer:
-        writer.write_table(simple_gdf)
-        writer.write_table(simple_gdf)
-
-    got = pd.read_parquet(gdf_fname)
-    expect = pd.concat([simple_pdf, simple_pdf])
-    assert_eq(got, expect)
-
-
-def test_parquet_write_bytes_io(simple_gdf):
-    output = BytesIO()
-    simple_gdf.to_parquet(output)
-    assert_eq(cudf.read_parquet(output), simple_gdf)
-
-
-@pytest.mark.parametrize("store_schema", [True, False])
-def test_parquet_writer_bytes_io(simple_gdf, store_schema):
-    output = BytesIO()
-
-    writer = ParquetWriter(output, store_schema=store_schema)
-    writer.write_table(simple_gdf)
-    writer.write_table(simple_gdf)
-    writer.close()
-
-    assert_eq(cudf.read_parquet(output), cudf.concat([simple_gdf, simple_gdf]))
-
-
-@pytest.mark.parametrize(
-    "row_group_size_kwargs",
-    [
-        {"row_group_size_bytes": 4 * 1024},
-        {"row_group_size_rows": 5000},
-    ],
-)
-def test_parquet_writer_row_group_size(tmpdir, row_group_size_kwargs):
-    # Check that row_group_size options are exposed in Python
-    # See https://github.com/rapidsai/cudf/issues/10978
-
-    size = 20000
-    gdf = cudf.DataFrame({"a": range(size), "b": [1] * size})
-
-    fname = tmpdir.join("gdf.parquet")
-    with ParquetWriter(fname, **row_group_size_kwargs) as writer:
-        writer.write_table(gdf)
-
-    # Simple check for multiple row-groups
-    nrows, nrow_groups, columns, _, _ = cudf.io.parquet.read_parquet_metadata(
-        fname
-    )
-    assert nrows == size
-    assert nrow_groups > 1
-    assert columns == ["a", "b"]
-
-    # Know the specific row-group count for row_group_size_rows
-    if "row_group_size_rows" in row_group_size_kwargs:
-        assert (
-            nrow_groups == size // row_group_size_kwargs["row_group_size_rows"]
-        )
-
-    assert_eq(cudf.read_parquet(fname), gdf)
-
-
-def test_parquet_writer_column_index(tmpdir):
-    # Simple test for presence of indices. Validity is checked
-    # in libcudf tests.
-    # Write 2 files, one with the column index set, one without.
-    # Make sure the former is larger in size.
-
-    size = 20000
-    gdf = cudf.DataFrame({"a": range(size), "b": [1] * size})
-
-    fname = tmpdir.join("gdf.parquet")
-    with ParquetWriter(fname, statistics="ROWGROUP") as writer:
-        writer.write_table(gdf)
-    s1 = os.path.getsize(fname)
-
-    fname = tmpdir.join("gdfi.parquet")
-    with ParquetWriter(fname, statistics="COLUMN") as writer:
-        writer.write_table(gdf)
-    s2 = os.path.getsize(fname)
-    assert s2 > s1
-
-
-@pytest.mark.parametrize(
-    "max_page_size_kwargs",
-    [
-        {"max_page_size_bytes": 4 * 1024},
-        {"max_page_size_rows": 5000},
-    ],
-)
-def test_parquet_writer_max_page_size(tmpdir, max_page_size_kwargs):
-    # Check that max_page_size options are exposed in Python.
-    # Since we don't have access to page metadata, instead check that
-    # a file written with more pages will be slightly larger.
-
-    size = 20000
-    gdf = cudf.DataFrame({"a": range(size), "b": [1] * size})
-
-    fname = tmpdir.join("gdf.parquet")
-    with ParquetWriter(fname, **max_page_size_kwargs) as writer:
-        writer.write_table(gdf)
-    s1 = os.path.getsize(fname)
-
-    assert_eq(cudf.read_parquet(fname), gdf)
-
-    fname = tmpdir.join("gdf0.parquet")
-    with ParquetWriter(fname) as writer:
-        writer.write_table(gdf)
-    s2 = os.path.getsize(fname)
-
-    assert_eq(cudf.read_parquet(fname), gdf)
-    assert s1 > s2
-
-
-@pytest.mark.parametrize("use_dict", [False, True])
-@pytest.mark.parametrize("max_dict_size", [0, 1048576])
-def test_parquet_writer_dictionary_setting(use_dict, max_dict_size):
-    # Simple test for checking the validity of the dictionary encoding setting
-    # and behavior of ParquetWriter in cudf.
-    # Write a table with repetitive data with varying dictionary settings.
-    # Make sure the written columns are dictionary-encoded accordingly.
-
-    # Table with repetitive data
-    table = cudf.DataFrame(
-        {
-            "int32": cudf.Series([1024] * 1024, dtype="int64"),
-        }
-    )
-
-    # Write to Parquet using ParquetWriter
-    buffer = BytesIO()
-    writer = ParquetWriter(
-        buffer,
-        use_dictionary=use_dict,
-        max_dictionary_size=max_dict_size,
-    )
-    writer.write_table(table)
-    writer.close()
-
-    # Read encodings from parquet file
-    got = pq.ParquetFile(buffer)
-    encodings = got.metadata.row_group(0).column(0).encodings
-
-    # Check for `PLAIN_DICTIONARY` encoding if dictionary encoding is enabled
-    # and the dictionary page limit > 0
-    if use_dict is True and max_dict_size > 0:
-        assert "PLAIN_DICTIONARY" in encodings
-    else:
-        assert "PLAIN_DICTIONARY" not in encodings
-
-
-@pytest.mark.parametrize("filename", ["myfile.parquet", None])
-@pytest.mark.parametrize("cols", [["b"], ["c", "b"]])
-def test_parquet_partitioned(tmpdir_factory, cols, filename):
-    rng = np.random.default_rng(seed=0)
-    # Checks that write_to_dataset is wrapping to_parquet
-    # as expected
-    gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
-    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
-    size = 100
-    pdf = pd.DataFrame(
-        {
-            "a": np.arange(0, stop=size, dtype="int64"),
-            "b": rng.choice(list("abcd"), size=size),
-            "c": rng.choice(np.arange(4), size=size),
-        }
-    )
-    pdf.to_parquet(pdf_dir, index=False, partition_cols=cols)
-    gdf = cudf.from_pandas(pdf)
-    gdf.to_parquet(
-        gdf_dir, index=False, partition_cols=cols, partition_file_name=filename
-    )
-
-    # Read back with pandas to compare
-    expect_pd = pd.read_parquet(pdf_dir)
-    got_pd = pd.read_parquet(gdf_dir)
-    assert_eq(expect_pd, got_pd)
-
-    # Check that cudf and pd return the same read
-    got_cudf = cudf.read_parquet(gdf_dir)
-    if isinstance(got_pd["c"].dtype, pd.CategoricalDtype):
-        # Work-around for pandas bug:
-        # https://github.com/pandas-dev/pandas/issues/53345
-        got_pd["c"] = got_pd["c"].astype(
-            pd.CategoricalDtype(
-                categories=got_pd["c"].dtype.categories.astype("int64"),
-                ordered=got_pd["c"].dtype.ordered,
-            )
-        )
-    assert_eq(got_pd, got_cudf)
-
-    # If filename is specified, check that it is correct
-    if filename:
-        for _, _, files in os.walk(gdf_dir):
-            for fn in files:
-                assert fn == filename
-
-
-@pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}])
-def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs):
-    rng = np.random.default_rng(seed=0)
-    # Checks that write_to_dataset is wrapping to_parquet
-    # as expected
-    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
-    size = 100
-    pdf = pd.DataFrame(
-        {
-            "a": np.arange(0, stop=size, dtype="int64"),
-            "b": rng.choice(list("abcd"), size=size),
-            "c": rng.choice(np.arange(4), size=size),
-        }
-    )
-    pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"])
-
-    with pytest.raises(NotImplementedError):
-        cudf.read_parquet(pdf_dir, **kwargs)
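 
The partitioned tests above and below read back hive-style datasets: `partition_cols` encodes each partition column as a `name=value` directory level rather than as data in the files. A purely illustrative sketch of the resulting layout (directory and values invented for the example):

    import os

    import cudf

    gdf = cudf.DataFrame({"a": [1, 2], "b": ["x", "y"], "c": [0, 1]})
    os.makedirs("dataset", exist_ok=True)
    gdf.to_parquet("dataset", partition_cols=["c", "b"])

    # Resulting layout (file names are generated):
    #   dataset/c=0/b=x/<part>.parquet
    #   dataset/c=1/b=y/<part>.parquet
    # Readers recover "b" and "c" from the directory names.
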
-
-
-@pytest.mark.parametrize("return_meta", [True, False])
-def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta):
-    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
-    gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
-
-    df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]})
-    df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]})
-
-    cw = ParquetDatasetWriter(gdf_dir, partition_cols=["a"], index=False)
-    cw.write_table(df1)
-    cw.write_table(df2)
-    meta_byte_array = cw.close(return_metadata=return_meta)
-    pdf = cudf.concat([df1, df2]).to_pandas()
-    pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"])
-
-    if return_meta:
-        fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata
-        assert fmd.num_rows == len(pdf)
-        assert fmd.num_row_groups == 4
-        files = {
-            os.path.join(directory, files[0])
-            for directory, _, files in os.walk(gdf_dir)
-            if files
-        }
-        meta_files = {
-            os.path.join(gdf_dir, fmd.row_group(i).column(c).file_path)
-            for i in range(fmd.num_row_groups)
-            for c in range(fmd.row_group(i).num_columns)
-        }
-        assert files == meta_files
-
-    # Read back with pandas to compare
-    expect_pd = pd.read_parquet(pdf_dir)
-    got_pd = pd.read_parquet(gdf_dir)
-    assert_eq(expect_pd, got_pd)
-
-    # Check that cudf and pd return the same read
-    got_cudf = cudf.read_parquet(gdf_dir)
-
-    # Work-around for pandas bug:
-    # https://github.com/pandas-dev/pandas/issues/53345
-    got_pd["a"] = got_pd["a"].astype(
-        pd.CategoricalDtype(
-            categories=got_pd["a"].dtype.categories.astype("int64"),
-            ordered=got_pd["a"].dtype.ordered,
-        )
-    )
-    assert_eq(got_pd, got_cudf)
-
-
-@pytest.mark.parametrize(
-    "max_file_size,max_file_size_in_bytes",
-    [("500KB", 500000), ("MB", 1000000)],
-)
-def test_parquet_writer_chunked_max_file_size(
-    tmpdir_factory, max_file_size, max_file_size_in_bytes
-):
-    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
-    gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
-
-    df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1] * 1000, "b": range(0, 5000)})
-    df2 = cudf.DataFrame(
-        {"a": [1, 3, 3, 1, 3] * 1000, "b": range(5000, 10000)}
-    )
-
-    cw = ParquetDatasetWriter(
-        gdf_dir,
-        partition_cols=["a"],
-        max_file_size=max_file_size,
-        file_name_prefix="sample",
-    )
-    cw.write_table(df1)
-    cw.write_table(df2)
-    cw.close()
-    pdf = cudf.concat([df1, df2]).to_pandas()
-    pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"])
-
-    expect_pd = pd.read_parquet(pdf_dir)
-    got_pd = pd.read_parquet(gdf_dir)
-
-    assert_eq(
-        expect_pd.sort_values(["b"]).reset_index(drop=True),
-        got_pd.sort_values(["b"]).reset_index(drop=True),
-    )
-
-    # Check that cudf and pd return the same read
-    got_cudf = cudf.read_parquet(gdf_dir)
-
-    # Work-around for pandas bug:
-    # https://github.com/pandas-dev/pandas/issues/53345
-    got_pd["a"] = got_pd["a"].astype(
-        pd.CategoricalDtype(
-            categories=got_pd["a"].dtype.categories.astype("int64"),
-            ordered=got_pd["a"].dtype.ordered,
-        )
-    )
-    assert_eq(
-        got_pd.sort_values(["b"]).reset_index(drop=True),
-        got_cudf.sort_values(["b"]).reset_index(drop=True),
-    )
-
-    all_files = glob.glob(gdf_dir + "/**/*.parquet", recursive=True)
-    for each_file in all_files:
-        # Validate that no file exceeds the requested maximum size
-        assert os.path.getsize(each_file) <= (max_file_size_in_bytes), (
-            "File exceeded max_file_size"
-        )
-
-
-def test_parquet_writer_chunked_max_file_size_error():
-    with pytest.raises(
-        ValueError,
-        match="file_name_prefix cannot be None if max_file_size is passed",
-    ):
-        ParquetDatasetWriter("sample", partition_cols=["a"], max_file_size=100)
-
-
-def test_parquet_writer_chunked_partitioned_context(tmpdir_factory):
-    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
-    gdf_dir = str(tmpdir_factory.mktemp("gdf_dir"))
-
-    df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]})
-    df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]})
-
-    with ParquetDatasetWriter(
-        gdf_dir, partition_cols=["a"], index=False
-    ) as cw:
-        cw.write_table(df1)
-        cw.write_table(df2)
-
-    pdf = cudf.concat([df1, df2]).to_pandas()
-    pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"])
-
-    # Read back with pandas to compare
-    expect_pd = pd.read_parquet(pdf_dir)
-    got_pd = pd.read_parquet(gdf_dir)
-    assert_eq(expect_pd, got_pd)
-
-    # Check that cudf and pd return the same read
-    got_cudf = cudf.read_parquet(gdf_dir)
-
-    # Work-around for pandas bug:
-    # https://github.com/pandas-dev/pandas/issues/53345
-    got_pd["a"] = got_pd["a"].astype(
-        pd.CategoricalDtype(
-            categories=got_pd["a"].dtype.categories.astype("int64"),
-            ordered=got_pd["a"].dtype.ordered,
-        )
-    )
-    assert_eq(got_pd, got_cudf)
-
-
-@pytest.mark.parametrize("cols", [None, ["b"]])
-@pytest.mark.parametrize("store_schema", [True, False])
-def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema):
-    rng = np.random.default_rng(seed=0)
-    dir1 = tmpdir_factory.mktemp("dir1")
-    dir2 = tmpdir_factory.mktemp("dir2")
-    if cols is None:
-        dir1 = dir1.join("file.pq")
-        dir2 = dir2.join("file.pq")
-    dir1 = str(dir1)
-    dir2 = str(dir2)
-
-    size = 100
-    gdf = cudf.DataFrame(
-        {
-            "a": np.arange(0, stop=size),
-            "b": rng.choice(np.arange(4), size=size),
-        }
-    )
-    gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema)
-    cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols)
-
-    # Read back with cudf
-    expect = cudf.read_parquet(dir1)
-    got = cudf.read_parquet(dir2)
-    assert_eq(expect, got)
-
-    gdf = cudf.DataFrame(
-        {
-            "a": cudf.Series([1, 2, 3]),
-            "b": cudf.Series([1, 2, 3]),
-            "c": cudf.Series(["a", "b", "c"], dtype="category"),
-        }
-    )
-    with pytest.raises(ValueError):
-        gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema)
-
-
-@pytest.mark.parametrize(
-    "pfilters",
-    [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]],
-)
-@pytest.mark.parametrize("selection", ["directory", "files", "row-groups"])
-@pytest.mark.parametrize("use_cat", [True, False])
-def test_read_parquet_partitioned_filtered(
-    tmpdir, pfilters, selection, use_cat
-):
-    rng = np.random.default_rng(2)
-    path = str(tmpdir)
-    size = 100
-    df = cudf.DataFrame(
-        {
-            "a": np.arange(0, stop=size, dtype="int64"),
-            "b": rng.choice(list("abcd"), size=size),
-            "c": rng.choice(np.arange(4), size=size),
-        }
-    )
-    df.to_parquet(path, partition_cols=["c", "b"])
-
-    if selection == "files":
-        # Pass in a list of paths
-        fs = get_fs_token_paths(path)[0]
-        read_path = fs.find(path)
-        row_groups = None
-    elif selection == "row-groups":
-        # Pass in a list of paths AND row-group ids
-        fs = get_fs_token_paths(path)[0]
-        read_path = fs.find(path)
-        row_groups = [[0] for p in read_path]
-    else:
-        # Pass in a directory path
-        # (row-group selection not allowed in this case)
-        read_path = path
-        row_groups = None
-
-    # Filter on partitioned columns
-    expect = pd.read_parquet(read_path, filters=pfilters)
-    got = cudf.read_parquet(
-        read_path,
-        filters=pfilters,
-        row_groups=row_groups,
-        categorical_partitions=use_cat,
-    )
-    expect["b"] = expect["b"].astype(str)
-    expect["c"] = expect["c"].astype(int)
-    if use_cat:
-        assert got.dtypes["b"] == "category"
-        assert got.dtypes["c"] == "category"
-        got["b"] = got["b"].astype(str)
-        got["c"] = got["c"].astype(int)
-    else:
-        # Check that we didn't get categorical
-        # columns, but convert back to categorical
-        # for comparison with pandas
-        assert got.dtypes["b"] == "object"
-        assert got.dtypes["c"] == "int"
-    assert_eq(expect, got)
-
-
-@pytest.mark.parametrize(
-    "filters", [[("a", "==", 10)], [[("a", "==", 10)], [("c", "==", 1)]]]
-)
-def test_read_parquet_partitioned_filtered_other(tmpdir, filters):
-    rng = np.random.default_rng(2)
-    path = str(tmpdir)
-    size = 10
-    df = cudf.DataFrame(
-        {
-            "a": np.arange(0, stop=size, dtype="int64"),
-            "b": rng.choice(list("abcd"), size=size),
rng.choice(list("abcd"), size=size), - "c": rng.choice(np.arange(4), size=size), - } - ) - df.to_parquet(path, partition_cols=["c", "b"]) - got = cudf.read_parquet(path, filters=filters) - expect = pd.read_parquet(path, filters=filters) - - # Work-around for pandas bug: - # https://github.com/pandas-dev/pandas/issues/53345 - expect["c"] = expect["c"].astype( - pd.CategoricalDtype( - categories=expect["c"].dtype.categories.astype("int64"), - ordered=expect["c"].dtype.ordered, - ) - ) - assert_eq(expect, got) - - -def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf): - gdf_fname = tmpdir.join("gdf.parquet") - test_path = "test/path" - - writer = ParquetWriter(gdf_fname) - writer.write_table(simple_gdf) - writer.write_table(simple_gdf) - meta_byte_array = writer.close(metadata_file_path=test_path) - fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata - - assert fmd.num_rows == 2 * len(simple_gdf) - assert fmd.num_row_groups == 2 - - for r in range(fmd.num_row_groups): - for c in range(fmd.num_columns): - assert fmd.row_group(r).column(c).file_path == test_path - - -def test_write_read_cudf(tmpdir, pdf): - file_path = tmpdir.join("cudf.parquet") - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category"]) - - gdf = cudf.from_pandas(pdf) - gdf.to_parquet(file_path) - gdf = cudf.read_parquet(file_path) - - assert_eq(gdf, pdf, check_index_type=not pdf.empty) - - -def test_write_cudf_read_pandas_pyarrow(tmpdir, pdf): - cudf_path = tmpdir.join("cudf.parquet") - pandas_path = tmpdir.join("pandas.parquet") - - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category"]) - - df = cudf.from_pandas(pdf) - - df.to_parquet(cudf_path) - pdf.to_parquet(pandas_path) - - cudf_res = pd.read_parquet(cudf_path) - pd_res = pd.read_parquet(pandas_path) - - assert_eq(pd_res, cudf_res, check_index_type=not pdf.empty) - - cudf_res = pa.parquet.read_table( - cudf_path, use_pandas_metadata=True - ).to_pandas() - pd_res = pa.parquet.read_table( - pandas_path, use_pandas_metadata=True - ).to_pandas() - - assert_eq(cudf_res, pd_res, check_index_type=not pdf.empty) - - -def test_parquet_writer_criteo(tmpdir): - # To run this test, download the day 0 of criteo dataset from - # http://labs.criteo.com/2013/12/download-terabyte-click-logs/ - # and place the uncompressed dataset in the home directory - fname = os.path.expanduser("~/day_0") - if not os.path.isfile(fname): - pytest.skip("Local criteo day 0 tsv file is not found") - - cudf_path = tmpdir.join("cudf.parquet") - - cont_names = ["I" + str(x) for x in range(1, 14)] - cat_names = ["C" + str(x) for x in range(1, 27)] - cols = ["label", *cont_names, *cat_names] - - df = cudf.read_csv(fname, sep="\t", names=cols, byte_range=(0, 1000000000)) - df = df.drop(columns=cont_names) - - df.to_parquet(cudf_path) - - -def test_trailing_nans(datadir, tmpdir): - fname = "trailing_nans.parquet" - file_path = datadir / fname - cu_df = cudf.read_parquet(file_path) - - tmp_file_path = tmpdir.join(fname) - cu_df.to_parquet(tmp_file_path) - - pd.read_parquet(tmp_file_path) - - -def test_parquet_writer_sliced(tmpdir): - cudf_path = tmpdir.join("cudf.parquet") - - df = pd.DataFrame() - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df = cudf.from_pandas(df) - - df_select = df.iloc[1:3] - - df_select.to_parquet(cudf_path) - assert_eq(cudf.read_parquet(cudf_path), df_select) - - -def test_parquet_writer_list_basic(tmpdir): - expect = pd.DataFrame({"a": [[[1, 2], [3, 4]], None, [[5, 6], None]]}) - fname = 
tmpdir.join("test_parquet_writer_list_basic.parquet") - - gdf = cudf.from_pandas(expect) - - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - assert_eq(expect, got) - - -def test_parquet_writer_list_large(tmpdir): - gdf = cudf.DataFrame({"a": list_gen(int_gen, 128, 40, 25)}) - fname = tmpdir.join("test_parquet_writer_list_large.parquet") - - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - assert gdf.to_arrow().equals(pa.Table.from_pandas(got)) - - -def test_parquet_writer_list_large_mixed(tmpdir): - expect = pd.DataFrame( - { - "a": list_gen(string_gen, 64, 40, 25), - "b": list_gen(int_gen, 64, 40, 25), - "c": list_gen(int_gen, 64, 40, 25, include_validity=True), - "d": list_gen(string_gen, 64, 40, 25, include_validity=True), - } - ) - fname = tmpdir.join("test_parquet_writer_list_large_mixed.parquet") - gdf = cudf.from_pandas(expect) - - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - assert_eq(expect, got) - - -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_writer_list_chunked(tmpdir, store_schema): - if store_schema and version.parse(pa.__version__) < version.parse( - "15.0.0" - ): - pytest.skip("https://github.com/apache/arrow/pull/37792") - table1 = cudf.DataFrame( - { - "a": list_gen(string_gen, 64, 40, 25), - "b": list_gen(int_gen, 64, 40, 25), - "c": list_gen(int_gen, 64, 40, 25, include_validity=True), - "d": list_gen(string_gen, 64, 40, 25, include_validity=True), - } - ) - table2 = cudf.DataFrame( - { - "a": list_gen(string_gen, 64, 40, 25), - "b": list_gen(int_gen, 64, 40, 25), - "c": list_gen(int_gen, 64, 40, 25, include_validity=True), - "d": list_gen(string_gen, 64, 40, 25, include_validity=True), - } - ) - fname = tmpdir.join("test_parquet_writer_list_chunked.parquet") - expect = cudf.concat([table1, table2]) - expect = expect.reset_index(drop=True) - - with ParquetWriter(fname, store_schema=store_schema) as writer: - writer.write_table(table1) - writer.write_table(table2) - - assert os.path.exists(fname) - got = pq.read_table(fname) - # compare with pyarrow since pandas doesn't - # have a list or struct dtype - assert expect.to_arrow().equals(got) - - -@pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) -def test_parquet_nullable_boolean(tmpdir, engine): - pandas_path = tmpdir.join("pandas_bools.parquet") - - pdf = pd.DataFrame( - { - "a": pd.Series( - [True, False, None, True, False], dtype=pd.BooleanDtype() - ) - } - ) - expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]}) - - pdf.to_parquet(pandas_path) - with _hide_pyarrow_parquet_cpu_warnings(engine): - actual_gdf = cudf.read_parquet(pandas_path, engine=engine) - - assert_eq(actual_gdf, expected_gdf) - - -def run_parquet_index(pdf, index): - pandas_buffer = BytesIO() - cudf_buffer = BytesIO() - - gdf = cudf.from_pandas(pdf) - - pdf.to_parquet(pandas_buffer, index=index) - gdf.to_parquet(cudf_buffer, index=index) - - expected = pd.read_parquet(cudf_buffer) - actual = cudf.read_parquet(pandas_buffer) - - if expected.empty and actual.empty: - # We return RangeIndex columns compared - # to pandas' Index[object] columns - actual.columns = expected.columns - - assert_eq(expected, actual, check_index_type=True) - - expected = pd.read_parquet(pandas_buffer) - actual = cudf.read_parquet(cudf_buffer) - - if expected.empty and actual.empty: - # We return RangeIndex columns compared - # to pandas' Index[object] columns - actual.columns = expected.columns - - 
-    assert_eq(
-        expected,
-        actual,
-        check_index_type=True,
-    )
-
-
-@pytest.mark.parametrize(
-    "pdf",
-    [
-        pd.DataFrame(index=[1, 2, 3]),
-        pd.DataFrame({"a": [1, 2, 3]}, index=[0.43534, 345, 0.34534]),
-        pd.DataFrame(
-            {"b": [11, 22, 33], "c": ["a", "b", "c"]},
-            index=pd.Index(["a", "b", "c"], name="custom name"),
-        ),
-        pd.DataFrame(
-            {"a": [10, 11, 12], "b": [99, 88, 77]},
-            index=pd.RangeIndex(12, 17, 2),
-        ),
-        pd.DataFrame(
-            {"b": [99, 88, 77]},
-            index=pd.RangeIndex(22, 27, 2, name="hello index"),
-        ),
-        pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")),
-        pd.DataFrame(
-            {"a": ["a", "bb", "cc"], "b": [10, 21, 32]},
-            index=pd.MultiIndex.from_tuples([[1, 2], [10, 11], [15, 16]]),
-        ),
-        pd.DataFrame(
-            {"a": ["a", "bb", "cc"], "b": [10, 21, 32]},
-            index=pd.MultiIndex.from_tuples(
-                [[1, 2], [10, 11], [15, 16]], names=["first", "second"]
-            ),
-        ),
-    ],
-)
-@pytest.mark.parametrize("index", [None, True, False])
-def test_parquet_index(pdf, index):
-    run_parquet_index(pdf, index)
-
-
-@pytest.mark.parametrize(
-    "index",
-    [
-        pytest.param(
-            None,
-            marks=pytest.mark.xfail(
-                reason="https://github.com/apache/arrow/issues/40743"
-            ),
-        ),
-        True,
-    ],
-)
-def test_parquet_index_empty(index):
-    pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1))
-    run_parquet_index(pdf, index)
-
-
-def test_parquet_no_index_empty():
-    pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1))
-    run_parquet_index(pdf, index=False)
-
-
-@pytest.mark.parametrize("engine", ["cudf", "pyarrow"])
-def test_parquet_allnull_str(tmpdir, engine):
-    pandas_path = tmpdir.join("pandas_allnulls.parquet")
-
-    pdf = pd.DataFrame(
-        {"a": pd.Series([None, None, None, None, None], dtype="str")}
-    )
-    expected_gdf = cudf.DataFrame(
-        {"a": cudf.Series([None, None, None, None, None], dtype="str")}
-    )
-
-    pdf.to_parquet(pandas_path)
-    with _hide_pyarrow_parquet_cpu_warnings(engine):
-        actual_gdf = cudf.read_parquet(pandas_path, engine=engine)
-
-    assert_eq(actual_gdf, expected_gdf)
-
-
-def normalized_equals(value1, value2):
-    if value1 is pd.NA or value1 is pd.NaT:
-        value1 = None
-    if value2 is pd.NA or value2 is pd.NaT:
-        value2 = None
-    if isinstance(value1, np.datetime64):
-        value1 = pd.Timestamp(value1).to_pydatetime()
-    if isinstance(value2, np.datetime64):
-        value2 = pd.Timestamp(value2).to_pydatetime()
-    if isinstance(value1, pd.Timestamp):
-        value1 = value1.to_pydatetime()
-    if isinstance(value2, pd.Timestamp):
-        value2 = value2.to_pydatetime()
-    if isinstance(value1, datetime.datetime):
-        value1 = value1.replace(tzinfo=None)
-    if isinstance(value2, datetime.datetime):
-        value2 = value2.replace(tzinfo=None)
-    if isinstance(value1, pd.Timedelta):
-        unit = "ms" if value1.unit == "s" else value1.unit
-        value2 = pd.Timedelta(value2, unit=unit)
-
-    # if one is a datetime then both values are datetimes now
-    if isinstance(value1, datetime.datetime):
-        return value1 == value2
-
-    # Compare integers with floats now
-    if isinstance(value1, float) or isinstance(value2, float):
-        return math.isclose(value1, value2)
-
-    return value1 == value2
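
A few cases that follow directly from the definition of `normalized_equals` above:

    import numpy as np
    import pandas as pd

    assert normalized_equals(pd.NaT, None)     # NaT/NA normalize to None
    assert normalized_equals(                  # tz- and type-insensitive datetimes
        np.datetime64("2001-01-01"), pd.Timestamp("2001-01-01")
    )
    assert normalized_equals(1, 1.0000000001)  # int/float via math.isclose
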
add_nulls: - # Timedelta types convert NaT to None when reading from parquet into - # pandas which interferes with series.max()/min() - for t in TIMEDELTA_TYPES: - pdf["col_" + t] = pd.Series(np.arange(len(pdf.index))).astype(t) - # pyarrow can't read values with non-zero nanoseconds - pdf["col_timedelta64[ns]"] = pdf["col_timedelta64[ns]"] * 1000 - - gdf = cudf.from_pandas(pdf) - if add_nulls: - for col in gdf: - set_random_null_mask_inplace(gdf[col]) - gdf.to_parquet(file_path, index=False, store_schema=store_schema) - - # Read back from pyarrow - pq_file = pq.ParquetFile(file_path) - # verify each row group's statistics - for rg in range(0, pq_file.num_row_groups): - pd_slice = pq_file.read_row_group(rg).to_pandas() - - # statistics are per-column. So need to verify independently - for i, col in enumerate(pd_slice): - stats = pq_file.metadata.row_group(rg).column(i).statistics - - actual_min = pd_slice[col].min() - stats_min = stats.min - assert normalized_equals(actual_min, stats_min) - - actual_max = pd_slice[col].max() - stats_max = stats.max - assert normalized_equals(actual_max, stats_max) - - assert stats.null_count == pd_slice[col].isna().sum() - assert stats.num_values == pd_slice[col].count() - - -def test_parquet_writer_list_statistics(tmpdir): - df = pd.DataFrame( - { - "a": list_gen(string_gen, 64, 40, 25), - "b": list_gen(int_gen, 64, 40, 25), - "c": list_gen(int_gen, 64, 40, 25, include_validity=True), - "d": list_gen(string_gen, 64, 40, 25, include_validity=True), - } - ) - fname = tmpdir.join("test_parquet_writer_list_statistics.parquet") - gdf = cudf.from_pandas(df) - - gdf.to_parquet(fname) - assert os.path.exists(fname) - - # Read back from pyarrow - pq_file = pq.ParquetFile(fname) - # verify each row group's statistics - for rg in range(0, pq_file.num_row_groups): - pd_slice = pq_file.read_row_group(rg).to_pandas() - - # statistics are per-column. 
So need to verify independently - for i, col in enumerate(pd_slice): - stats = pq_file.metadata.row_group(rg).column(i).statistics - - actual_min = pd_slice[col].explode().explode().dropna().min() - stats_min = stats.min - assert normalized_equals(actual_min, stats_min) - - actual_max = pd_slice[col].explode().explode().dropna().max() - stats_max = stats.max - assert normalized_equals(actual_max, stats_max) - - -@pytest.mark.parametrize( - "data", - [ - # Structs - { - "being": [ - None, - {"human?": True, "Deets": {"Name": "Carrot", "Age": 27}}, - {"human?": None, "Deets": {"Name": "Angua", "Age": 25}}, - {"human?": False, "Deets": {"Name": "Cheery", "Age": 31}}, - {"human?": False, "Deets": None}, - {"human?": None, "Deets": {"Name": "Mr", "Age": None}}, - ] - }, - # List of Structs - { - "family": [ - [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}], - [ - {"human?": None, "deets": {"weight": 5.3, "age": 25}}, - {"human?": False, "deets": {"weight": 8.0, "age": 31}}, - {"human?": False, "deets": None}, - ], - [], - [{"human?": None, "deets": {"weight": 6.9, "age": None}}], - ] - }, - # Struct of Lists - { - "Real estate records": [ - None, - { - "Status": "NRI", - "Ownerships": { - "land_unit": [None, 2, None], - "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]], - }, - }, - { - "Status": None, - "Ownerships": { - "land_unit": [4, 5], - "flats": [[7, 8], []], - }, - }, - { - "Status": "RI", - "Ownerships": {"land_unit": None, "flats": [[]]}, - }, - {"Status": "RI", "Ownerships": None}, - { - "Status": None, - "Ownerships": { - "land_unit": [7, 8, 9], - "flats": [[], [], []], - }, - }, - ] - }, - ], -) -def test_parquet_writer_nested(tmpdir, data): - expect = pd.DataFrame(data) - gdf = cudf.from_pandas(expect) - - fname = tmpdir.join("test_parquet_writer_nested.parquet") - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -@pytest.mark.parametrize("data", [[1, 2, 3], [0.00, 0.01, None, 0.5]]) -def test_parquet_writer_decimal(decimal_type, data): - gdf = cudf.DataFrame({"val": data}) - - gdf["dec_val"] = gdf["val"].astype(decimal_type(7, 2)) - - buff = BytesIO() - gdf.to_parquet(buff) - - got = pd.read_parquet(buff, dtype_backend="numpy_nullable") - assert_eq(gdf["val"].to_pandas(nullable=True), got["val"]) - assert_eq(gdf["dec_val"].to_pandas(), got["dec_val"]) - - -def test_parquet_writer_column_validation(): - cudf_parquet = BytesIO() - pandas_parquet = BytesIO() - df = cudf.DataFrame({1: [1, 2, 3], "a": ["a", "b", "c"]}) - pdf = df.to_pandas() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.warns(UserWarning): - df.to_parquet(cudf_parquet) - - with pytest.warns(UserWarning): - pdf.to_parquet(pandas_parquet) - - assert_eq( - pd.read_parquet(cudf_parquet), - cudf.read_parquet(pandas_parquet), - ) - assert_eq( - cudf.read_parquet(cudf_parquet), - pd.read_parquet(pandas_parquet), - ) - - with cudf.option_context("mode.pandas_compatible", False): - with pytest.raises(ValueError): - df.to_parquet(cudf_parquet) - - -def test_parquet_writer_nulls_pandas_read(tmpdir, pdf): - if "col_bool" in pdf.columns: - pdf.drop(columns="col_bool", inplace=True) - if "col_category" in pdf.columns: - pdf.drop(columns="col_category", inplace=True) - gdf = cudf.from_pandas(pdf) - - num_rows = len(gdf) - - if num_rows > 0: - for col in gdf.columns: - gdf[col][random.randint(0, num_rows - 1)] = 
None - - fname = tmpdir.join("test_parquet_writer_nulls_pandas_read.parquet") - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - nullable = num_rows > 0 - - if nullable: - gdf = gdf.drop(columns="col_datetime64[ms]") - gdf = gdf.drop(columns="col_datetime64[us]") - got = got.drop(columns="col_datetime64[ms]") - got = got.drop(columns="col_datetime64[us]") - - assert_eq(gdf.to_pandas(nullable=nullable), got) - - -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_parquet_decimal_precision(tmpdir, decimal_type): - df = cudf.DataFrame({"val": ["3.5", "4.2"]}).astype(decimal_type(5, 2)) - assert df.val.dtype.precision == 5 - - fname = tmpdir.join("decimal_test.parquet") - df.to_parquet(fname) - df = cudf.read_parquet(fname) - assert df.val.dtype.precision == 5 - - -def test_parquet_decimal_precision_empty(tmpdir): - df = ( - cudf.DataFrame({"val": ["3.5", "4.2"]}) - .astype(cudf.Decimal64Dtype(5, 2)) - .iloc[:0] - ) - assert df.val.dtype.precision == 5 - - fname = tmpdir.join("decimal_test.parquet") - df.to_parquet(fname) - df = cudf.read_parquet(fname) - assert df.val.dtype.precision == 5 - - -def test_parquet_reader_brotli(datadir): - fname = datadir / "brotli_int16.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname).to_pandas(nullable=True) - - assert_eq(expect, got) - - -def test_parquet_reader_one_level_list(datadir): - fname = datadir / "one_level_list.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -def test_parquet_reader_binary_decimal(datadir): - fname = datadir / "binary_decimal.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname).to_pandas() - - assert_eq(expect, got) - - -def test_parquet_reader_fixed_bin(datadir): - fname = datadir / "fixed_len_byte_array.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -def test_parquet_reader_fixed_len_with_dict(tmpdir): - def flba(i): - hasher = hashlib.sha256() - hasher.update(i.to_bytes(4, "little")) - return hasher.digest() - - # use pyarrow to write table of fixed_len_byte_array - num_rows = 200 - data = pa.array([flba(i) for i in range(num_rows)], type=pa.binary(32)) - padf = pa.Table.from_arrays([data], names=["flba"]) - padf_fname = tmpdir.join("padf.parquet") - pq.write_table(padf, padf_fname, use_dictionary=True) - - expect = pd.read_parquet(padf_fname) - got = cudf.read_parquet(padf_fname) - assert_eq(expect, got) - - -def test_parquet_flba_round_trip(tmpdir): - def flba(i): - hasher = hashlib.sha256() - hasher.update(i.to_bytes(4, "little")) - return hasher.digest() - - # use pyarrow to write table of fixed_len_byte_array - num_rows = 200 - data = pa.array([flba(i) for i in range(num_rows)], type=pa.binary(32)) - padf = pa.Table.from_arrays([data], names=["flba"]) - padf_fname = tmpdir.join("padf.parquet") - pq.write_table(padf, padf_fname) - - # round trip data with cudf - cdf = cudf.read_parquet(padf_fname) - cdf_fname = tmpdir.join("cdf.parquet") - cdf.to_parquet(cdf_fname, column_type_length={"flba": 32}) - - # now read back in with pyarrow to test it was written properly by cudf - padf2 = pq.read_table(padf_fname) - padf3 = pq.read_table(cdf_fname) - assert_eq(padf2, padf3) - assert_eq(padf2.schema[0].type, padf3.schema[0].type) - - -@pytest.mark.parametrize( - "encoding", - [ - "PLAIN", - "DICTIONARY", - "DELTA_BINARY_PACKED", - 
"BYTE_STREAM_SPLIT", - "USE_DEFAULT", - ], -) -def test_per_column_encoding_option(encoding): - pdf = pd.DataFrame({"ilist": [[1, 2, 3, 1, 2, 3]], "i1": [1]}) - cdf = cudf.from_pandas(pdf) - buffer = BytesIO() - cdf.to_parquet( - buffer, - column_encoding={"ilist.list.element": encoding}, - ) - # DICTIONARY and USE_DEFAULT should both result in a PLAIN_DICTIONARY encoding in parquet - encoding_name = ( - "PLAIN_DICTIONARY" - if encoding == "DICTIONARY" or encoding == "USE_DEFAULT" - else encoding - ) - pf = pq.ParquetFile(buffer) - fmd = pf.metadata - assert encoding_name in fmd.row_group(0).column(0).encodings - - -@pytest.mark.parametrize("compression", ["SNAPPY", "ZSTD"]) -def test_per_column_compression_option(set_decomp_env_vars, compression): - pdf = pd.DataFrame( - {"ilist": [[1, 2, 3, 1, 2, 3]], "i1": [[1, 2, 3, 1, 2, 3]]} - ) - cdf = cudf.from_pandas(pdf) - buffer = BytesIO() - cdf.to_parquet( - buffer, - compression=compression, - skip_compression={"ilist.list.element"}, - use_dictionary=False, # to make sure that data is compressible - ) - - pf = pq.ParquetFile(buffer) - fmd = pf.metadata - assert fmd.row_group(0).column(0).compression == "UNCOMPRESSED" - assert fmd.row_group(0).column(1).compression == compression - - -@pytest.mark.parametrize( - "encoding", - ["DELTA_LENGTH_BYTE_ARRAY", "DELTA_BYTE_ARRAY"], -) -def test_per_column_options_string_col(tmpdir, encoding): - pdf = pd.DataFrame({"s": ["a string"], "i1": [1]}) - cdf = cudf.from_pandas(pdf) - fname = tmpdir.join("strcol.parquet") - cdf.to_parquet( - fname, - column_encoding={"s": encoding}, - compression="SNAPPY", - ) - pf = pq.ParquetFile(fname) - fmd = pf.metadata - assert encoding in fmd.row_group(0).column(0).encodings - - -@pytest.mark.skipif( - version.parse(pa.__version__) < version.parse("16.0.0"), - reason="https://github.com/apache/arrow/pull/39748", -) -@pytest.mark.parametrize( - "num_rows", - [200, 10000], -) -def test_parquet_bss_round_trip(tmpdir, num_rows): - def flba(i): - hasher = hashlib.sha256() - hasher.update(i.to_bytes(4, "little")) - return hasher.digest() - - # use pyarrow to write table of types that support BYTE_STREAM_SPLIT encoding - rows_per_rowgroup = 5000 - fixed_data = pa.array( - [flba(i) for i in range(num_rows)], type=pa.binary(32) - ) - i32_data = pa.array(list(range(num_rows)), type=pa.int32()) - i64_data = pa.array(list(range(num_rows)), type=pa.int64()) - f32_data = pa.array([float(i) for i in range(num_rows)], type=pa.float32()) - f64_data = pa.array([float(i) for i in range(num_rows)], type=pa.float64()) - padf = pa.Table.from_arrays( - [fixed_data, i32_data, i64_data, f32_data, f64_data], - names=["flba", "i32", "i64", "f32", "f64"], - ) - padf_fname = tmpdir.join("padf.parquet") - pq.write_table( - padf, - padf_fname, - column_encoding="BYTE_STREAM_SPLIT", - use_dictionary=False, - row_group_size=rows_per_rowgroup, - ) - - # round trip data with cudf - cdf = cudf.read_parquet(padf_fname) - cdf_fname = tmpdir.join("cdf.parquet") - cdf.to_parquet( - cdf_fname, - column_type_length={"flba": 32}, - column_encoding={ - "flba": "BYTE_STREAM_SPLIT", - "i32": "BYTE_STREAM_SPLIT", - "i64": "BYTE_STREAM_SPLIT", - "f32": "BYTE_STREAM_SPLIT", - "f64": "BYTE_STREAM_SPLIT", - }, - row_group_size_rows=rows_per_rowgroup, - ) - - # now read back in with pyarrow to test it was written properly by cudf - padf2 = pq.read_table(padf_fname) - padf3 = pq.read_table(cdf_fname) - assert_eq(padf2, padf3) - assert_eq(padf2.schema[0].type, padf3.schema[0].type) - - -def 
test_parquet_reader_rle_boolean(datadir): - fname = datadir / "rle_boolean_encoding.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -# testing a specific bug-fix/edge case. -# specifically: in a parquet file containing a particular way of representing -# a list column in a schema, the cudf reader was confusing -# nesting information between a list column and a subsequent -# string column, ultimately causing a crash. -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Older versions of pandas do not have DataFrame.map()", -) -def test_parquet_reader_one_level_list2(datadir): - # we are reading in a file containing binary types, but cudf returns - # those as strings. so we have to massage the pandas data to get - # them to compare correctly. - def postprocess(val): - if isinstance(val, bytes): - return val.decode() - elif isinstance(val, np.ndarray): - return np.array([v.decode() for v in val]) - else: - return val - - fname = datadir / "one_level_list2.parquet" - - expect = pd.read_parquet(fname) - expect = expect.map(postprocess) - got = cudf.read_parquet(fname) - - assert_eq(expect, got, check_dtype=False) - - -# testing a specific bug-fix/edge case. -# specifically: in a parquet file containing a particular way of representing -# a list column in a schema, the cudf reader was confusing -# nesting information and building a list of list of int instead -# of a list of int -def test_parquet_reader_one_level_list3(datadir): - fname = datadir / "one_level_list3.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got, check_dtype=True) - - -@pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000]) -@pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000]) -def test_to_parquet_row_group_size( - tmpdir, large_int64_gdf, size_bytes, size_rows -): - fname = tmpdir.join("row_group_size.parquet") - large_int64_gdf.to_parquet( - fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows - ) - - num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata( - fname - ) - # 8 bytes per row, as the column is int64 - expected_num_row_groups = max( - math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes) - ) - assert expected_num_row_groups == row_groups - - -@pytest.mark.parametrize("size_rows", [500_000, 100_000, 10_000]) -def test_parquet_row_group_metadata(tmpdir, large_int64_gdf, size_rows): - fname = tmpdir.join("row_group_size.parquet") - large_int64_gdf.to_parquet(fname, row_group_size_rows=size_rows) - - # read file metadata from parquet - ( - num_rows, - row_groups, - _, # col_names - _, # num_columns - row_group_metadata, - ) = cudf.io.read_parquet_metadata(fname) - - # length(RowGroupsMetaData) == number of row groups - assert len(row_group_metadata) == row_groups - # sum of rows in row groups == total rows - assert num_rows == sum( - [row_group["num_rows"] for row_group in row_group_metadata] - ) - - -def test_parquet_reader_decimal_columns(): - df = cudf.DataFrame( - { - "col1": cudf.Series([1, 2, 3], dtype=cudf.Decimal64Dtype(10, 2)), - "col2": [10, 11, 12], - "col3": [12, 13, 14], - "col4": ["a", "b", "c"], - } - ) - buffer = BytesIO() - df.to_parquet(buffer) - - actual = cudf.read_parquet(buffer, columns=["col3", "col2", "col1"]) - expected = pd.read_parquet(buffer, columns=["col3", "col2", "col1"]) - - assert_eq(actual, expected) - - -def test_parquet_reader_zstd_compression(datadir): - fname = datadir
/ "spark_zstd.parquet" - try: - df = cudf.read_parquet(fname) - pdf = pd.read_parquet(fname) - assert_eq(df, pdf) - except RuntimeError: - pytest.mark.xfail(reason="zstd support is not enabled") - - -def test_read_parquet_multiple_files(tmpdir): - df_1_path = tmpdir / "df_1.parquet" - df_2_path = tmpdir / "df_2.parquet" - df_1 = cudf.DataFrame({"id": range(100), "a": [1] * 100}) - df_1.to_parquet(df_1_path) - - df_2 = cudf.DataFrame({"id": range(200, 2200), "a": [2] * 2000}) - df_2.to_parquet(df_2_path) - - expected = pd.read_parquet([df_1_path, df_2_path]) - actual = cudf.read_parquet([df_1_path, df_2_path]) - assert_eq(expected, actual) - - expected = pd.read_parquet([df_2_path, df_1_path]) - actual = cudf.read_parquet([df_2_path, df_1_path]) - assert_eq(expected, actual) - - -@pytest.mark.parametrize("index", [True, False, None]) -@pytest.mark.parametrize("columns", [None, [], ["b", "a"]]) -def test_parquet_columns_and_index_param(index, columns): - buffer = BytesIO() - df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - df.to_parquet(buffer, index=index) - - expected = pd.read_parquet(buffer, columns=columns) - got = cudf.read_parquet(buffer, columns=columns) - if columns == [] and index in {False, None}: - # cuDF returns RangeIndex columns compared - # to pandas' Index[object] columns - got.columns = expected.columns - - assert_eq(expected, got, check_index_type=True) - - -@pytest.mark.parametrize("columns", [None, ["b", "a"]]) -def test_parquet_columns_and_range_index(columns): - buffer = BytesIO() - df = cudf.DataFrame( - {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=pd.RangeIndex(2, 5) - ) - df.to_parquet(buffer) - - expected = pd.read_parquet(buffer, columns=columns) - got = cudf.read_parquet(buffer, columns=columns) - - assert_eq(expected, got, check_index_type=True) - - -def test_parquet_nested_struct_list(): - buffer = BytesIO() - data = { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - }, - "StreamId": "12345678", - "Duration": 10, - "Offset": 12, - "Resource": [{"Name": "ZoneName", "Value": "RAPIDS"}], - } - } - df = cudf.DataFrame({"a": cudf.Series(data)}) - - df.to_parquet(buffer) - expected = pd.read_parquet(buffer) - actual = cudf.read_parquet(buffer) - assert_eq(expected, actual) - assert_eq(actual.a.dtype, df.a.dtype) - - -def test_parquet_writer_zstd(): - size = 12345 - rng = np.random.default_rng(seed=0) - expected = cudf.DataFrame( - { - "a": np.arange(0, stop=size, dtype="float64"), - "b": rng.choice(list("abcd"), size=size), - "c": rng.choice(np.arange(4), size=size), - } - ) - - buff = BytesIO() - try: - expected.to_parquet(buff, compression="ZSTD") - except RuntimeError: - pytest.mark.xfail(reason="Newer nvCOMP version is required") - else: - got = pd.read_parquet(buff) - assert_eq(expected, got) - - -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_writer_time_delta_physical_type(store_schema): - df = cudf.DataFrame( - { - "s": cudf.Series([1], dtype="timedelta64[s]"), - "ms": cudf.Series([2], dtype="timedelta64[ms]"), - "us": cudf.Series([3], dtype="timedelta64[us]"), - # 4K because Pandas/pyarrow don't support non-zero nanoseconds - # in Parquet files - "ns": cudf.Series([4000], dtype="timedelta64[ns]"), - } - ) - buffer = BytesIO() - df.to_parquet(buffer, store_schema=store_schema) - - got = pd.read_parquet(buffer) - - if store_schema: - expected = pd.DataFrame( - { - "s": ["0 days 00:00:01"], - "ms": ["0 days 00:00:00.002000"], - "us": ["0 days 00:00:00.000003"], - "ns": ["0 
days 00:00:00.000004"], - }, - dtype="str", - ) - else: - expected = pd.DataFrame( - { - "s": ["00:00:01"], - "ms": ["00:00:00.002000"], - "us": ["00:00:00.000003"], - "ns": ["00:00:00.000004"], - }, - dtype="str", - ) - assert_eq(got.astype("str"), expected) - - -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_roundtrip_time_delta(store_schema): - num_rows = 12345 - df = cudf.DataFrame( - { - "s": cudf.Series( - random.sample(range(0, 200000), num_rows), - dtype="timedelta64[s]", - ), - "ms": cudf.Series( - random.sample(range(0, 200000), num_rows), - dtype="timedelta64[ms]", - ), - "us": cudf.Series( - random.sample(range(0, 200000), num_rows), - dtype="timedelta64[us]", - ), - "ns": cudf.Series( - random.sample(range(0, 200000), num_rows), - dtype="timedelta64[ns]", - ), - } - ) - buffer = BytesIO() - df.to_parquet(buffer, store_schema=store_schema) - # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` - assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) - if store_schema: - assert_eq(df, pd.read_parquet(buffer)) - - -def test_parquet_reader_malformed_file(datadir): - fname = datadir / "nested-unsigned-malformed.parquet" - - # expect a failure when reading the whole file - with pytest.raises(RuntimeError): - cudf.read_parquet(fname) - - -def test_parquet_reader_unsupported_page_encoding(datadir): - fname = datadir / "delta_encoding.parquet" - - # expect a failure when reading the whole file - with pytest.raises(RuntimeError): - cudf.read_parquet(fname) - - -def test_parquet_reader_detect_bad_dictionary(datadir): - fname = datadir / "bad_dict.parquet" - - # expect a failure when reading the whole file - with pytest.raises(RuntimeError): - cudf.read_parquet(fname) - - -@pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) -@pytest.mark.parametrize("force_nullable_schema", [True, False]) -def test_parquet_writer_schema_nullability(data, force_nullable_schema): - df = cudf.DataFrame(data) - file_obj = BytesIO() - - df.to_parquet(file_obj, force_nullable_schema=force_nullable_schema) - - assert pa.parquet.read_schema(file_obj).field(0).nullable == ( - force_nullable_schema or df.isnull().any().any() - ) - - -def test_parquet_read_filter_and_project(): - # Filter on columns that are not included - # in the current column projection - - with BytesIO() as buffer: - # Write parquet data - df = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 5] * 10, - "b": [0, 1, 2, 3, 4] * 10, - "c": range(50), - "d": [6, 7] * 25, - "e": [8, 9] * 25, - } - ) - df.to_parquet(buffer) - - # Read back with filter and projection - columns = ["b"] - filters = [[("a", "==", 5), ("c", ">", 20)]] - got = cudf.read_parquet(buffer, columns=columns, filters=filters) - - # Check result - expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True) - assert_eq(got, expected) - - -def test_parquet_reader_multiindex(): - expected = pd.DataFrame( - {"A": [1, 2, 3]}, - index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]), - ) - file_obj = BytesIO() - expected.to_parquet(file_obj, engine="pyarrow") - with pytest.warns(UserWarning): - actual = cudf.read_parquet(file_obj, engine="pyarrow") - assert_eq(actual, expected) - - -def test_parquet_reader_engine_error(): - with pytest.raises(ValueError): - cudf.read_parquet(BytesIO(), engine="abc") - - -def test_reader_lz4(): - pdf = pd.DataFrame({"ints": [1, 2] * 5001}) - - buffer = BytesIO() - pdf.to_parquet(buffer, compression="LZ4") - - got = cudf.read_parquet(buffer) - 
assert_eq(pdf, got) - - -def test_writer_lz4(): - gdf = cudf.DataFrame({"ints": [1, 2] * 5001}) - - buffer = BytesIO() - gdf.to_parquet(buffer, compression="LZ4") - - got = pd.read_parquet(buffer) - assert_eq(gdf, got) - - -def test_parquet_reader_zstd_huff_tables(datadir): - # Ensure that this zstd-compressed file does not overrun buffers. The - # problem was fixed in nvcomp 3.0.6. - # See https://github.com/rapidsai/cudf/issues/15096 - fname = datadir / "zstd_huff_tables_bug.parquet" - - expected = pa.parquet.read_table(fname).to_pandas() - actual = cudf.read_parquet(fname) - assert_eq(actual, expected) - - -def test_parquet_reader_roundtrip_with_arrow_schema(): - # Ensure that the nested types are faithfully being roundtripped - # across Parquet with arrow schema which is used to faithfully - # round trip duration types (timedelta64) across Parquet read and write. - pdf = pd.DataFrame( - { - "s": pd.Series([None, None, None], dtype="timedelta64[s]"), - "ms": pd.Series([1234, None, 32442], dtype="timedelta64[ms]"), - "us": pd.Series([None, 3456, None], dtype="timedelta64[us]"), - "ns": pd.Series([1234, 3456, 32442], dtype="timedelta64[ns]"), - "duration_list": list( - [ - [ - datetime.timedelta(minutes=7, seconds=4), - datetime.timedelta(minutes=7), - ], - [ - None, - None, - ], - [ - datetime.timedelta(minutes=7, seconds=4), - None, - ], - ] - ), - "int64": pd.Series([1234, 123, 4123], dtype="int64"), - "list": list([[1, 2], [1, 2], [1, 2]]), - "datetime": pd.Series([1234, 123, 4123], dtype="datetime64[ms]"), - "map": pd.Series(["cat", "dog", "lion"]).map( - {"cat": "kitten", "dog": "puppy", "lion": "cub"} - ), - } - ) - - # Write parquet with arrow for now (to write arrow:schema) - buffer = BytesIO() - pdf.to_parquet(buffer, engine="pyarrow") - - # Read parquet with arrow schema - got = cudf.read_parquet(buffer) - # Convert to cudf table for an apple to apple comparison - expected = cudf.from_pandas(pdf) - - # Check results for reader with schema - assert_eq(expected, got) - - # Reset buffer - buffer = BytesIO() - - # Write to buffer with cudf - expected.to_parquet(buffer, store_schema=True) - - # Read parquet with arrow schema - got = cudf.read_parquet(buffer) - # Convert to cudf table for an apple to apple comparison - expected = cudf.from_pandas(pdf) - - # Check results for writer with schema - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data", - [ - # struct - [ - {"a": 1, "b": 2}, - {"a": 10, "b": 20}, - {"a": None, "b": 22}, - {"a": None, "b": None}, - {"a": 15, "b": None}, - ], - # struct-of-list - [ - {"a": 1, "b": 2, "c": [1, 2, 3]}, - {"a": 10, "b": 20, "c": [4, 5]}, - {"a": None, "b": 22, "c": [6]}, - {"a": None, "b": None, "c": None}, - {"a": 15, "b": None, "c": [-1, -2]}, - None, - {"a": 100, "b": 200, "c": [-10, None, -20]}, - ], - # list-of-struct - [ - [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], - None, - [{"a": 10, "b": 20}], - [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], - ], - # struct-of-struct - [ - {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, - {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, - {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, - ], - # struct-with-mixed-types - [ - { - "struct": { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4),
- "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], - } - } - } - ], - ], -) -def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): - # Ensure that the structs with duration types are faithfully being - # roundtripped across Parquet with arrow schema - pdf = pd.DataFrame({"struct": pd.Series(data)}) - - buffer = BytesIO() - pdf.to_parquet(buffer, engine="pyarrow") - - # Read parquet with arrow schema - got = cudf.read_parquet(buffer) - # Convert to cudf table for an apple to apple comparison - expected = cudf.from_pandas(pdf) - - # Check results - assert_eq(expected, got) - - # Reset buffer - buffer = BytesIO() - - # Write to buffer with cudf - expected.to_parquet(buffer, store_schema=True) - - # Read parquet with arrow schema - got = cudf.read_parquet(buffer) - # Convert to cudf table for an apple to apple comparison - expected = cudf.from_pandas(pdf) - - # Check results - assert_eq(expected, got) - - -@pytest.mark.parametrize("index", [None, True, False]) -@pytest.mark.skipif( - version.parse(pa.__version__) < version.parse("15.0.0"), - reason="https://github.com/apache/arrow/pull/37792", -) -def test_parquet_writer_roundtrip_with_arrow_schema(index): - # Ensure that the concrete and nested types are faithfully being roundtripped - # across Parquet with arrow schema - expected = cudf.DataFrame( - { - "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), - "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), - "duration_list": list( - [ - [ - datetime.timedelta(minutes=7, seconds=4), - datetime.timedelta(minutes=7), - ], - [ - None, - None, - ], - [ - datetime.timedelta(minutes=7, seconds=4), - None, - ], - ] - ), - "int64": cudf.Series([-1234, 123, 4123], dtype="int64"), - "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), - "list": list([[1, 2], [1, 2], [1, 2]]), - "bool": cudf.Series([True, None, False], dtype=bool), - "fixed32": cudf.Series([0.00, 1.0, None]).astype( - cudf.Decimal32Dtype(7, 2) - ), - "fixed64": cudf.Series([0.00, 1.0, None]).astype( - cudf.Decimal64Dtype(7, 2) - ), - "fixed128": cudf.Series([0.00, 1.0, None]).astype( - cudf.Decimal128Dtype(7, 2) - ), - "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), - "map": cudf.Series(["cat", "dog", "lion"]).map( - {"cat": "kitten", "dog": "puppy", "lion": "cub"} - ), - } - ) - - # Convert decimals32/64 to decimal128 if pyarrow version is < 19.0.0 - if version.parse(pa.__version__) < version.parse("19.0.0"): - expected = expected.astype({"fixed32": cudf.Decimal128Dtype(9, 2)}) - expected = expected.astype({"fixed64": cudf.Decimal128Dtype(18, 2)}) - - # Write to Parquet with arrow schema for faithful roundtrip - buffer = BytesIO() - expected.to_parquet(buffer, store_schema=True, index=index) - - # Read parquet with pyarrow, pandas and cudf readers - got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) - got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) - got3 = cudf.read_parquet(buffer) - - # drop the index column for comparison: __index_level_0__ - if index: - got.drop(columns="__index_level_0__", inplace=True) - got2.drop(columns="__index_level_0__", inplace=True) - - # Check results - assert_eq(expected, got) - assert_eq(expected, got2) - assert_eq(expected, got3) - - -def test_parquet_writer_int96_timestamps_and_arrow_schema(): - df = cudf.DataFrame( - { - "timestamp": cudf.Series( - [1234, 123, 4123], dtype="datetime64[ms]" - ), - } - ) - - # Output buffer - buffer = 
BytesIO() - - # Writing out parquet with both INT96 timestamps and arrow_schema - # enabled should throw an exception. - with pytest.raises(RuntimeError): - df.to_parquet(buffer, int96_timestamps=True, store_schema=True) - - -@pytest.mark.parametrize( - "data", - [ - # struct - [ - {"a": 1, "b": 2}, - {"a": 10, "b": 20}, - {"a": None, "b": 22}, - {"a": None, "b": None}, - {"a": 15, "b": None}, - ], - # struct-of-list - [ - {"a": 1, "b": 2, "c": [1, 2, 3]}, - {"a": 10, "b": 20, "c": [4, 5]}, - {"a": None, "b": 22, "c": [6]}, - {"a": None, "b": None, "c": None}, - {"a": 15, "b": None, "c": [-1, -2]}, - None, - {"a": 100, "b": 200, "c": [-10, None, -20]}, - ], - # list-of-struct - [ - [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], - None, - [{"a": 10, "b": 20}], - [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], - ], - # struct-of-struct - [ - {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, - {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, - {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, - ], - # struct-with-mixed-types - [ - { - "struct": { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], - } - } - } - ], - ], -) -@pytest.mark.parametrize("index", [None, True, False]) -@pytest.mark.skipif( - version.parse(pa.__version__) < version.parse("15.0.0"), - reason="https://github.com/apache/arrow/pull/37792", -) -def test_parquet_writer_roundtrip_structs_with_arrow_schema( - tmpdir, data, index -): - # Ensure that the structs are faithfully being roundtripped across - # Parquet with arrow schema - pa_expected = pa.Table.from_pydict({"struct": data}) - - expected = cudf.DataFrame.from_arrow(pa_expected) - - # Write expected data frame to Parquet with arrow schema - buffer = BytesIO() - expected.to_parquet(buffer, store_schema=True, index=index) - - # Read Parquet with pyarrow - pa_got = pq.read_table(buffer) - - # drop the index column for comparison: __index_level_0__ - if index: - pa_got = pa_got.drop(columns="__index_level_0__") - - # Check results - assert_eq(pa_expected, pa_got) - - # Convert to cuDF table and also read Parquet with cuDF reader - got = cudf.DataFrame.from_arrow(pa_got) - got2 = cudf.read_parquet(buffer) - - # Check results - assert_eq(expected, got) - assert_eq(expected, got2) - - -@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) -@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) -@pytest.mark.parametrize("use_pandas_metadata", [True, False]) -@pytest.mark.parametrize("row_groups", [[[0]], None, [[0, 1]]]) -def test_parquet_chunked_reader( - chunk_read_limit, pass_read_limit, use_pandas_metadata, row_groups -): - df = pd.DataFrame( - {"a": [1, 2, 3, None] * 1000, "b": ["av", "qw", None, "xyz"] * 1000} - ) - buffer = BytesIO() - df.to_parquet(buffer, row_group_size=1000) - with cudf.option_context("io.parquet.low_memory", True): - actual = cudf.read_parquet( - [buffer], - _chunk_read_limit=chunk_read_limit, - _pass_read_limit=pass_read_limit, - use_pandas_metadata=use_pandas_metadata, - row_groups=row_groups, - ) - expected = cudf.read_parquet( - 
buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups - ) - assert_eq(expected, actual) - - -@pytest.mark.parametrize("chunk_read_limit", [256, 2560]) -@pytest.mark.parametrize("pass_read_limit", [256, 2560]) -@pytest.mark.parametrize("num_rows", [49, 291]) -@pytest.mark.parametrize("skip_rows", [412, 601]) -@pytest.mark.parametrize("data_size", [100, 200]) -def test_parquet_chunked_reader_structs( - chunk_read_limit, pass_read_limit, num_rows, skip_rows, data_size -): - data = [ - { - "a": "g", - "b": { - "b_a": 10, - "b_b": {"b_b_b": None, "b_b_a": 2}, - }, - "c": None, - }, - {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, - {"a": "j", "b": None, "c": [8, 10]}, - {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, - None, - { - "a": None, - "b": {"b_a": None, "b_b": {"b_b_b": 1}}, - "c": [18, 19], - }, - {"a": None, "b": None, "c": None}, - ] * data_size - - pa_struct = pa.Table.from_pydict({"struct": data}) - df = cudf.DataFrame.from_arrow(pa_struct) - buffer = BytesIO() - df.to_parquet(buffer, row_group_size_rows=7000, max_page_size_rows=100) - - # Number of rows to read - nrows = num_rows if skip_rows + num_rows < len(df) else len(df) - skip_rows - - with cudf.option_context("io.parquet.low_memory", True): - actual = cudf.read_parquet( - [buffer], - _chunk_read_limit=chunk_read_limit, - _pass_read_limit=pass_read_limit, - nrows=nrows, - skip_rows=skip_rows, - ).reset_index(drop=True) - expected = cudf.read_parquet( - buffer, nrows=nrows, skip_rows=skip_rows - ).reset_index(drop=True) - assert_eq(expected, actual) - - -@pytest.mark.parametrize("chunk_read_limit", [0, 24, 10240000]) -@pytest.mark.parametrize("pass_read_limit", [0, 24, 10240000]) -@pytest.mark.parametrize("num_rows", [47, 97, None]) -@pytest.mark.parametrize( - "str_encoding", - [ - "PLAIN", - "DELTA_BYTE_ARRAY", - "DELTA_LENGTH_BYTE_ARRAY", - ], -) -def test_parquet_chunked_reader_string_decoders( - chunk_read_limit, - pass_read_limit, - num_rows, - str_encoding, -): - df = pd.DataFrame( - { - "i64": [1, 2, 3, None] * 100, - "str": ["av", "qw", "asd", "xyz"] * 100, - "list": list( - [["ad", "cd"], ["asd", "fd"], None, ["asd", None]] * 100 - ), - } - ) - buffer = BytesIO() - # Write 4 Parquet row groups with string column encoded - df.to_parquet( - buffer, - row_group_size=100, - use_dictionary=False, - column_encoding={"str": str_encoding}, - ) - - # Number of rows to read - nrows = num_rows if num_rows is not None else len(df) - - # Check with num_rows specified - with cudf.option_context("io.parquet.low_memory", True): - actual = cudf.read_parquet( - [buffer], - _chunk_read_limit=chunk_read_limit, - _pass_read_limit=pass_read_limit, - nrows=nrows, - ) - expected = cudf.read_parquet( - buffer, - nrows=nrows, - ) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "nrows, skip_rows", - [ - (0, 0), - (99, 101), - (988, 61), - (99, 1011), - (101, 1601), - (99, 1901), - ], -) -@pytest.mark.parametrize( - "row_group_size_rows, page_size_rows", - [ - (1000, 1000), # 1 RG, 1 page per RG - (1000, 100), # 1 RG, multiple pages per RG - (100, 100), # multiple RGs, 1 page per RG - (100, 10), # multiple RGs, multiple pages per RG - ], -) -@pytest.mark.parametrize( - "chunk_read_limit, pass_read_limit", - [ - (256, 256), # small chunk and pass read limits - (0, 1024), # zero chunk and small pass read limit - (256, 0), # small chunk and zero pass read limit - (256000, 256000), # large chunk and pass read limits - ], -) -def test_chunked_parquet_reader_nrows_skiprows( - nrows, - 
skip_rows, - row_group_size_rows, - page_size_rows, - chunk_read_limit, - pass_read_limit, -): - df = cudf.DataFrame( - { - "a": list( - [ - ["cat", "lion", "deer"], - ["bear", "ibex", None], - ["tiger", None, "bull"], - [None, "wolf", "fox"], - ] - ) - * 500, - "b": ["av", "qw", None, "xyz"] * 500, - } - ) - expected = df[skip_rows : skip_rows + nrows] - buffer = BytesIO() - df.to_parquet( - buffer, - row_group_size_rows=row_group_size_rows, - max_page_size_rows=page_size_rows, - ) - got = cudf.read_parquet(buffer, nrows=nrows, skip_rows=skip_rows) - assert_eq(expected, got) - - # Check for chunked parquet reader - with cudf.option_context("io.parquet.low_memory", True): - got = cudf.read_parquet( - [buffer], - _chunk_read_limit=chunk_read_limit, - _pass_read_limit=pass_read_limit, - nrows=nrows, - skip_rows=skip_rows, - ).reset_index(drop=True) - # Reset index for comparison - expected = expected.reset_index(drop=True) - assert_eq(expected, got) - - -def test_parquet_reader_pandas_compatibility(): - df = pd.DataFrame( - {"a": [1, 2, 3, 4] * 10000, "b": ["av", "qw", "hi", "xyz"] * 10000} - ) - buffer = BytesIO() - df.to_parquet(buffer) - with cudf.option_context("io.parquet.low_memory", True): - expected = cudf.read_parquet(buffer) - assert_eq(expected, df) - - -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_reader_with_mismatched_tables(store_schema): - # cuDF tables with mixed types - df1 = cudf.DataFrame( - { - "i32": cudf.Series([None, None, None], dtype="int32"), - "i64": cudf.Series([1234, 467, 123], dtype="int64"), - "list": list([[1, 2], None, [None, 6]]), - "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), - "str": ["vfd", None, "ghu"], - "d_list": list( - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [None, pd.Timedelta(minutes=3)], - [pd.Timedelta(minutes=8), None], - ] - ), - } - ) - - df2 = cudf.DataFrame( - { - "str": ["abc", "def", "ghi"], - "i64": cudf.Series([None, 65, 98], dtype="int64"), - "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"), - "list": list([[7, 8], [9, 10], [11, 12]]), - "d_list": list( - [ - [pd.Timedelta(minutes=4), None], - None, - [pd.Timedelta(minutes=6), None], - ] - ), - } - ) - - # IO buffers - buf1 = BytesIO() - buf2 = BytesIO() - - # Write Parquet with and without arrow schema - df1.to_parquet(buf1, store_schema=store_schema) - df2.to_parquet(buf2, store_schema=store_schema) - - # Read mismatched Parquet files - got = cudf.read_parquet( - [buf1, buf2], - columns=["list", "d_list", "str"], - filters=[("i64", ">", 20)], - allow_mismatched_pq_schemas=True, - ) - - # Construct the expected table - expected = cudf.concat( - [ - df1[df1["i64"] > 20][["list", "d_list", "str"]], - df2[df2["i64"] > 20][["list", "d_list", "str"]], - ] - ).reset_index(drop=True) - - # Read with chunked reader (filter columns not supported) - with cudf.option_context("io.parquet.low_memory", True): - got_chunked = cudf.read_parquet( - [buf1, buf2], - columns=["list", "d_list", "str"], - _chunk_read_limit=240, - _pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) - - # Construct the expected table without filter columns - expected_chunked = cudf.concat( - [df1[["list", "d_list", "str"]], df2[["list", "d_list", "str"]]] - ).reset_index(drop=True) - - # Check results - assert_eq(expected, got) - assert_eq(expected_chunked, got_chunked) - - -def test_parquet_reader_with_mismatched_structs(): - data1 = [ - { - "a": 1, - "b": { - "a_a": 10, - "b_b": {"b_b_b": 1, "b_b_a": 2}, - }, - "c": 2, - }, - { - 
"a": 3, - "b": {"b_a": 30, "b_b": {"b_b_a": 210}}, - "c": 4, - }, - {"a": 5, "b": {"b_a": 50, "b_b": None}, "c": 6}, - {"a": 7, "b": None, "c": 8}, - {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None}, - ] - - data2 = [ - {"a": 1, "b": {"b_b": {"b_b_a": None}}}, - {"a": 5, "b": {"b_b": None}}, - {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}}, - {"a": None, "b": {"b_b": None}}, - None, - ] - - # cuDF tables from struct data - df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1})) - df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2})) - - # Buffers - buf1 = BytesIO() - buf2 = BytesIO() - - # Write to parquet - df1.to_parquet(buf1) - df2.to_parquet(buf2) - - # Read the struct.b.inner_b.inner_inner_a column from parquet - got = cudf.read_parquet( - [buf1, buf2], - columns=["struct.b.b_b.b_b_a"], - allow_mismatched_pq_schemas=True, - ) - got = ( - cudf.Series(got["struct"]) - .struct.field("b") - .struct.field("b_b") - .struct.field("b_b_a") - ) - - # Read with chunked reader - with cudf.option_context("io.parquet.low_memory", True): - got_chunked = cudf.read_parquet( - [buf1, buf2], - columns=["struct.b.b_b.b_b_a"], - _chunk_read_limit=240, - _pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) - got_chunked = ( - cudf.Series(got_chunked["struct"]) - .struct.field("b") - .struct.field("b_b") - .struct.field("b_b_a") - ) - - # Construct the expected series - expected = cudf.concat( - [ - cudf.Series(df1["struct"]) - .struct.field("b") - .struct.field("b_b") - .struct.field("b_b_a"), - cudf.Series(df2["struct"]) - .struct.field("b") - .struct.field("b_b") - .struct.field("b_b_a"), - ] - ).reset_index(drop=True) - - # Check results - assert_eq(expected, got) - assert_eq(expected, got_chunked) - - -def test_parquet_reader_with_mismatched_schemas_error(): - df1 = cudf.DataFrame( - { - "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"), - "i64": cudf.Series([123, 3454, 123], dtype="int64"), - "i32": cudf.Series([123, 3454, 123], dtype="int32"), - } - ) - df2 = cudf.DataFrame( - { - "i64": cudf.Series([123, 3454, 123], dtype="int64"), - "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"), - } - ) - - buf1 = BytesIO() - buf2 = BytesIO() - - df1.to_parquet(buf1, store_schema=True) - df2.to_parquet(buf2, store_schema=False) - - with pytest.raises( - ValueError, - match="Encountered mismatching SchemaElement properties for a column in the selected path", - ): - cudf.read_parquet( - [buf1, buf2], columns=["millis"], allow_mismatched_pq_schemas=True - ) - - data1 = [ - {"a": 1, "b": {"b_a": 1, "b_b": 6}}, - {"a": 3, "b": {"b_a": None, "b_b": 2}}, - ] - data2 = [ - {"b": {"b_a": 1}, "c": "str"}, - {"b": {"b_a": None}, "c": None}, - ] - - # cuDF tables from struct data - df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1})) - df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2})) - - # Buffers - buf1 = BytesIO() - buf2 = BytesIO() - - # Write to parquet - df1.to_parquet(buf1) - df2.to_parquet(buf2) - - with pytest.raises( - IndexError, - match="Encountered mismatching number of children for a column in the selected path", - ): - cudf.read_parquet( - [buf1, buf2], - columns=["struct.b"], - allow_mismatched_pq_schemas=True, - ) - - with pytest.raises( - IndexError, - match="Encountered mismatching schema tree depths across data sources", - ): - cudf.read_parquet( - [buf1, buf2], - columns=["struct.b.b_b"], - allow_mismatched_pq_schemas=True, - ) - - -def 
test_parquet_roundtrip_zero_rows_no_column_mask(): - expected = cudf.DataFrame._from_data( - { - "int": cudf.core.column.column_empty(0, np.dtype(np.int64)), - "float": cudf.core.column.column_empty(0, np.dtype(np.float64)), - "datetime": cudf.core.column.column_empty( - 0, np.dtype("datetime64[ns]") - ), - "timedelta": cudf.core.column.column_empty( - 0, np.dtype("timedelta64[ns]") - ), - "bool": cudf.core.column.column_empty(0, np.dtype(np.bool_)), - "decimal": cudf.core.column.column_empty( - 0, cudf.Decimal64Dtype(1) - ), - "struct": cudf.core.column.column_empty( - 0, cudf.StructDtype({"a": "int64"}) - ), - "list": cudf.core.column.column_empty( - 0, cudf.ListDtype("float64") - ), - } - ) - with BytesIO() as bio: - expected.to_parquet(bio) - result = cudf.read_parquet(bio) - assert_eq(result, expected) - - -def test_parquet_reader_mismatched_nullability(): - # Ensure that we can faithfully read the tables with mismatched nullabilities - df1 = cudf.DataFrame( - { - "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"), - "duration_list": list( - [ - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - None, - [pd.Timedelta(minutes=8), None], - ], - None, - ], - None, - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], - [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], - ] - ], - ] - ), - "int64": cudf.Series([1234, None, 4123], dtype="int64"), - "int32": cudf.Series([1234, 123, 4123], dtype="int32"), - "list": list([[1, 2], [1, 2], [1, 2]]), - "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), - "string": cudf.Series(["kitten", "puppy", "cub"]), - } - ) - - df2 = cudf.DataFrame( - { - "timedelta": cudf.Series( - [None, None, None], dtype="timedelta64[ms]" - ), - "duration_list": list( - [ - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)], - ], - ], - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], - [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], - ] - ], - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], - [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], - ] - ], - ] - ), - "int64": cudf.Series([1234, 123, 4123], dtype="int64"), - "int32": cudf.Series([1234, None, 4123], dtype="int32"), - "list": list([[1, 2], None, [1, 2]]), - "datetime": cudf.Series( - [1234, None, 4123], dtype="datetime64[ms]" - ), - "string": cudf.Series(["kitten", None, "cub"]), - } - ) - - # Write tables to parquet with arrow schema for compatibility for duration column(s) - fname1 = BytesIO() - df1.to_parquet(fname1, store_schema=True) - fname2 = BytesIO() - df2.to_parquet(fname2, store_schema=True) - - # Read tables back with cudf and arrow in either order and compare - assert_eq( - cudf.read_parquet([fname1, fname2]), - cudf.concat([df1, df2]).reset_index(drop=True), - ) - assert_eq( - cudf.read_parquet([fname2, fname1]), - cudf.concat([df2, df1]).reset_index(drop=True), - ) - - -def test_parquet_reader_mismatched_nullability_structs(tmpdir): - data1 = [ - { - "a": "a", - "b": { - "b_a": 10, - "b_b": {"b_b_b": 1, "b_b_a": 12}, - }, - "c": [1, 2], - }, - { - "a": "b", - "b": { - "b_a": 30, - "b_b": {"b_b_b": 2, "b_b_a": 2}, - }, - "c": [3, 4], - }, - { - "a": "c", - "b": { - "b_a": 50, - "b_b": {"b_b_b": 4, "b_b_a": 5}, - }, - "c": [5, 6], - }, - { - "a": "d", - "b": { - "b_a": 135, - "b_b": {"b_b_b": 12, 
"b_b_a": 32}, - }, - "c": [7, 8], - }, - { - "a": "e", - "b": { - "b_a": 1, - "b_b": {"b_b_b": 1, "b_b_a": 5}, - }, - "c": [9, 10], - }, - { - "a": "f", - "b": { - "b_a": 32, - "b_b": {"b_b_b": 1, "b_b_a": 6}, - }, - "c": [11, 12], - }, - ] - - data2 = [ - { - "a": "g", - "b": { - "b_a": 10, - "b_b": {"b_b_b": None, "b_b_a": 2}, - }, - "c": None, - }, - {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, - {"a": "j", "b": None, "c": [8, 10]}, - {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, - None, - { - "a": None, - "b": {"b_a": None, "b_b": {"b_b_b": 1}}, - "c": [18, 19], - }, - {"a": None, "b": None, "c": None}, - ] - - pa_table1 = pa.Table.from_pydict({"struct": data1}) - df1 = cudf.DataFrame.from_arrow(pa_table1) - - pa_table2 = pa.Table.from_pydict({"struct": data2}) - df2 = cudf.DataFrame.from_arrow(pa_table2) - - # Write tables to parquet - buf1 = BytesIO() - df1.to_parquet(buf1) - buf2 = BytesIO() - df2.to_parquet(buf2) - - # Read tables back with cudf and compare with expected. - assert_eq( - cudf.read_parquet([buf1, buf2]), - cudf.concat([df1, df2]).reset_index(drop=True), - ) - assert_eq( - cudf.read_parquet([buf2, buf1]), - cudf.concat([df2, df1]).reset_index(drop=True), - ) - - -@pytest.mark.skipif( - pa.__version__ == "19.0.0", - reason="https://github.com/rapidsai/cudf/issues/17806", -) -@pytest.mark.parametrize( - "stats_fname,bloom_filter_fname", - [ - ( - "mixed_card_ndv_100_chunk_stats.snappy.parquet", - "mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet", - ), - ( - "mixed_card_ndv_500_chunk_stats.snappy.parquet", - "mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet", - ), - ], -) -@pytest.mark.parametrize( - "predicate,expected_len", - [ - ([[("str", "==", "FINDME")], [("fp64", "==", float(500))]], 2), - ([("fixed_pt", "==", decimal.Decimal(float(500)))], 2), - ([[("ui32", "==", np.uint32(500)), ("str", "==", "FINDME")]], 2), - ([[("str", "==", "FINDME")], [("ui32", ">=", np.uint32(0))]], 1000), - ( - [ - ("str", "!=", "FINDME"), - ("fixed_pt", "==", decimal.Decimal(float(500))), - ], - 0, - ), - ], -) -def test_parquet_bloom_filters( - datadir, stats_fname, bloom_filter_fname, predicate, expected_len -): - fname_stats = datadir / stats_fname - fname_bf = datadir / bloom_filter_fname - df_stats = cudf.read_parquet(fname_stats, filters=predicate).reset_index( - drop=True - ) - df_bf = cudf.read_parquet(fname_bf, filters=predicate).reset_index( - drop=True - ) - - # Check if tables equal - assert_eq( - df_stats, - df_bf, - ) - - # Check for table length - assert_eq( - len(df_stats), - expected_len, - ) - - -@pytest.fixture(params=["cuda", "pool", "cuda_async"]) -def memory_resource(request): - import rmm - - current_mr = rmm.mr.get_current_device_resource() - - kind = request.param - if kind == "cuda": - mr = rmm.mr.CudaMemoryResource() - elif kind == "pool": - base = rmm.mr.CudaMemoryResource() - free, _ = rmm.mr.available_device_memory() - size = int(round(free * 0.5 / 256) * 256) - mr = rmm.mr.PoolMemoryResource(base, size, size) - elif kind == "cuda_async": - mr = rmm.mr.CudaAsyncMemoryResource() - - rmm.mr.set_current_device_resource(mr) - - try: - yield mr - finally: - rmm.mr.set_current_device_resource(current_mr) - - -@pytest.mark.parametrize("columns", [["r_reason_desc"], None]) -def test_parquet_bloom_filters_alignment(datadir, columns, memory_resource): - fname = datadir / "bloom_filter_alignment.parquet" - filters = [("r_reason_desc", "==", "Did not like the color")] - - # Read expected table using pyarrow - expected = 
pq.read_table(fname, columns=columns, filters=filters) - - # Read with cudf using the memory resource from fixture - read = cudf.read_parquet( - fname, columns=columns, filters=filters - ).to_arrow() - - assert_eq(expected, read) - - -def test_parquet_reader_unsupported_compression(datadir): - fname = datadir / "hadoop_lz4_compressed.parquet" - - with pytest.raises( - RuntimeError, - match="Unsupported Parquet compression type: LZ4", - ): - cudf.read_parquet(fname) - - -def test_parquet_reader_empty_compressed_page(datadir): - fname = datadir / "empty_datapage_v2.parquet" - - df = cudf.DataFrame({"value": cudf.Series([None], dtype="float32")}) - assert_eq(cudf.read_parquet(fname), df) - - -@pytest.fixture(params=[1234], scope="module") -def my_pdf(request): - return build_pdf(request, True) - - -@pytest.mark.parametrize("compression", ["brotli", "gzip", "snappy", "zstd"]) -def test_parquet_decompression(set_decomp_env_vars, my_pdf, compression): - # pandas returns category objects whereas cuDF returns hashes - expect = my_pdf.drop(columns=["col_category"]) - - # Write the DataFrame to a Parquet file - buffer = BytesIO() - expect.to_parquet(buffer, compression=compression) - - # Read the Parquet file back into a DataFrame - got = cudf.read_parquet(buffer) - - assert_eq(expect, got) From 960f93a7f9909a8514e54bef126bc4999907ebed Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:15:27 -0700 Subject: [PATCH 163/366] Cache hash values to improve hash-based groupby performance with wide/complex table keys (#19670) In hash-based groupby, the row indices of the keys table are inserted into a hash set twice. Each insertion computes the row hashes, which can be expensive in terms of memory access for wide key tables, with or without complex type columns. This PR pre-computes the hash values and stores them in a caching array for such wide/complex key tables, which helps improve performance to some extent. A new benchmark is also implemented to cover exactly these situations (wide tables and complex key types). Contributes to https://github.com/rapidsai/cudf/issues/19513.
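The core idea, as a minimal host-side sketch (an illustration only, not the actual libcudf implementation, which operates on device memory; names like `distinct_key_rows` and `cached_hasher` are hypothetical): hash every key row exactly once into a cache array, then make the hash set's hasher a cheap array lookup while equality still compares the full key values.

```cpp
#include <cstddef>
#include <unordered_set>
#include <vector>

using key_row = std::vector<int>;  // stand-in for one row of a wide keys table

// Hasher that returns the pre-computed hash for a row index instead of re-hashing.
struct cached_hasher {
  std::vector<std::size_t> const* cache;
  std::size_t operator()(std::size_t row) const { return (*cache)[row]; }
};

// Equality must still inspect the actual key values.
struct rows_equal {
  std::vector<key_row> const* rows;
  bool operator()(std::size_t lhs, std::size_t rhs) const
  {
    return (*rows)[lhs] == (*rows)[rhs];
  }
};

std::vector<std::size_t> distinct_key_rows(std::vector<key_row> const& rows)
{
  // The expensive part: traverse every key column, but exactly once per row.
  std::vector<std::size_t> cache(rows.size());
  for (std::size_t r = 0; r < rows.size(); ++r) {
    std::size_t h = 0;
    for (int v : rows[r]) { h = (h * 1099511628211ULL) ^ static_cast<std::size_t>(v); }
    cache[r] = h;
  }

  // Every probe during insertion now costs one array read instead of a row traversal.
  std::unordered_set<std::size_t, cached_hasher, rows_equal> set(
    rows.size(), cached_hasher{&cache}, rows_equal{&rows});
  std::vector<std::size_t> distinct;
  for (std::size_t r = 0; r < rows.size(); ++r) {
    if (set.insert(r).second) { distinct.push_back(r); }
  }
  return distinct;
}
```

The trade-off is one extra hash-sized array entry per row, which is why the caching is applied only to wide/complex key tables, where re-hashing is the dominant cost.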
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - David Wendt (https://github.com/davidwendt) - Muhammad Haseeb (https://github.com/mhaseeb123) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/19670 --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/groupby/group_complex_keys.cpp | 158 ++++++++++++++++++ cpp/src/groupby/hash/compute_aggregations.cu | 3 +- cpp/src/groupby/hash/compute_aggregations.cuh | 22 ++- cpp/src/groupby/hash/compute_aggregations.hpp | 3 +- .../groupby/hash/compute_aggregations_null.cu | 3 +- .../hash/compute_global_memory_aggs.cu | 3 +- .../hash/compute_global_memory_aggs.cuh | 14 +- .../hash/compute_global_memory_aggs.hpp | 3 +- .../hash/compute_global_memory_aggs_null.cu | 3 +- cpp/src/groupby/hash/compute_groupby.cu | 96 ++++++++--- .../groupby/hash/compute_mapping_indices.cu | 6 +- .../groupby/hash/compute_mapping_indices.cuh | 16 +- .../groupby/hash/compute_mapping_indices.hpp | 16 +- .../hash/compute_mapping_indices_null.cu | 7 +- .../hash/compute_shared_memory_aggs.cu | 15 +- .../hash/compute_shared_memory_aggs.hpp | 1 - cpp/src/groupby/hash/helpers.cuh | 21 ++- cpp/src/groupby/hash/single_pass_functors.cuh | 21 +-- 19 files changed, 312 insertions(+), 100 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_complex_keys.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 9aad6a21012..5fc041c4e58 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -266,6 +266,7 @@ ConfigureBench( ConfigureNVBench( GROUPBY_NVBENCH + groupby/group_complex_keys.cpp groupby/group_histogram.cpp groupby/group_m2.cpp groupby/group_max.cpp diff --git a/cpp/benchmarks/groupby/group_complex_keys.cpp b/cpp/benchmarks/groupby/group_complex_keys.cpp new file mode 100644 index 00000000000..8717f0f50d8 --- /dev/null +++ b/cpp/benchmarks/groupby/group_complex_keys.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +#include +#include +#include + +#include + +#include + +namespace { + +auto generate_int_keys(cudf::size_type num_cols, + cudf::size_type num_rows, + cudf::size_type value_key_ratio, + double null_probability) +{ + auto const create_column = [&] { + auto builder = + data_profile_builder() + .cardinality(num_rows / value_key_ratio) + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }; + std::vector> cols; + cols.reserve(num_cols); + for (cudf::size_type i = 0; i < num_cols; ++i) { + cols.emplace_back(create_column()); + } + return std::make_unique(std::move(cols)); +} + +auto generate_mixed_types_keys(cudf::size_type num_cols, + cudf::size_type num_rows, + cudf::size_type value_key_ratio, + double null_probability) +{ + constexpr auto max_str_length = 50; + constexpr auto max_list_size = 10; + constexpr auto nested_depth = 2; + + auto builder = data_profile_builder() + .cardinality(num_rows / value_key_ratio) + .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length) + .distribution(cudf::type_id::INT64, distribution_id::UNIFORM, 0, num_rows) + .distribution(cudf::type_id::LIST, distribution_id::UNIFORM, 0, max_list_size) + .list_depth(nested_depth) + .struct_depth(nested_depth); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + + return create_random_table(cycle_dtypes({cudf::type_id::INT32, + cudf::type_id::STRING, + cudf::type_id::INT64, + cudf::type_id::LIST, + cudf::type_id::STRUCT}, + num_cols), + row_count{num_rows}, + data_profile{builder}); +} + +auto generate_vals(cudf::size_type num_rows, double null_probability) +{ + using Type = int64_t; + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column(cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); +} + +template +void run_benchmark_complex_keys(nvbench::state& state) +{ + auto const n_cols = static_cast(state.get_int64("num_cols")); + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const value_key_ratio = static_cast(state.get_int64("value_key_ratio")); + auto const null_probability = state.get_float64("null_probability"); + + auto const keys_table = [&] { + if constexpr (is_int_keys) { + return generate_int_keys(n_cols, n_rows, value_key_ratio, null_probability); + } else { + return generate_mixed_types_keys(n_cols, n_rows, value_key_ratio, null_probability); + } + }(); + auto const vals = generate_vals(n_rows, null_probability); + + cudf::groupby::groupby gb_obj(keys_table->view()); + + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[0].values = vals->view(); + requests[0].aggregations.push_back(cudf::make_min_aggregation()); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + auto const stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { + [[maybe_unused]] 
auto const result = gb_obj.aggregate(requests, stream); + }); + + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +} // namespace + +void bench_groupby_int_keys(nvbench::state& state) { run_benchmark_complex_keys<true>(state); } +void bench_groupby_mixed_types_keys(nvbench::state& state) +{ + run_benchmark_complex_keys<false>(state); +} + +#define RUN_BENCH(bench_func) \ + NVBENCH_BENCH(bench_func) \ + .set_name(#bench_func) \ + .add_int64_power_of_two_axis("num_rows", {12, 18, 24}) \ + .add_int64_axis("value_key_ratio", {20, 200}) \ + .add_float64_axis("null_probability", {0, 0.5}) + +RUN_BENCH(bench_groupby_int_keys).add_int64_axis("num_cols", {1, 2, 4, 8, 16}); + +// Not enough memory for more mixed types columns. +RUN_BENCH(bench_groupby_mixed_types_keys).add_int64_axis("num_cols", {1, 2, 3, 4, 5}); diff --git a/cpp/src/groupby/hash/compute_aggregations.cu b/cpp/src/groupby/hash/compute_aggregations.cu index cac6c2224f0..a0226de5b82 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cu +++ b/cpp/src/groupby/hash/compute_aggregations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector<cudf::size_type> compute_aggregations( int64_t num_rows, - bool skip_rows_with_nulls, bitmask_type const* row_bitmask, global_set_t& global_set, cudf::host_span<aggregation_request const> requests, diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh index 60a8b3c2f38..65fb8a7738a 100644 --- a/cpp/src/groupby/hash/compute_aggregations.cuh +++ b/cpp/src/groupby/hash/compute_aggregations.cuh @@ -25,6 +25,7 @@ #include "single_pass_functors.cuh" #include +#include #include #include #include @@ -45,6 +46,7 @@ #include namespace cudf::groupby::detail::hash { + /** * @brief Computes all aggregations from `requests` that require a single pass * over the data and stores the results in `sparse_results` @@ -52,7 +54,6 @@ template rmm::device_uvector<cudf::size_type> compute_aggregations( int64_t num_rows, - bool skip_rows_with_nulls, bitmask_type const* row_bitmask, SetType& global_set, cudf::host_span<aggregation_request const> requests, @@ -64,8 +65,17 @@ rmm::device_uvector<cudf::size_type> compute_aggregations( auto const d_agg_kinds = cudf::detail::make_device_uvector_async( agg_kinds, stream, rmm::mr::get_current_device_resource()); - auto const grid_size = - max_occupancy_grid_size>(num_rows); + auto const grid_size = [&] { + auto const max_blocks_mapping = + max_active_blocks_mapping_kernel>(); + auto const max_blocks_aggs = max_active_blocks_shmem_aggs_kernel(); + // We launch the same grid size for both kernels, thus we need to take the minimum of the two. + auto const max_blocks = std::min(max_blocks_mapping, max_blocks_aggs); + auto const max_grid_size = max_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = + cudf::util::div_rounding_up_safe(static_cast<cudf::size_type>(num_rows), GROUPBY_BLOCK_SIZE); + return std::min(max_grid_size, num_blocks); + }(); auto const available_shmem_size = get_available_shared_memory_size(grid_size); auto const offsets_buffer_size = compute_shmem_offsets_size(flattened_values.num_columns()) * 2; auto const data_buffer_size = available_shmem_size - offsets_buffer_size; @@ -94,7 +104,6 @@ rmm::device_uvector<cudf::size_type> compute_aggregations( // present.
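// [Editor's sketch] The grid-size lambda above sizes one common grid for two
// cooperating kernels from their occupancy. A minimal, self-contained sketch of
// the same technique follows; the kernel names, signatures, and block size are
// hypothetical, only the CUDA occupancy/device-attribute APIs are real.

#include <algorithm>
#include <cuda_runtime.h>

__global__ void mapping_kernel(int) {}
__global__ void aggs_kernel(int) {}

int common_grid_size(int num_rows, int block_size = 128)
{
  int blocks_mapping = 0;
  int blocks_aggs    = 0;
  // Query how many blocks of each kernel can be resident per SM at this block size.
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_mapping, mapping_kernel, block_size, 0);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_aggs, aggs_kernel, block_size, 0);
  int num_sms = 0;
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0);
  // Both kernels share one grid, so honor the more restrictive occupancy,
  // and never launch more blocks than the input actually needs.
  int const max_grid = std::min(blocks_mapping, blocks_aggs) * num_sms;
  int const needed   = (num_rows + block_size - 1) / block_size;
  return std::min(max_grid, needed);
}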
if (!is_shared_memory_compatible) { return compute_global_memory_aggs(num_rows, - skip_rows_with_nulls, row_bitmask, flattened_values, d_agg_kinds.data(), @@ -123,7 +132,6 @@ rmm::device_uvector compute_aggregations( num_rows, global_set_ref, row_bitmask, - skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), @@ -157,7 +165,6 @@ rmm::device_uvector compute_aggregations( available_shmem_size, num_rows, row_bitmask, - skip_rows_with_nulls, local_mapping_index.data(), global_mapping_index.data(), block_cardinality.data(), @@ -181,8 +188,7 @@ rmm::device_uvector compute_aggregations( d_agg_kinds.data(), block_cardinality.data(), stride, - row_bitmask, - skip_rows_with_nulls}); + row_bitmask}); extract_populated_keys(global_set, populated_keys, stream); } diff --git a/cpp/src/groupby/hash/compute_aggregations.hpp b/cpp/src/groupby/hash/compute_aggregations.hpp index 829c3c808b0..e387ffe3085 100644 --- a/cpp/src/groupby/hash/compute_aggregations.hpp +++ b/cpp/src/groupby/hash/compute_aggregations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,6 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_aggregations( int64_t num_rows, - bool skip_rows_with_nulls, bitmask_type const* row_bitmask, SetType& global_set, cudf::host_span requests, diff --git a/cpp/src/groupby/hash/compute_aggregations_null.cu b/cpp/src/groupby/hash/compute_aggregations_null.cu index 1d7184227ea..6c7f0615be3 100644 --- a/cpp/src/groupby/hash/compute_aggregations_null.cu +++ b/cpp/src/groupby/hash/compute_aggregations_null.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_aggregations( int64_t num_rows, - bool skip_rows_with_nulls, bitmask_type const* row_bitmask, nullable_global_set_t& global_set, cudf::host_span requests, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu index d2830f7d905..2d7687a826d 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_global_memory_aggs( cudf::size_type num_rows, - bool skip_rows_with_nulls, bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh index 671ee2ea31f..375906b756f 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,7 +41,6 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_global_memory_aggs( cudf::size_type num_rows, - bool skip_rows_with_nulls, bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, @@ -69,12 +68,11 @@ rmm::device_uvector compute_global_memory_aggs( auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); auto global_set_ref = global_set.ref(cuco::op::insert_and_find); - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator{0}, - num_rows, - hash::compute_single_pass_aggs_fn{ - global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask, skip_rows_with_nulls}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator{0}, + num_rows, + hash::compute_single_pass_aggs_fn{ + global_set_ref, *d_values, *d_sparse_table, d_agg_kinds, row_bitmask}); extract_populated_keys(global_set, populated_keys, stream); // Add results back to sparse_results cache diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp index 437823a3fea..5b7d46fa461 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,6 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_global_memory_aggs( cudf::size_type num_rows, - bool skip_rows_with_nulls, bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu index 7cb3f8f190b..41efbaf96ba 100644 --- a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu +++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ namespace cudf::groupby::detail::hash { template rmm::device_uvector compute_global_memory_aggs( cudf::size_type num_rows, - bool skip_rows_with_nulls, bitmask_type const* row_bitmask, cudf::table_view const& flattened_values, cudf::aggregation::Kind const* d_agg_kinds, diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu index 9648d942513..46b8f0a0a3f 100644 --- a/cpp/src/groupby/hash/compute_groupby.cu +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,14 +29,33 @@ #include #include -#include #include +#include -#include #include namespace cudf::groupby::detail::hash { + +namespace { + +// The number of columns in the keys table that will trigger caching of row hashes. 
+// This is a heuristic to reduce memory read when the keys table is hashed twice. +constexpr int HASH_CACHING_THRESHOLD = 4; + +int count_nested_columns(column_view const& input) +{ + if (!is_nested(input.type())) { return 1; } + + // Count the current column too. + return 1 + std::accumulate( + input.child_begin(), input.child_end(), 0, [](int count, column_view const& child) { + return count + count_nested_columns(child); + }); +} + +} // namespace + +template <typename Equal, typename Hash> std::unique_ptr<table> compute_groupby(table_view const& keys, host_span<aggregation_request const> requests, @@ -50,6 +69,51 @@ std::unique_ptr<table>
 compute_groupby(table_view const& keys, // convert to int64_t to avoid potential overflow with large `keys` auto const num_keys = static_cast<int64_t>(keys.num_rows()); + [[maybe_unused]] auto const [row_bitmask_data, row_bitmask] = + [&]() -> std::pair<rmm::device_buffer, bitmask_type const*> { + if (!skip_rows_with_nulls) { return {rmm::device_buffer{0, stream}, nullptr}; } + + if (keys.num_columns() == 1) { + auto const& keys_col = keys.column(0); + // Only use the input null mask directly if the keys table was not sliced. + if (keys_col.offset() == 0) { return {rmm::device_buffer{0, stream}, keys_col.null_mask()}; } + // If the keys table was sliced, we need to copy the null mask to ensure its first bit aligns + // with the first row of the keys table. + auto null_mask_data = cudf::copy_bitmask(keys_col, stream); + auto const null_mask = static_cast<bitmask_type const*>(null_mask_data.data()); + return {std::move(null_mask_data), null_mask}; + } + + auto [null_mask_data, null_count] = cudf::bitmask_and(keys, stream); + if (null_count == 0) { return {rmm::device_buffer{0, stream}, nullptr}; } + + auto const null_mask = static_cast<bitmask_type const*>(null_mask_data.data()); + return {std::move(null_mask_data), null_mask}; + }(); + + auto const cached_hashes = [&]() -> rmm::device_uvector<hash_value_type> { + auto const num_columns = + std::accumulate(keys.begin(), keys.end(), 0, [](int count, column_view const& col) { + return count + count_nested_columns(col); + }); + + if (num_columns <= HASH_CACHING_THRESHOLD) { + return rmm::device_uvector<hash_value_type>{0, stream}; + } + + rmm::device_uvector<hash_value_type> hashes(num_keys, stream); + thrust::tabulate(rmm::exec_policy_nosync(stream), + hashes.begin(), + hashes.end(), + [d_row_hash, row_bitmask] __device__(size_type const idx) { + if (!row_bitmask || cudf::bit_is_set(row_bitmask, idx)) { + return d_row_hash(idx); + } + return hash_value_type{0}; // dummy value, as it will be unused + }); + return hashes; + }(); + // Cache of sparse results where the location of aggregate value in each // column is indexed by the hash set cudf::detail::result_cache sparse_results(requests.size()); @@ -59,35 +123,19 @@ std::unique_ptr<table>
compute_groupby(table_view const& keys, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, d_row_equal, - probing_scheme_t{d_row_hash}, + probing_scheme_t{row_hasher_with_cache_t{d_row_hash, cached_hashes.data()}}, cuco::thread_scope_device, cuco::storage{}, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; - auto row_bitmask = - skip_rows_with_nulls - ? cudf::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - // Compute all single pass aggs first - auto gather_map = compute_aggregations(num_keys, - skip_rows_with_nulls, - static_cast(row_bitmask.data()), - set, - requests, - &sparse_results, - stream); + auto gather_map = + compute_aggregations(num_keys, row_bitmask, set, requests, &sparse_results, stream); // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - static_cast(row_bitmask.data()), - stream, - mr); + sparse_to_dense_results( + requests, &sparse_results, cache, gather_map, set.ref(cuco::find), row_bitmask, stream, mr); return cudf::detail::gather(keys, gather_map, diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu index 519d7cd2f1c..71d18a86d9c 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,15 +18,13 @@ #include "compute_mapping_indices.hpp" namespace cudf::groupby::detail::hash { -template cudf::size_type max_occupancy_grid_size>( - cudf::size_type n); +template int32_t max_active_blocks_mapping_kernel>(); template void compute_mapping_indices>( cudf::size_type grid_size, cudf::size_type num, hash_set_ref_t global_set, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh index bd043671174..133a43ad972 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.cuh +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -30,7 +30,6 @@ #include #include #include -#include #include @@ -41,13 +40,12 @@ __device__ void find_local_mapping(cooperative_groups::thread_block const& block cudf::size_type num_input_rows, SetType shared_set, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::size_type* cardinality, cudf::size_type* local_mapping_index, cudf::size_type* shared_set_indices) { auto const is_valid_input = - idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)); + idx < num_input_rows and (not row_bitmask or cudf::bit_is_set(row_bitmask, idx)); auto const [result_idx, inserted] = [&]() { if (is_valid_input) { auto const result = shared_set.insert_and_find(idx); @@ -99,7 +97,6 @@ template CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, SetRef global_set, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -136,7 
+133,6 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, num_input_rows, shared_set, row_bitmask, - skip_rows_with_nulls, &cardinality, local_mapping_index, shared_set_indices); @@ -159,14 +155,12 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, } template -cudf::size_type max_occupancy_grid_size(cudf::size_type n) +int32_t max_active_blocks_mapping_kernel() { - cudf::size_type max_active_blocks{-1}; + int32_t max_active_blocks{-1}; CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks, mapping_indices_kernel, GROUPBY_BLOCK_SIZE, 0)); - auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); - auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); - return std::min(grid_size, num_blocks); + return max_active_blocks; } template @@ -174,7 +168,6 @@ void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type num, SetRef global_set, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -185,7 +178,6 @@ void compute_mapping_indices(cudf::size_type grid_size, num, global_set, row_bitmask, - skip_rows_with_nulls, local_mapping_index, global_mapping_index, block_cardinality, diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp index 473ad99e650..58bdfb250f5 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices.hpp +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,19 +22,25 @@ #include namespace cudf::groupby::detail::hash { + +/* + * @brief Computes the maximum number of active blocks of the shared memory aggregation kernel that + * can be executed on the underlying device. + */ +int32_t max_active_blocks_shmem_aggs_kernel(); + /* - * @brief Computes the maximum number of active blocks of the given kernel that can be executed on - * the underlying device + * @brief Computes the maximum number of active blocks of the mapping indices kernel that can be + * executed on the underlying device. */ template -[[nodiscard]] cudf::size_type max_occupancy_grid_size(cudf::size_type n); +[[nodiscard]] int32_t max_active_blocks_mapping_kernel(); template void compute_mapping_indices(cudf::size_type grid_size, cudf::size_type num, SetRef global_set, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu index 81c3c9e456f..01d4657e20a 100644 --- a/cpp/src/groupby/hash/compute_mapping_indices_null.cu +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
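// [Editor's sketch] Throughout this patch the separate `skip_rows_with_nulls`
// flag is dropped and a null `row_bitmask` pointer itself now means "no rows
// are skipped". A minimal illustration of that sentinel-pointer pattern,
// assuming cudf's public bit utilities; the helper name is hypothetical.

#include <cudf/types.hpp>
#include <cudf/utilities/bit.hpp>

__device__ bool row_is_valid(cudf::bitmask_type const* row_bitmask, cudf::size_type idx)
{
  // nullptr encodes "keys contain no nulls", replacing the old boolean flag,
  // so a single pointer now carries both pieces of information.
  return row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, idx);
}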
@@ -18,15 +18,14 @@ #include "compute_mapping_indices.hpp" namespace cudf::groupby::detail::hash { -template cudf::size_type -max_occupancy_grid_size>(cudf::size_type n); +template int32_t +max_active_blocks_mapping_kernel>(); template void compute_mapping_indices>( cudf::size_type grid_size, cudf::size_type num, nullable_hash_set_ref_t global_set, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 878126bbea5..fc24dd727d4 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -105,7 +105,6 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c __device__ void compute_pre_aggregrations(cudf::size_type col_start, cudf::size_type col_end, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::table_device_view source, cudf::size_type num_input_rows, cudf::size_type* local_mapping_index, @@ -118,7 +117,7 @@ __device__ void compute_pre_aggregrations(cudf::size_type col_start, // Aggregates global memory sources to shared memory targets for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows; source_idx += cudf::detail::grid_1d::grid_stride()) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) { + if (not row_bitmask or cudf::bit_is_set(row_bitmask, source_idx)) { auto const target_idx = local_mapping_index[source_idx] + agg_location_offset; for (auto col_idx = col_start; col_idx < col_end; col_idx++) { auto const source_col = source.column(col_idx); @@ -183,7 +182,6 @@ __device__ void compute_final_aggregations(cooperative_groups::thread_block cons * pre (shared) and final (global) aggregates*/ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -248,7 +246,6 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, compute_pre_aggregrations(col_start, col_end, row_bitmask, - skip_rows_with_nulls, input_values, num_rows, local_mapping_index, @@ -287,11 +284,18 @@ size_type get_available_shared_memory_size(cudf::size_type grid_size) ALIGNMENT); } +int32_t max_active_blocks_shmem_aggs_kernel() +{ + int32_t max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, single_pass_shmem_aggs_kernel, GROUPBY_BLOCK_SIZE, 0)); + return max_active_blocks; +} + void compute_shared_memory_aggs(cudf::size_type grid_size, size_type available_shmem_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, @@ -310,7 +314,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, - skip_rows_with_nulls, local_mapping_index, global_mapping_index, block_cardinality, diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index b6ba6898c07..c361a1048b6 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ 
b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -33,7 +33,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, cudf::size_type available_shmem_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, - bool skip_rows_with_nulls, cudf::size_type* local_mapping_index, cudf::size_type* global_mapping_index, cudf::size_type* block_cardinality, diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 759a29840f4..c4c96ec210d 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -56,8 +56,27 @@ using row_hash_t = cudf::experimental::row::hash::device_row_hasher; +/// Adapter to cudf row hasher with caching support. +class row_hasher_with_cache_t { + row_hash_t hasher; + hash_value_type const* values; + + public: + row_hasher_with_cache_t(row_hash_t const& hasher, + hash_value_type const* values = nullptr) noexcept + : hasher(hasher), values(values) + { + } + + __device__ hash_value_type operator()(size_type const idx) const noexcept + { + if (values) { return values[idx]; } + return hasher(idx); + } +}; + /// Probing scheme type used by groupby hash table -using probing_scheme_t = cuco::linear_probing; +using probing_scheme_t = cuco::linear_probing; using row_comparator_t = cudf::experimental::row::equality::device_row_comparator< false, diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 5b810145a2e..b4606cd08a0 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -191,7 +191,6 @@ struct global_memory_fallback_fn { cudf::size_type* block_cardinality; cudf::size_type stride; bitmask_type const* __restrict__ row_bitmask; - bool skip_rows_with_nulls; global_memory_fallback_fn(SetType set, cudf::table_device_view input_values, @@ -199,16 +198,14 @@ struct global_memory_fallback_fn { cudf::aggregation::Kind const* aggs, cudf::size_type* block_cardinality, cudf::size_type stride, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls) + bitmask_type const* row_bitmask) : set(set), input_values(input_values), output_values(output_values), aggs(aggs), block_cardinality(block_cardinality), stride(stride), - row_bitmask(row_bitmask), - skip_rows_with_nulls(skip_rows_with_nulls) + row_bitmask(row_bitmask) { } @@ -216,7 +213,7 @@ struct global_memory_fallback_fn { { auto const block_id = (i % stride) / GROUPBY_BLOCK_SIZE; if (block_cardinality[block_id] >= GROUPBY_CARDINALITY_THRESHOLD and - (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i))) { + (not row_bitmask or cudf::bit_is_set(row_bitmask, i))) { auto const result = set.insert_and_find(i); cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs); } @@ -256,7 +253,6 @@ struct compute_single_pass_aggs_fn { mutable_table_device_view output_values; aggregation::Kind const* __restrict__ aggs; bitmask_type const* __restrict__ row_bitmask; - bool skip_rows_with_nulls; /** * @brief Construct a new compute_single_pass_aggs_fn functor object @@ -270,28 +266,23 @@ struct compute_single_pass_aggs_fn { * columns of the `input_values` rows * @param row_bitmask Bitmask where bit `i` indicates the presence of a null * value in row `i` of input keys. Only used if `skip_rows_with_nulls` is `true` - * @param skip_rows_with_nulls Indicates if rows in `input_keys` containing - * null values should be skipped. 
It `true`, it is assumed `row_bitmask` is a - * bitmask where bit `i` indicates the presence of a null value in row `i`. */ compute_single_pass_aggs_fn(SetType set, table_device_view input_values, mutable_table_device_view output_values, aggregation::Kind const* aggs, - bitmask_type const* row_bitmask, - bool skip_rows_with_nulls) + bitmask_type const* row_bitmask) : set(set), input_values(input_values), output_values(output_values), aggs(aggs), - row_bitmask(row_bitmask), - skip_rows_with_nulls(skip_rows_with_nulls) + row_bitmask(row_bitmask) { } __device__ void operator()(size_type i) { - if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, i)) { + if (not row_bitmask or cudf::bit_is_set(row_bitmask, i)) { auto const result = set.insert_and_find(i); cudf::detail::aggregate_row(output_values, *result.first, input_values, i, aggs);

From 21b69cf8c31c42bf981c5c8987b4d0d6ff36b3d5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:40:01 -0700 Subject: [PATCH 164/366] Support decimal columns in cudf_polars (#19589)

closes https://github.com/rapidsai/cudf/issues/18863

On top of https://github.com/rapidsai/cudf/pull/19587

Notes: * I think there was an error in `pylibcudf.Scalar.to/from_py` with `decimal.Decimal`s. `py-polars/tests/unit/datatypes/test_decimal.py::test_decimal_dynamic_float_st` originally failed because the `scale` of a `pylibcudf.Scalar` was flipped. * Polars `Decimal` supports `precision=None` meaning "precision will be inferred". The xfailed tests in `plugin.py` are failing because IIUC [we cannot set the result back to `precision=None`](https://github.com/pola-rs/polars/issues/23899), so we set the precision to the max libcudf precision instead. I'm not sure if we're OK with this difference (and should document it) or should find a way to set `precision=None`

Authors: - Matthew Roeschke (https://github.com/mroeschke)

Approvers: - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19589 --- .../cudf_polars/containers/dataframe.py | 19 ++++++++++---- .../cudf_polars/containers/datatype.py | 2 ++ python/cudf_polars/cudf_polars/dsl/ir.py | 10 ++++++++ .../cudf_polars/cudf_polars/testing/plugin.py | 2 ++ .../tests/expressions/test_literal.py | 6 ++++- python/cudf_polars/tests/test_groupby.py | 3 +++ python/cudf_polars/tests/test_scan.py | 9 +++++++ python/cudf_polars/tests/test_select.py | 25 +++++++++++++++++++ python/pylibcudf/pylibcudf/scalar.pyx | 8 +++--- python/pylibcudf/tests/test_interop.py | 2 +- 10 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 43ec63738b2..3a095be3cfe 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -29,17 +29,26 @@ def _create_polars_column_metadata( name: str, dtype: PolarsDataType ) -> plc.interop.ColumnMetadata: - """Create ColumnMetadata preserving pl.Struct field names.""" + """Create ColumnMetadata preserving dtype attributes not supported by libcudf.""" + children_meta = [] + timezone = "" + precision: int | None = None + if isinstance(dtype, pl.Struct): children_meta = [ _create_polars_column_metadata(field.name, field.dtype) for field in dtype.fields ] - else: - children_meta = [] - timezone = dtype.time_zone if isinstance(dtype, pl.Datetime) else None + elif isinstance(dtype, pl.Datetime):
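# [Editor's note] A hedged illustration of the sign convention discussed in the
# commit message above: Python's decimal module reports fractional digits as a
# *negative* exponent, and libcudf's decimal types use the same negative-`scale`
# convention, so no extra sign flip is needed between them. For example:
#     import decimal
#     d = decimal.Decimal("1.23")
#     d.as_tuple().exponent   # -2
#     int(d.scaleb(2))        # 123; DECIMAL128 with scale=-2 stores 123 * 10**-2 == 1.23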
timezone = dtype.time_zone or timezone + elif isinstance(dtype, pl.Decimal): + precision = dtype.precision + return plc.interop.ColumnMetadata( - name=name, timezone=timezone or "", children_meta=children_meta + name=name, + timezone=timezone, + precision=precision, + children_meta=children_meta, ) diff --git a/python/cudf_polars/cudf_polars/containers/datatype.py b/python/cudf_polars/cudf_polars/containers/datatype.py index 5de610425ed..50a5352612a 100644 --- a/python/cudf_polars/cudf_polars/containers/datatype.py +++ b/python/cudf_polars/cudf_polars/containers/datatype.py @@ -81,6 +81,8 @@ def _from_polars(dtype: pl.DataType) -> plc.DataType: assert_never(dtype.time_unit) elif isinstance(dtype, pl.String): return plc.DataType(plc.TypeId.STRING) + elif isinstance(dtype, pl.Decimal): + return plc.DataType(plc.TypeId.DECIMAL128, scale=-dtype.scale) elif isinstance(dtype, pl.Null): # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 161c8f4a576..7783b15a207 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1467,6 +1467,16 @@ def do_evaluate( else: (child,) = value.children col = child.evaluate(df, context=ExecutionContext.GROUPBY).obj + + if value.name == "median" and col.type().id() in { + plc.TypeId.DECIMAL128, + plc.TypeId.DECIMAL64, + plc.TypeId.DECIMAL32, + }: + # libcudf doesn't support median (quantile) with decimal types, + # but Polars returns a float result, so just cast the input. + assert isinstance(child, expr.Col) + col = plc.unary.cast(col, schema[child.name].plc) else: # Anything else, we pre-evaluate col = value.evaluate(df, context=ExecutionContext.GROUPBY).obj diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 14956d2cfbc..5038a0d5690 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -175,6 +175,8 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "allow_missing_columns argument in read_parquet not translated in IR", + "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/pola-rs/polars/issues/23899", + "tests/unit/datatypes/test_decimal.py::test_decimal_arithmetic_schema": "https://github.com/pola-rs/polars/issues/23899", } diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index 69ee80da82e..1c2eb05ebfe 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import datetime + import pytest import polars as pl @@ -95,7 +97,9 @@ def test_select_literal_series(): assert_gpu_result_equal(q) -@pytest.mark.parametrize("expr", [pl.lit(None), pl.lit(10, dtype=pl.Decimal())]) +@pytest.mark.parametrize( + "expr", [pl.lit(None), pl.lit(datetime.time(12, 0), dtype=pl.Time())] +) def test_unsupported_literal_raises(expr): df = pl.LazyFrame({}) diff --git 
a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 36196522f34..38ece61457e 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import decimal import itertools import random from datetime import date @@ -25,6 +26,7 @@ def df(): "key2": [2, 2, 2, 2, 6, 1, 4, 6, 8], "int": [1, 2, 3, 4, 5, 6, 7, 8, 9], "int32": pl.Series([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=pl.Int32()), + "decimal": [decimal.Decimal("1.23"), None, decimal.Decimal("-0.23")] * 3, "uint16_with_null": pl.Series( [1, None, 2, None, None, None, 4, 5, 6], dtype=pl.UInt16() ), @@ -89,6 +91,7 @@ def keys(request): [pl.col("float").quantile(0.3, interpolation="lower")], [pl.col("float").quantile(0.3, interpolation="midpoint")], [pl.col("float").quantile(0.3, interpolation="linear")], + [pl.col("decimal").median()], [ pl.col("datetime").max(), pl.col("datetime").max().dt.is_leap_year().alias("leapyear"), diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 8481105baad..7510fe833be 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import decimal from typing import TYPE_CHECKING import pytest @@ -45,6 +46,14 @@ def df(): "a": [1, 2, 3, None, 4, 5], "b": ["ẅ", "x", "y", "z", "123", "abcd"], "c": [None, None, 4, 5, -1, 0], + "d": [ + decimal.Decimal("1.23"), + None, + decimal.Decimal("0.00"), + None, + decimal.Decimal("-5.67"), + None, + ], } ) diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py index da3f519783b..10fcf9f660d 100644 --- a/python/cudf_polars/tests/test_select.py +++ b/python/cudf_polars/tests/test_select.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import decimal + import pytest import polars as pl @@ -27,6 +29,29 @@ def test_select(): assert_gpu_result_equal(query) +def test_select_decimal(): + ldf = pl.LazyFrame( + {"a": pl.Series(values=[decimal.Decimal("1.0"), None], dtype=pl.Decimal(3, 1))} + ) + query = ldf.select(pl.col("a")) + assert_gpu_result_equal(query) + + +def test_select_decimal_precision_none_result_max_precision(): + ldf = pl.LazyFrame( + { + "a": pl.Series( + values=[decimal.Decimal("1.0"), None], dtype=pl.Decimal(None, 1) + ) + } + ) + query = ldf.select(pl.col("a")) + cpu_result = query.collect() + gpu_result = query.collect(engine="gpu") + assert cpu_result.schema["a"].precision is None + assert gpu_result.schema["a"].precision == 38 + + def test_select_reduce(): ldf = pl.DataFrame( { diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 0d533c960a4..31a93abd6fb 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -275,7 +275,7 @@ cdef class Scalar: return decimal.Decimal( (slr).value().value() ).scaleb( - -(slr).type().scale() + (slr).type().scale() ) else: raise NotImplementedError( @@ -647,12 +647,12 @@ def _(py_val: datetime.date, dtype: DataType | None): @_from_py.register(decimal.Decimal) def _(py_val: decimal.Decimal, dtype: DataType | None): - scale = -py_val.as_tuple().exponent - as_int = int(py_val.scaleb(scale)) + scale = py_val.as_tuple().exponent + as_int = int(py_val.scaleb(-scale)) cdef int128_t val = as_int - dtype = DataType(type_id.DECIMAL128, -scale) + dtype = 
DataType(type_id.DECIMAL128, scale) if dtype.id() != type_id.DECIMAL128: raise TypeError("Expected dtype to be DECIMAL128") diff --git a/python/pylibcudf/tests/test_interop.py b/python/pylibcudf/tests/test_interop.py index 171d70c2496..b1a6e9f2c66 100644 --- a/python/pylibcudf/tests/test_interop.py +++ b/python/pylibcudf/tests/test_interop.py @@ -105,7 +105,7 @@ def test_decimal_other(data_type): [plc.TypeId.DECIMAL128, plc.TypeId.DECIMAL64, plc.TypeId.DECIMAL32], ) def test_decimal_respect_metadata_precision(plc_type, request): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( parse(pa.__version__) < parse("19.0.0") and plc_type in {plc.TypeId.DECIMAL64, plc.TypeId.DECIMAL32}, From f6b9b9eda6c3bdf4587c7a5069f4b2bfa747776b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 19 Aug 2025 14:07:47 -0700 Subject: [PATCH 165/366] Move (most of) test_list.py to new cudf classic test directories (#19574) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19574 --- .../tests/dataframe/indexing/test_setitem.py | 21 +- .../dataframe/methods/test_memory_usage.py | 9 + .../tests/dataframe/methods/test_to_arrow.py | 24 + .../tests/dataframe/methods/test_to_pandas.py | 12 + .../cudf/tests/dataframe/test_constructors.py | 38 + .../cudf/cudf/tests/dtypes/test_listdtype.py | 11 + .../cudf/tests/series/accessors/test_list.py | 536 ++++++++++ .../tests/series/indexing/test_getitem.py | 63 ++ .../tests/series/indexing/test_setitem.py | 67 ++ .../cudf/tests/series/methods/test_astype.py | 32 + .../cudf/tests/series/methods/test_explode.py | 46 + .../tests/series/methods/test_memory_usage.py | 16 + .../cudf/tests/series/test_constructors.py | 35 + python/cudf/cudf/tests/test_list.py | 926 ------------------ 14 files changed, 906 insertions(+), 930 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_memory_usage.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py create mode 100644 python/cudf/cudf/tests/series/accessors/test_list.py diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py index 73f4632fafd..7d2dab7caa6 100644 --- a/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py +++ b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py @@ -1,12 +1,25 @@ # Copyright (c) 2025, NVIDIA CORPORATION. - import pandas as pd -from cudf import DataFrame +import cudf + + +def test_listcol_setitem_retain_dtype(): + df = cudf.DataFrame( + {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]} + ) + df1 = df.head(0) + # Performing a setitem on `b` triggers a `column.column_empty` call + # which tries to create an empty ListColumn. 
+ df1["b"] = df1["c"] + # Performing a copy to trigger a copy dtype which is obtained by accessing + # `ListColumn.children` that would have been corrupted in previous call + # prior to this fix: https://github.com/rapidsai/cudf/pull/10151/ + df2 = df1.copy() + assert df2["a"].dtype == df["a"].dtype def test_setitem_datetime(): - df = DataFrame() - df["date"] = pd.date_range("20010101", "20010105").values + df = cudf.DataFrame({"date": pd.date_range("20010101", "20010105").values}) assert df.date.dtype.kind == "M" diff --git a/python/cudf/cudf/tests/dataframe/methods/test_memory_usage.py b/python/cudf/cudf/tests/dataframe/methods/test_memory_usage.py new file mode 100644 index 00000000000..42185ff6a9c --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_memory_usage.py @@ -0,0 +1,9 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf + + +def test_list_struct_list_memory_usage(): + df = cudf.DataFrame({"a": [[{"b": [1]}]]}) + assert df.memory_usage().sum() == 16 diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py index 2d3174a8225..d29a6bbcfaf 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py @@ -22,6 +22,30 @@ def test_dataframe_to_arrow_preserve_index(preserve_index): assert expect == got +def test_dataframe_list_round_trip(): + data = [{"text": "hello", "list_col": np.asarray([1, 2], dtype="uint32")}] + cudf_arrow = cudf.DataFrame(data).to_arrow() + pdf_arrow = pa.Table.from_pandas(pd.DataFrame(data)) + + for metadata in ( + None, + pdf_arrow.schema.metadata, + cudf_arrow.schema.metadata, + ): + schema = pa.schema( + [ + pa.field("text", pa.string()), + pa.field("list_col", pa.list_(pa.uint32())), + ], + metadata=metadata, + ) + + data = {"text": ["asd", "pqr"], "list_col": [[1, 2, 3], [4, 5]]} + + table = pa.Table.from_pydict(data, schema=schema) + assert_eq(table.to_pandas(), pd.DataFrame(data)) + + def test_datetime_to_arrow(datetime_types_as_str): data = pd.date_range("2000-01-01", "2000-01-02", freq="3600s") gdf = cudf.DataFrame({"timestamp": data.astype(datetime_types_as_str)}) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py b/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py new file mode 100644 index 00000000000..df549bbac5a --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py @@ -0,0 +1,12 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import pytest + +import cudf + + +def test_list_to_pandas_nullable_true(): + df = cudf.DataFrame({"a": cudf.Series([[1, 2, 3]])}) + with pytest.raises(NotImplementedError): + df.to_pandas(nullable=True) diff --git a/python/cudf/cudf/tests/dataframe/test_constructors.py b/python/cudf/cudf/tests/dataframe/test_constructors.py index 59463b812a0..385f4c44c01 100644 --- a/python/cudf/cudf/tests/dataframe/test_constructors.py +++ b/python/cudf/cudf/tests/dataframe/test_constructors.py @@ -403,6 +403,44 @@ def test_from_scalar_typing(request, all_supported_types_as_str): assert len(gdf["b"]) == len(gdf["a"]) +@pytest.mark.parametrize( + "data", + [ + {"a": [[]]}, + {"a": [[None]]}, + {"a": [[1, 2, 3]]}, + {"a": [[1, 2, 3]], "b": [[2, 3, 4]]}, + {"a": [[1, 2, 3, None], [None]], "b": [[2, 3, 4], [5]], "c": None}, + {"a": [[1]], "b": [[1, 2, 3]]}, + pd.DataFrame({"a": [[1, 2, 3]]}), + ], +) +def test_df_list_dtypes(data): + expect = pd.DataFrame(data) + got = cudf.DataFrame(data) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [[]]}, + {"a": [[1, 2, None, 4]]}, + {"a": [["cat", None, "dog"]]}, + { + "a": [[1, 2, 3, None], [4, None, 5]], + "b": [None, ["fish", "bird"]], + "c": [[], []], + }, + {"a": [[1, 2, 3, None], [4, None, 5], None, [6, 7]]}, + ], +) +def test_serialize_list_columns(data): + df = cudf.DataFrame(data) + reconstructed = df.__class__.deserialize(*df.serialize()) + assert_eq(reconstructed, df) + + @pytest.mark.parametrize( "data1, data2", [(1, 2), (1.0, 2.0), (3, 4.0)], diff --git a/python/cudf/cudf/tests/dtypes/test_listdtype.py b/python/cudf/cudf/tests/dtypes/test_listdtype.py index 0477ae50003..84576dc0608 100644 --- a/python/cudf/cudf/tests/dtypes/test_listdtype.py +++ b/python/cudf/cudf/tests/dtypes/test_listdtype.py @@ -7,6 +7,17 @@ from cudf.utils.dtypes import cudf_dtype_to_pa_type +def test_listdtype_hash(): + a = cudf.ListDtype("int64") + b = cudf.ListDtype("int64") + + assert hash(a) == hash(b) + + c = cudf.ListDtype("int32") + + assert hash(a) != hash(c) + + def test_list_dtype_pyarrow_round_trip(all_supported_types_as_str, request): request.applymarker( pytest.mark.xfail( diff --git a/python/cudf/cudf/tests/series/accessors/test_list.py b/python/cudf/cudf/tests/series/accessors/test_list.py new file mode 100644 index 00000000000..18f86dfd7c2 --- /dev/null +++ b/python/cudf/cudf/tests/series/accessors/test_list.py @@ -0,0 +1,536 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import functools +import operator + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import cudf +from cudf.api.types import is_scalar +from cudf.testing import assert_eq +from cudf.utils.dtypes import cudf_dtype_to_pa_type + + +@pytest.mark.parametrize( + "data", + [ + [[]], + [[[]]], + [[0]], + [[0, 1]], + [[0, 1], [2, 3]], + [[[0, 1], [2]], [[3, 4]]], + [[[0, 1, None], None], None, [[3, 2, None], None]], + [[["a", "c", None], None], None, [["b", "d", None], None]], + ], +) +def test_leaves(data): + pa_array = pa.array(data) + while hasattr(pa_array, "flatten"): + pa_array = pa_array.flatten() + + expect = cudf.Series(pa_array) + got = cudf.Series(data).list.leaves + assert_eq( + expect, + got, + check_dtype=not isinstance(pa_array, pa.NullArray), + ) + + +@pytest.mark.parametrize( + "data", + [ + [[]], + [[1, 2, 3], [4, 5]], + [[1, 2, 3], [], [4, 5]], + [[1, 2, 3], None, [4, 5]], + [[None, None], [None]], + [[[[[[1, 2, 3]]]]]], + cudf.Series([[1, 2]]).iloc[0:0], + cudf.Series([None, [1, 2]]).iloc[0:1], + ], +) +def test_len(data): + gsr = cudf.Series(data) + psr = gsr.to_pandas() + + expect = psr.map(lambda x: len(x) if x is not None else None) + got = gsr.list.len() + + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + ("data", "idx"), + [ + ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]), + ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]), + ([[1, 2, 3], []], [[0, 1], []]), + ([[1, 2, 3], [None]], [[0, 1], []]), + ([[1, None, 3], None], [[0, 1], []]), + ], +) +def test_take(data, idx): + ps = pd.Series(data) + gs = cudf.from_pandas(ps) + + expected = pd.Series(zip(ps, idx, strict=True)).map( + lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None + ) + got = gs.list.take(idx) + assert_eq(expected, got) + + +@pytest.mark.parametrize( + ("invalid", "exception"), + [ + ([[0]], pytest.raises(ValueError, match="different size")), + ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")), + ( + [["a", "b"], ["c"]], + pytest.raises( + TypeError, match="should be column of values of index types" + ), + ), + ( + [[[1], [0]], [[0]]], + pytest.raises( + TypeError, match="should be column of values of index types" + ), + ), + ([[0, 1], None], pytest.raises(ValueError, match="contains null")), + ], +) +def test_take_invalid(invalid, exception): + gs = cudf.Series([[0, 1], [2, 3]]) + with exception: + gs.list.take(invalid) + + +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([[1, 1, 2, 2], [], None, [3, 4, 5]], [[1, 2], [], None, [3, 4, 5]]), + ( + [[1.233, np.nan, 1.234, 3.141, np.nan, 1.234]], + [[1.233, 1.234, np.nan, 3.141]], + ), # duplicate nans + ([[1, 1, 2, 2, None, None]], [[1, 2, None]]), # duplicate nulls + ( + [[1.233, np.nan, None, 1.234, 3.141, np.nan, 1.234, None]], + [[1.233, 1.234, np.nan, None, 3.141]], + ), # duplicate nans and nulls + ([[2, None, 1, None, 2]], [[1, 2, None]]), + ([[], []], [[], []]), + ([[], None], [[], None]), + ], +) +def test_unique(data, expected): + """ + Pandas de-duplicates nans and nulls respectively in Series.unique. 
+ `expected` is setup to mimic such behavior + """ + gs = cudf.Series(data, nan_as_null=False) + + got = gs.list.unique().list.sort_values() + expected = cudf.Series(expected, nan_as_null=False).list.sort_values() + + assert_eq(expected, got) + + +def key_func_builder(x, na_position): + return x if x is not None else -1e8 if na_position == "first" else 1e8 + + +@pytest.mark.parametrize( + "data", + [ + [[4, 2, None, 9], [8, 8, 2], [2, 1]], + [[4, 2, None, 9], [8, 8, 2], None], + [[4, 2, None, 9], [], None], + ], +) +@pytest.mark.parametrize( + "index", + [ + None, + pd.Index(["a", "b", "c"]), + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] + ), + ], +) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_position", ["first", "last"]) +@pytest.mark.parametrize("ignore_index", [True, False]) +def test_sort_values(data, index, ascending, na_position, ignore_index): + key_func = functools.partial(key_func_builder, na_position=na_position) + + ps = pd.Series(data, index=index) + gs = cudf.from_pandas(ps) + + expected = ps.apply( + lambda x: sorted(x, key=key_func, reverse=not ascending) + if x is not None + else None + ) + if ignore_index: + expected.reset_index(drop=True, inplace=True) + got = gs.list.sort_values( + ascending=ascending, na_position=na_position, ignore_index=ignore_index + ) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "data, index, expect", + [ + ([[None, None], [None, None]], 0, [None, None]), + ([[1, 2], [3, 4]], 0, [1, 3]), + ([["a", "b"], ["c", "d"]], 1, ["b", "d"]), + ([[1, None], [None, 2]], 1, [None, 2]), + ([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], 1, [[3, 4], [7, 8]]), + ], +) +def test_get(data, index, expect): + sr = cudf.Series(data) + expect = cudf.Series(expect) + got = sr.list.get(index) + + assert_eq(expect, got, check_dtype=not expect.isnull().all()) + + +@pytest.mark.parametrize( + "data", + [ + [{"k": "v1"}, {"k": "v2"}], + [[{"k": "v1", "b": "v2"}], [{"k": "v3", "b": "v4"}]], + [ + [{"k": "v1", "b": [{"c": 10, "d": "v5"}]}], + [{"k": "v3", "b": [{"c": 14, "d": "v6"}]}], + ], + ], +) +@pytest.mark.parametrize("index", [0, 1]) +def test_get_nested_struct_dtype_transfer(data, index): + sr = cudf.Series([data]) + expect = cudf.Series(data[index : index + 1]) + assert_eq(expect, sr.list.get(index)) + + +def test_get_nested_lists(): + sr = cudf.Series( + [ + [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [], [[3, 4], [7, 8]]], + [[], [[9, 10]], [[11, 12], [13, 14]]], + ] + ) + expect = cudf.Series([[[1, 2], [3, 4]], []]) + got = sr.list.get(0) + assert_eq(expect, got) + + +def test_get_default(): + sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) + + assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2)) + assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2, default=cudf.NA)) + assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) + assert_eq(cudf.Series([0, 3, 7]), sr.list.get(-3, default=0)) + assert_eq(cudf.Series([2, 5, 9]), sr.list.get(-1)) + + string_sr = cudf.Series( + [["apple", "banana"], ["carrot", "daffodil", "elephant"]] + ) + assert_eq( + cudf.Series(["default", "elephant"]), + string_sr.list.get(2, default="default"), + ) + + sr_with_null = cudf.Series([[0, cudf.NA], [1]]) + assert_eq(cudf.Series([cudf.NA, 0]), sr_with_null.list.get(1, default=0)) + + sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) + assert_eq(cudf.Series([[3, 4], [7, 8]]), sr_nested.list.get(1)) + assert_eq(cudf.Series([[5, 6], cudf.NA]), sr_nested.list.get(2)) + assert_eq( + 
cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0]) + ) + + +def test_get_ind_sequence(): + # test .list.get() when `index` is a sequence + sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) + assert_eq(cudf.Series([1, 4, 8]), sr.list.get([0, 1, 2])) + assert_eq(cudf.Series([1, 4, 8]), sr.list.get(cudf.Series([0, 1, 2]))) + assert_eq(cudf.Series([cudf.NA, 5, cudf.NA]), sr.list.get([2, 2, -5])) + assert_eq(cudf.Series([0, 5, 0]), sr.list.get([2, 2, -5], default=0)) + sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) + assert_eq(cudf.Series([[1, 2], [7, 8]]), sr_nested.list.get([0, 1])) + + +@pytest.mark.parametrize( + "data, scalar, expect", + [ + ( + [[1, 2, 3], []], + 1, + [True, False], + ), + ( + [[1, 2, 3], [], [3, 4, 5]], + 6, + [False, False, False], + ), + ( + [[1.0, 2.0, 3.0], None, []], + 2.0, + [True, None, False], + ), + ( + [[None, "b", "c"], [], ["b", "e", "f"]], + "b", + [True, False, True], + ), + ([[None, 2, 3], None, []], 1, [False, None, False]), + ( + [[None, "b", "c"], [], ["b", "e", "f"]], + "d", + [False, False, False], + ), + ], +) +def test_contains_scalar(data, scalar, expect): + sr = cudf.Series(data) + expect = cudf.Series(expect) + got = sr.list.contains( + pa.scalar(scalar, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data, expect", + [ + ( + [[1, 2, 3], []], + [None, None], + ), + ( + [[1.0, 2.0, 3.0], None, []], + [None, None, None], + ), + ( + [[None, 2, 3], [], None], + [None, None, None], + ), + ( + [[1, 2, 3], [3, 4, 5]], + [None, None], + ), + ( + [[], [], []], + [None, None, None], + ), + ], +) +def test_contains_null_search_key(data, expect): + sr = cudf.Series(data) + expect = cudf.Series(expect, dtype="bool") + got = sr.list.contains( + pa.scalar(None, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data, scalar", + [ + ( + [[9, 0, 2], [], [1, None, 0]], + "x", + ), + ( + [["z", "y", None], None, [None, "x"]], + 5, + ), + ], +) +def test_contains_invalid(data, scalar): + sr = cudf.Series(data) + with pytest.raises( + TypeError, + match="Type/Scale of search key does not " + "match list column element type.", + ): + sr.list.contains(scalar) + + +@pytest.mark.parametrize( + "data, search_key, expect", + [ + ( + [[1, 2, 3], [], [3, 4, 5]], + 3, + [2, -1, 0], + ), + ( + [[1.0, 2.0, 3.0], None, [2.0, 5.0]], + 2.0, + [1, None, 0], + ), + ( + [[None, "b", "c"], [], ["b", "e", "f"]], + "f", + [-1, -1, 2], + ), + ([[-5, None, 8], None, []], -5, [0, None, -1]), + ( + [[None, "x", None, "y"], ["z", "i", "j"]], + "y", + [3, -1], + ), + ( + [["h", "a", None], ["t", "g"]], + ["a", "b"], + [1, -1], + ), + ( + [None, ["h", "i"], ["p", "k", "z"]], + ["x", None, "z"], + [None, None, 2], + ), + ( + [["d", None, "e"], [None, "f"], []], + pa.scalar(None, type=pa.string()), + [None, None, None], + ), + ( + [None, [10, 9, 8], [5, 8, None]], + pa.scalar(None, type=pa.int64()), + [None, None, None], + ), + ], +) +def test_index(data, search_key, expect): + sr = cudf.Series(data) + expect = cudf.Series(expect, dtype="int32") + if is_scalar(search_key): + got = sr.list.index( + pa.scalar( + search_key, type=cudf_dtype_to_pa_type(sr.dtype.element_type) + ) + ) + else: + got = sr.list.index( + cudf.Series(search_key, dtype=sr.dtype.element_type) + ) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data, search_key", + [ + ( + [[9, None, 8], [], [7, 6, 5]], + "c", + ), + ( + [["a", "b", "c"], None, 
[None, "d"]], + 2, + ), + ( + [["e", "s"], ["t", "w"]], + [5, 6], + ), + ], +) +def test_index_invalid_type(data, search_key): + sr = cudf.Series(data) + with pytest.raises( + TypeError, + match="Type/Scale of search key does not " + "match list column element type.", + ): + sr.list.index(search_key) + + +@pytest.mark.parametrize( + "data, search_key", + [ + ( + [[5, 8], [2, 6]], + [8, 2, 4], + ), + ( + [["h", "j"], ["p", None], ["t", "z"]], + ["j", "a"], + ), + ], +) +def test_index_invalid_length(data, search_key): + sr = cudf.Series(data) + with pytest.raises( + RuntimeError, + match="Number of search keys must match list column size.", + ): + sr.list.index(search_key) + + +@pytest.mark.parametrize( + "row", + [ + [[]], + [[1]], + [[1, 2]], + [[1, 2], [3, 4, 5]], + [[1, 2], [], [3, 4, 5]], + [[1, 2, None], [3, 4, 5]], + [[1, 2, None], None, [3, 4, 5]], + [[1, 2, None], None, [], [3, 4, 5]], + [[[1, 2], [3, 4]], [[5, 6, 7], [8, 9]]], + [[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]], + ], +) +@pytest.mark.parametrize("dropna", [True, False]) +def test_concat_elements(row, dropna): + if any(x is None for x in row): + if dropna: + row = [x for x in row if x is not None] + result = functools.reduce(operator.add, row) + else: + result = None + else: + result = functools.reduce(operator.add, row) + + expect = pd.Series([result]) + got = cudf.Series([row]).list.concat(dropna=dropna) + assert_eq(expect, got) + + +def test_concat_elements_raise(): + s = cudf.Series([[1, 2, 3]]) # no nesting + with pytest.raises( + ValueError, + match=".*Child of the input lists column must also be a lists column", + ): + s.list.concat() + + +def test_list_iterate_error(): + s = cudf.Series([[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]]) + with pytest.raises(TypeError, match="ListMethods object is not iterable"): + iter(s.list) + + +def test_list_methods_setattr(): + ser = cudf.Series([["a", "b", "c"], ["d", "e", "f"]]) + + with pytest.raises(AttributeError): + ser.list.a = "b" diff --git a/python/cudf/cudf/tests/series/indexing/test_getitem.py b/python/cudf/cudf/tests/series/indexing/test_getitem.py index 3ed2ef57d9d..aecc70335f1 100644 --- a/python/cudf/cudf/tests/series/indexing/test_getitem.py +++ b/python/cudf/cudf/tests/series/indexing/test_getitem.py @@ -1,12 +1,75 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
import pandas as pd +import pyarrow as pa import pytest import cudf from cudf.testing import assert_eq +@pytest.mark.parametrize( + "data", + [ + [1], + [1, 2, 3], + [[1, 2, 3], [4, 5, 6]], + [pd.NA], + [1, pd.NA, 3], + [[1, pd.NA, 3], [pd.NA, 5, 6]], + [[1.1, pd.NA, 3.3], [4.4, 5.5, pd.NA]], + [["a", pd.NA, "c"], ["d", "e", pd.NA]], + [["a", "b", "c"], ["d", "e", "f"]], + ], +) +def test_list_getitem(data): + list_sr = cudf.Series([data]) + assert list_sr[0] == data + + +@pytest.mark.parametrize("nesting_level", [1, 3]) +def test_list_scalar_device_construction_null(nesting_level): + data = [[]] + for i in range(nesting_level - 1): + data = [data] + + arrow_type = pa.infer_type(data) + arrow_arr = pa.array([None], type=arrow_type) + + res = cudf.Series(arrow_arr)[0] + assert res is cudf.NA + + +@pytest.mark.parametrize( + "data, idx", + [ + ( + [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": pd.NA}]], + 0, + ), + ( + [ + [ + {"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"}, + {"f1": "sf12", "f2": pd.NA}, + ] + ], + 0, + ), + ( + [[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]], + 0, + ), + ([[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]], 2), + ([[[{"a": 1, "b": 2, "c": 10}]]], 0), + ], +) +def test_nested_list_extract_host_scalars(data, idx): + series = cudf.Series(data) + + assert series[idx] == data[idx] + + @pytest.mark.parametrize( "data, idx, expected", [ diff --git a/python/cudf/cudf/tests/series/indexing/test_setitem.py b/python/cudf/cudf/tests/series/indexing/test_setitem.py index f6ef3d4ddb6..4d78d3f4698 100644 --- a/python/cudf/cudf/tests/series/indexing/test_setitem.py +++ b/python/cudf/cudf/tests/series/indexing/test_setitem.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -11,6 +12,72 @@ ) +@pytest.mark.parametrize( + "data,item", + [ + ( + # basic list into a list column + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + [0, 0, 0], + ), + ( + # nested list into nested list column + [ + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + ], + [[0, 0, 0], [0, 0, 0]], + ), + ( + # NA into a list column + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + pd.NA, + ), + ( + # NA into nested list column + [ + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + ], + pd.NA, + ), + ], +) +def test_listcol_setitem(data, item): + sr = cudf.Series(data) + + sr[1] = item + data[1] = item + expect = cudf.Series(data) + + assert_eq(expect, sr) + + +@pytest.mark.parametrize( + "data,item,error_msg,error_type", + [ + ( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + [[1, 2, 3], [4, 5, 6]], + "Could not convert .* with type list: tried to convert to int64", + pa.ArrowInvalid, + ), + ( + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + 0, + "Can not set 0 into ListColumn", + ValueError, + ), + ], +) +def test_listcol_setitem_error_cases(data, item, error_msg, error_type): + sr = cudf.Series(data) + with pytest.raises(error_type, match=error_msg): + sr[1] = item + + def test_fill_new_category(): gs = cudf.Series(pd.Categorical(["a", "b", "c"])) with pytest.raises(TypeError): diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py b/python/cudf/cudf/tests/series/methods/test_astype.py index 30b4fcbdc4e..fb1942fc64a 100644 --- a/python/cudf/cudf/tests/series/methods/test_astype.py +++ b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -11,6 +11,38 @@ from cudf.testing._utils import assert_exceptions_equal +@pytest.mark.parametrize( + "data", + [ + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + [ + [[1, 2, 3], [4, 5, 6]], + [[1, 
2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + ], + [[[1, 2, 3], [4, None, 6]], [], None, [[7, 8], [], None, [9]]], + [[1, 2, 3], [4, None, 6], [7, 8], [], None, [9]], + [[1.0, 2.0, 3.0], [4.0, None, 6.0], [7.0, 8.0], [], None, [9.0]], + ], +) +def test_listcol_as_string(data): + got = cudf.Series(data).astype("str") + expect = pd.Series(data).astype("str") + assert_eq(expect, got) + + +def test_list_astype(): + s = cudf.Series([[1, 2], [3, 4]]) + s2 = s.list.astype("float64") + assert s2.dtype == cudf.ListDtype("float64") + assert_eq(s.list.leaves.astype("float64"), s2.list.leaves) + + s = cudf.Series([[[1, 2], [3]], [[5, 6], None]]) + s2 = s.list.astype("string") + assert s2.dtype == cudf.ListDtype(cudf.ListDtype("string")) + assert_eq(s.list.leaves.astype("string"), s2.list.leaves) + + def test_series_typecast_to_object_error(): actual = cudf.Series([1, 2, 3], dtype="datetime64[ns]") with cudf.option_context("mode.pandas_compatible", True): diff --git a/python/cudf/cudf/tests/series/methods/test_explode.py b/python/cudf/cudf/tests/series/methods/test_explode.py index 0cb54c98aac..f31abec1d2e 100644 --- a/python/cudf/cudf/tests/series/methods/test_explode.py +++ b/python/cudf/cudf/tests/series/methods/test_explode.py @@ -1,5 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import numpy as np import pandas as pd import pytest @@ -29,3 +30,48 @@ def test_explode(data, ignore_index, p_index): got = gdf.explode(ignore_index) assert_eq(expect, got, check_dtype=False) + + +@pytest.fixture(params=["int", "float", "datetime", "timedelta"]) +def leaf_value(request): + if request.param == "int": + return np.int32(1) + elif request.param == "float": + return np.float64(1) + elif request.param == "datetime": + return pd.to_datetime("1900-01-01") + elif request.param == "timedelta": + return pd.to_timedelta("10d") + else: + raise ValueError("Unhandled data type") + + +@pytest.fixture(params=["list", "struct"]) +def list_or_struct(request, leaf_value): + if request.param == "list": + return [[leaf_value], [leaf_value]] + elif request.param == "struct": + return {"a": leaf_value, "b": [leaf_value], "c": {"d": [leaf_value]}} + else: + raise ValueError("Unhandled data type") + + +@pytest.fixture(params=["list", "struct"]) +def nested_list(request, list_or_struct, leaf_value): + if request.param == "list": + return [list_or_struct, list_or_struct] + elif request.param == "struct": + return [ + { + "a": list_or_struct, + "b": leaf_value, + "c": {"d": list_or_struct, "e": leaf_value}, + } + ] + else: + raise ValueError("Unhandled data type") + + +def test_list_dtype_explode(nested_list): + sr = cudf.Series([nested_list]) + assert sr.dtype.element_type == sr.explode().dtype diff --git a/python/cudf/cudf/tests/series/methods/test_memory_usage.py b/python/cudf/cudf/tests/series/methods/test_memory_usage.py index 003e2f61960..4b7c8237e68 100644 --- a/python/cudf/cudf/tests/series/methods/test_memory_usage.py +++ b/python/cudf/cudf/tests/series/methods/test_memory_usage.py @@ -4,6 +4,22 @@ from cudf.testing import assert_eq +def test_memory_usage_list(): + s1 = cudf.Series([[1, 2], [3, 4]]) + assert s1.memory_usage() == 44 + s2 = cudf.Series([[[[1, 2]]], [[[3, 4]]]]) + assert s2.memory_usage() == 68 + s3 = cudf.Series([[{"b": 1, "a": 10}, {"b": 2, "a": 100}]]) + assert s3.memory_usage() == 40 + + +def test_empty_nested_list_uninitialized_offsets_memory_usage(): + ser = cudf.Series( + [[[1, 2], [3]], []], dtype=cudf.ListDtype(cudf.ListDtype("int64")) + ) + assert ser.iloc[:0].memory_usage() == 8 + + def 
test_series_memory_usage(): sr = cudf.Series([1, 2, 3, 4], dtype="int64") assert sr.memory_usage() == 32 diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index 24bdcc6f894..a562a66f312 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -226,6 +226,41 @@ def test_series_init_dict(data): assert_eq(pandas_series, cudf_series) +@pytest.mark.parametrize( + "data", + [ + [[]], + [[[]]], + [[0]], + [[0, 1]], + [[0, 1], [2, 3]], + [[[0, 1], [2]], [[3, 4]]], + [[None]], + [[[None]]], + [[None], None], + [[1, None], [1]], + [[1, None], None], + [[[1, None], None], None], + ], +) +def test_create_list_series(data): + expect = pd.Series(data) + got = cudf.Series(data) + assert_eq(expect, got) + assert isinstance(got[0], type(expect[0])) + assert isinstance(got.to_pandas()[0], type(expect[0])) + + +@pytest.mark.parametrize( + "input_obj", [[[1, pd.NA, 3]], [[1, pd.NA, 3], [4, 5, pd.NA]]] +) +def test_construction_series_with_nulls(input_obj): + expect = pa.array(input_obj, from_pandas=True) + got = cudf.Series(input_obj).to_arrow() + + assert expect == got + + def test_series_unitness_np_datetimelike_units(): data = np.array([np.timedelta64(1)]) with pytest.raises(TypeError): diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index cadd5c80a54..039bb7cbf9c 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,638 +1,11 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. -import functools -import operator -import numpy as np import pandas as pd -import pyarrow as pa import pytest import cudf -from cudf import NA -from cudf.api.types import is_scalar -from cudf.core.column.column import column_empty from cudf.testing import assert_eq -from cudf.utils.dtypes import cudf_dtype_to_pa_type - - -@pytest.mark.parametrize( - "data", - [ - [[]], - [[[]]], - [[0]], - [[0, 1]], - [[0, 1], [2, 3]], - [[[0, 1], [2]], [[3, 4]]], - [[None]], - [[[None]]], - [[None], None], - [[1, None], [1]], - [[1, None], None], - [[[1, None], None], None], - ], -) -def test_create_list_series(data): - expect = pd.Series(data) - got = cudf.Series(data) - assert_eq(expect, got) - assert isinstance(got[0], type(expect[0])) - assert isinstance(got.to_pandas()[0], type(expect[0])) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [[]]}, - {"a": [[None]]}, - {"a": [[1, 2, 3]]}, - {"a": [[1, 2, 3]], "b": [[2, 3, 4]]}, - {"a": [[1, 2, 3, None], [None]], "b": [[2, 3, 4], [5]], "c": None}, - {"a": [[1]], "b": [[1, 2, 3]]}, - pd.DataFrame({"a": [[1, 2, 3]]}), - ], -) -def test_df_list_dtypes(data): - expect = pd.DataFrame(data) - got = cudf.DataFrame(data) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [[]], - [[[]]], - [[0]], - [[0, 1]], - [[0, 1], [2, 3]], - [[[0, 1], [2]], [[3, 4]]], - [[[0, 1, None], None], None, [[3, 2, None], None]], - [[["a", "c", None], None], None, [["b", "d", None], None]], - ], -) -def test_leaves(data): - pa_array = pa.array(data) - while hasattr(pa_array, "flatten"): - pa_array = pa_array.flatten() - - expect = cudf.Series(pa_array) - got = cudf.Series(data).list.leaves - assert_eq( - expect, - got, - check_dtype=not isinstance(pa_array, pa.NullArray), - ) - - -def test_list_to_pandas_nullable_true(): - df = cudf.DataFrame({"a": cudf.Series([[1, 2, 3]])}) - with pytest.raises(NotImplementedError): - df.to_pandas(nullable=True) - - -def test_listdtype_hash(): - a = 
cudf.core.dtypes.ListDtype("int64") - b = cudf.core.dtypes.ListDtype("int64") - - assert hash(a) == hash(b) - - c = cudf.core.dtypes.ListDtype("int32") - - assert hash(a) != hash(c) - - -@pytest.fixture(params=["int", "float", "datetime", "timedelta"]) -def leaf_value(request): - if request.param == "int": - return np.int32(1) - elif request.param == "float": - return np.float64(1) - elif request.param == "datetime": - return pd.to_datetime("1900-01-01") - elif request.param == "timedelta": - return pd.to_timedelta("10d") - else: - raise ValueError("Unhandled data type") - - -@pytest.fixture(params=["list", "struct"]) -def list_or_struct(request, leaf_value): - if request.param == "list": - return [[leaf_value], [leaf_value]] - elif request.param == "struct": - return {"a": leaf_value, "b": [leaf_value], "c": {"d": [leaf_value]}} - else: - raise ValueError("Unhandled data type") - - -@pytest.fixture(params=["list", "struct"]) -def nested_list(request, list_or_struct, leaf_value): - if request.param == "list": - return [list_or_struct, list_or_struct] - elif request.param == "struct": - return [ - { - "a": list_or_struct, - "b": leaf_value, - "c": {"d": list_or_struct, "e": leaf_value}, - } - ] - else: - raise ValueError("Unhandled data type") - - -def test_list_dtype_explode(nested_list): - sr = cudf.Series([nested_list]) - assert sr.dtype.element_type == sr.explode().dtype - - -@pytest.mark.parametrize( - "data", - [ - [[]], - [[1, 2, 3], [4, 5]], - [[1, 2, 3], [], [4, 5]], - [[1, 2, 3], None, [4, 5]], - [[None, None], [None]], - [[[[[[1, 2, 3]]]]]], - cudf.Series([[1, 2]]).iloc[0:0], - cudf.Series([None, [1, 2]]).iloc[0:1], - ], -) -def test_len(data): - gsr = cudf.Series(data) - psr = gsr.to_pandas() - - expect = psr.map(lambda x: len(x) if x is not None else None) - got = gsr.list.len() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - ("data", "idx"), - [ - ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]), - ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]), - ([[1, 2, 3], []], [[0, 1], []]), - ([[1, 2, 3], [None]], [[0, 1], []]), - ([[1, None, 3], None], [[0, 1], []]), - ], -) -def test_take(data, idx): - ps = pd.Series(data) - gs = cudf.from_pandas(ps) - - expected = pd.Series(zip(ps, idx, strict=True)).map( - lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None - ) - got = gs.list.take(idx) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - ("invalid", "exception"), - [ - ([[0]], pytest.raises(ValueError, match="different size")), - ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")), - ( - [["a", "b"], ["c"]], - pytest.raises( - TypeError, match="should be column of values of index types" - ), - ), - ( - [[[1], [0]], [[0]]], - pytest.raises( - TypeError, match="should be column of values of index types" - ), - ), - ([[0, 1], None], pytest.raises(ValueError, match="contains null")), - ], -) -def test_take_invalid(invalid, exception): - gs = cudf.Series([[0, 1], [2, 3]]) - with exception: - gs.list.take(invalid) - - -@pytest.mark.parametrize( - ("data", "expected"), - [ - ([[1, 1, 2, 2], [], None, [3, 4, 5]], [[1, 2], [], None, [3, 4, 5]]), - ( - [[1.233, np.nan, 1.234, 3.141, np.nan, 1.234]], - [[1.233, 1.234, np.nan, 3.141]], - ), # duplicate nans - ([[1, 1, 2, 2, None, None]], [[1, 2, None]]), # duplicate nulls - ( - [[1.233, np.nan, None, 1.234, 3.141, np.nan, 1.234, None]], - [[1.233, 1.234, np.nan, None, 3.141]], - ), # duplicate nans and nulls - ([[2, None, 1, None, 2]], [[1, 2, 
None]]), - ([[], []], [[], []]), - ([[], None], [[], None]), - ], -) -def test_unique(data, expected): - """ - Pandas de-duplicates nans and nulls respectively in Series.unique. - `expected` is setup to mimic such behavior - """ - gs = cudf.Series(data, nan_as_null=False) - - got = gs.list.unique() - expected = cudf.Series(expected, nan_as_null=False).list.sort_values() - - got = got.list.sort_values() - - assert_eq(expected, got) - - -def key_func_builder(x, na_position): - if x is None: - if na_position == "first": - return -1e8 - else: - return 1e8 - else: - return x - - -@pytest.mark.parametrize( - "data", - [ - [[4, 2, None, 9], [8, 8, 2], [2, 1]], - [[4, 2, None, 9], [8, 8, 2], None], - [[4, 2, None, 9], [], None], - ], -) -@pytest.mark.parametrize( - "index", - [ - None, - pd.Index(["a", "b", "c"]), - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] - ), - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_sort_values(data, index, ascending, na_position, ignore_index): - key_func = functools.partial(key_func_builder, na_position=na_position) - - ps = pd.Series(data, index=index) - gs = cudf.from_pandas(ps) - - expected = ps.apply( - lambda x: sorted(x, key=key_func, reverse=not ascending) - if x is not None - else None - ) - if ignore_index: - expected.reset_index(drop=True, inplace=True) - got = gs.list.sort_values( - ascending=ascending, na_position=na_position, ignore_index=ignore_index - ) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data, index, expect", - [ - ([[None, None], [None, None]], 0, [None, None]), - ([[1, 2], [3, 4]], 0, [1, 3]), - ([["a", "b"], ["c", "d"]], 1, ["b", "d"]), - ([[1, None], [None, 2]], 1, [None, 2]), - ([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], 1, [[3, 4], [7, 8]]), - ], -) -def test_get(data, index, expect): - sr = cudf.Series(data) - expect = cudf.Series(expect) - got = sr.list.get(index) - - assert_eq(expect, got, check_dtype=not expect.isnull().all()) - - -@pytest.mark.parametrize( - "data", - [ - [{"k": "v1"}, {"k": "v2"}], - [[{"k": "v1", "b": "v2"}], [{"k": "v3", "b": "v4"}]], - [ - [{"k": "v1", "b": [{"c": 10, "d": "v5"}]}], - [{"k": "v3", "b": [{"c": 14, "d": "v6"}]}], - ], - ], -) -@pytest.mark.parametrize("index", [0, 1]) -def test_get_nested_struct_dtype_transfer(data, index): - sr = cudf.Series([data]) - expect = cudf.Series(data[index : index + 1]) - assert_eq(expect, sr.list.get(index)) - - -def test_get_nested_lists(): - sr = cudf.Series( - [ - [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [], [[3, 4], [7, 8]]], - [[], [[9, 10]], [[11, 12], [13, 14]]], - ] - ) - expect = cudf.Series([[[1, 2], [3, 4]], []]) - got = sr.list.get(0) - assert_eq(expect, got) - - -def test_get_default(): - sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) - - assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2)) - assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2, default=cudf.NA)) - assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) - assert_eq(cudf.Series([0, 3, 7]), sr.list.get(-3, default=0)) - assert_eq(cudf.Series([2, 5, 9]), sr.list.get(-1)) - - string_sr = cudf.Series( - [["apple", "banana"], ["carrot", "daffodil", "elephant"]] - ) - assert_eq( - cudf.Series(["default", "elephant"]), - string_sr.list.get(2, default="default"), - ) - - sr_with_null = cudf.Series([[0, cudf.NA], [1]]) - assert_eq(cudf.Series([cudf.NA, 0]), sr_with_null.list.get(1, default=0)) - - 
sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) - assert_eq(cudf.Series([[3, 4], [7, 8]]), sr_nested.list.get(1)) - assert_eq(cudf.Series([[5, 6], cudf.NA]), sr_nested.list.get(2)) - assert_eq( - cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0]) - ) - - -def test_get_ind_sequence(): - # test .list.get() when `index` is a sequence - sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) - assert_eq(cudf.Series([1, 4, 8]), sr.list.get([0, 1, 2])) - assert_eq(cudf.Series([1, 4, 8]), sr.list.get(cudf.Series([0, 1, 2]))) - assert_eq(cudf.Series([cudf.NA, 5, cudf.NA]), sr.list.get([2, 2, -5])) - assert_eq(cudf.Series([0, 5, 0]), sr.list.get([2, 2, -5], default=0)) - sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) - assert_eq(cudf.Series([[1, 2], [7, 8]]), sr_nested.list.get([0, 1])) - - -@pytest.mark.parametrize( - "data, scalar, expect", - [ - ( - [[1, 2, 3], []], - 1, - [True, False], - ), - ( - [[1, 2, 3], [], [3, 4, 5]], - 6, - [False, False, False], - ), - ( - [[1.0, 2.0, 3.0], None, []], - 2.0, - [True, None, False], - ), - ( - [[None, "b", "c"], [], ["b", "e", "f"]], - "b", - [True, False, True], - ), - ([[None, 2, 3], None, []], 1, [False, None, False]), - ( - [[None, "b", "c"], [], ["b", "e", "f"]], - "d", - [False, False, False], - ), - ], -) -def test_contains_scalar(data, scalar, expect): - sr = cudf.Series(data) - expect = cudf.Series(expect) - got = sr.list.contains( - pa.scalar(scalar, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) - ) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data, expect", - [ - ( - [[1, 2, 3], []], - [None, None], - ), - ( - [[1.0, 2.0, 3.0], None, []], - [None, None, None], - ), - ( - [[None, 2, 3], [], None], - [None, None, None], - ), - ( - [[1, 2, 3], [3, 4, 5]], - [None, None], - ), - ( - [[], [], []], - [None, None, None], - ), - ], -) -def test_contains_null_search_key(data, expect): - sr = cudf.Series(data) - expect = cudf.Series(expect, dtype="bool") - got = sr.list.contains( - pa.scalar(None, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) - ) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data, scalar", - [ - ( - [[9, 0, 2], [], [1, None, 0]], - "x", - ), - ( - [["z", "y", None], None, [None, "x"]], - 5, - ), - ], -) -def test_contains_invalid(data, scalar): - sr = cudf.Series(data) - with pytest.raises( - TypeError, - match="Type/Scale of search key does not " - "match list column element type.", - ): - sr.list.contains(scalar) - - -@pytest.mark.parametrize( - "data, search_key, expect", - [ - ( - [[1, 2, 3], [], [3, 4, 5]], - 3, - [2, -1, 0], - ), - ( - [[1.0, 2.0, 3.0], None, [2.0, 5.0]], - 2.0, - [1, None, 0], - ), - ( - [[None, "b", "c"], [], ["b", "e", "f"]], - "f", - [-1, -1, 2], - ), - ([[-5, None, 8], None, []], -5, [0, None, -1]), - ( - [[None, "x", None, "y"], ["z", "i", "j"]], - "y", - [3, -1], - ), - ( - [["h", "a", None], ["t", "g"]], - ["a", "b"], - [1, -1], - ), - ( - [None, ["h", "i"], ["p", "k", "z"]], - ["x", None, "z"], - [None, None, 2], - ), - ( - [["d", None, "e"], [None, "f"], []], - pa.scalar(None, type=pa.string()), - [None, None, None], - ), - ( - [None, [10, 9, 8], [5, 8, None]], - pa.scalar(None, type=pa.int64()), - [None, None, None], - ), - ], -) -def test_index(data, search_key, expect): - sr = cudf.Series(data) - expect = cudf.Series(expect, dtype="int32") - if is_scalar(search_key): - got = sr.list.index( - pa.scalar( - search_key, type=cudf_dtype_to_pa_type(sr.dtype.element_type) - ) - ) - else: - got = sr.list.index( - 
cudf.Series(search_key, dtype=sr.dtype.element_type) - ) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data, search_key", - [ - ( - [[9, None, 8], [], [7, 6, 5]], - "c", - ), - ( - [["a", "b", "c"], None, [None, "d"]], - 2, - ), - ( - [["e", "s"], ["t", "w"]], - [5, 6], - ), - ], -) -def test_index_invalid_type(data, search_key): - sr = cudf.Series(data) - with pytest.raises( - TypeError, - match="Type/Scale of search key does not " - "match list column element type.", - ): - sr.list.index(search_key) - - -@pytest.mark.parametrize( - "data, search_key", - [ - ( - [[5, 8], [2, 6]], - [8, 2, 4], - ), - ( - [["h", "j"], ["p", None], ["t", "z"]], - ["j", "a"], - ), - ], -) -def test_index_invalid_length(data, search_key): - sr = cudf.Series(data) - with pytest.raises( - RuntimeError, - match="Number of search keys must match list column size.", - ): - sr.list.index(search_key) - - -@pytest.mark.parametrize( - "row", - [ - [[]], - [[1]], - [[1, 2]], - [[1, 2], [3, 4, 5]], - [[1, 2], [], [3, 4, 5]], - [[1, 2, None], [3, 4, 5]], - [[1, 2, None], None, [3, 4, 5]], - [[1, 2, None], None, [], [3, 4, 5]], - [[[1, 2], [3, 4]], [[5, 6, 7], [8, 9]]], - [[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]], - ], -) -@pytest.mark.parametrize("dropna", [True, False]) -def test_concat_elements(row, dropna): - if any(x is None for x in row): - if dropna: - row = [x for x in row if x is not None] - result = functools.reduce(operator.add, row) - else: - result = None - else: - result = functools.reduce(operator.add, row) - - expect = pd.Series([result]) - got = cudf.Series([row]).list.concat(dropna=dropna) - assert_eq(expect, got) - - -def test_concat_elements_raise(): - s = cudf.Series([[1, 2, 3]]) # no nesting - with pytest.raises(ValueError): - s.list.concat() def test_concatenate_rows_of_lists(): @@ -650,302 +23,3 @@ def test_concatenate_list_with_nonlist(): gdf1 = cudf.DataFrame({"A": [["a", "c"], ["b", "d"], ["c", "d"]]}) gdf2 = cudf.DataFrame({"A": ["a", "b", "c"]}) gdf1["A"] + gdf2["A"] - - -@pytest.mark.parametrize( - "data", - [ - [1], - [1, 2, 3], - [[1, 2, 3], [4, 5, 6]], - [NA], - [1, NA, 3], - [[1, NA, 3], [NA, 5, 6]], - ], -) -def test_list_getitem(data): - list_sr = cudf.Series([data]) - assert list_sr[0] == data - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3], - [[1, 2, 3], [4, 5, 6]], - ["a", "b", "c"], - [["a", "b", "c"], ["d", "e", "f"]], - [1.1, 2.2, 3.3], - [[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]], - [1, NA, 3], - [[1, NA, 3], [4, 5, NA]], - ["a", NA, "c"], - [["a", NA, "c"], ["d", "e", NA]], - [1.1, NA, 3.3], - [[1.1, NA, 3.3], [4.4, 5.5, NA]], - ], -) -def test_list_scalar_device_construction(data): - res = cudf.Series([data])._column.element_indexing(0) - assert res == data - - -@pytest.mark.parametrize("nesting_level", [1, 2, 3]) -def test_list_scalar_device_construction_null(nesting_level): - data = [[]] - for i in range(nesting_level - 1): - data = [data] - - arrow_type = pa.infer_type(data) - arrow_arr = pa.array([None], type=arrow_type) - - res = cudf.Series(arrow_arr)._column.element_indexing(0) - assert res is cudf.NA - - -@pytest.mark.parametrize("input_obj", [[[1, NA, 3]], [[1, NA, 3], [4, 5, NA]]]) -def test_construction_series_with_nulls(input_obj): - expect = pa.array(input_obj, from_pandas=True) - got = cudf.Series(input_obj).to_arrow() - - assert expect == got - - -@pytest.mark.parametrize( - "data", - [ - {"a": [[]]}, - {"a": [[1, 2, None, 4]]}, - {"a": [["cat", None, "dog"]]}, - { - "a": [[1, 2, 3, None], [4, None, 5]], - "b": [None, ["fish", 
"bird"]], - "c": [[], []], - }, - {"a": [[1, 2, 3, None], [4, None, 5], None, [6, 7]]}, - ], -) -def test_serialize_list_columns(data): - df = cudf.DataFrame(data) - recreated = df.__class__.deserialize(*df.serialize()) - assert_eq(recreated, df) - - -@pytest.mark.parametrize( - "data,item", - [ - ( - # basic list into a list column - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - [0, 0, 0], - ), - ( - # nested list into nested list column - [ - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - ], - [[0, 0, 0], [0, 0, 0]], - ), - ( - # NA into a list column - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - NA, - ), - ( - # NA into nested list column - [ - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - ], - NA, - ), - ], -) -def test_listcol_setitem(data, item): - sr = cudf.Series(data) - - sr[1] = item - data[1] = item - expect = cudf.Series(data) - - assert_eq(expect, sr) - - -@pytest.mark.parametrize( - "data", - [ - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - [ - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - ], - [[[1, 2, 3], [4, None, 6]], [], None, [[7, 8], [], None, [9]]], - [[1, 2, 3], [4, None, 6], [7, 8], [], None, [9]], - [[1.0, 2.0, 3.0], [4.0, None, 6.0], [7.0, 8.0], [], None, [9.0]], - ], -) -def test_listcol_as_string(data): - got = cudf.Series(data).astype("str") - expect = pd.Series(data).astype("str") - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data,item,error", - [ - ( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - [[1, 2, 3], [4, 5, 6]], - "Could not convert .* with type list: tried to convert to int64", - ), - ( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - 0, - "Can not set 0 into ListColumn", - ), - ], -) -def test_listcol_setitem_error_cases(data, item, error): - sr = cudf.Series(data) - with pytest.raises(BaseException, match=error): - sr[1] = item - - -def test_listcol_setitem_retain_dtype(): - df = cudf.DataFrame( - {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]} - ) - df1 = df.head(0) - # Performing a setitem on `b` triggers a `column.column_empty` call - # which tries to create an empty ListColumn. 
- df1["b"] = df1["c"] - # Performing a copy to trigger a copy dtype which is obtained by accessing - # `ListColumn.children` that would have been corrupted in previous call - # prior to this fix: https://github.com/rapidsai/cudf/pull/10151/ - df2 = df1.copy() - assert df2["a"].dtype == df["a"].dtype - - -def test_list_astype(): - s = cudf.Series([[1, 2], [3, 4]]) - s2 = s.list.astype("float64") - assert s2.dtype == cudf.ListDtype("float64") - assert_eq(s.list.leaves.astype("float64"), s2.list.leaves) - - s = cudf.Series([[[1, 2], [3]], [[5, 6], None]]) - s2 = s.list.astype("string") - assert s2.dtype == cudf.ListDtype(cudf.ListDtype("string")) - assert_eq(s.list.leaves.astype("string"), s2.list.leaves) - - -def test_memory_usage(): - s1 = cudf.Series([[1, 2], [3, 4]]) - assert s1.memory_usage() == 44 - s2 = cudf.Series([[[[1, 2]]], [[[3, 4]]]]) - assert s2.memory_usage() == 68 - s3 = cudf.Series([[{"b": 1, "a": 10}, {"b": 2, "a": 100}]]) - assert s3.memory_usage() == 40 - - -@pytest.mark.parametrize( - "data, idx", - [ - ( - [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": NA}]], - 0, - ), - ( - [ - [ - {"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"}, - {"f1": "sf12", "f2": NA}, - ] - ], - 0, - ), - ( - [[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]], - 0, - ), - ([[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]], 2), - ([[[{"a": 1, "b": 2, "c": 10}]]], 0), - ], -) -def test_nested_list_extract_host_scalars(data, idx): - series = cudf.Series(data) - - assert series[idx] == data[idx] - - -def test_list_iterate_error(): - s = cudf.Series([[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]]) - with pytest.raises(TypeError): - iter(s.list) - - -def test_list_struct_list_memory_usage(): - df = cudf.DataFrame({"a": [[{"b": [1]}]]}) - assert df.memory_usage().sum() == 16 - - -def test_empty_nested_list_uninitialized_offsets_memory_usage(): - col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64"))) - nested_col = col.children[1] - empty_inner = type(nested_col)( - data=None, - size=nested_col.size, - dtype=nested_col.dtype, - mask=nested_col.mask, - offset=nested_col.offset, - null_count=nested_col.null_count, - children=( - column_empty(0, nested_col.children[0].dtype), - nested_col.children[1], - ), - ) - col_empty_offset = type(col)( - data=None, - size=col.size, - dtype=col.dtype, - mask=col.mask, - offset=col.offset, - null_count=col.null_count, - children=(column_empty(0, col.children[0].dtype), empty_inner), - ) - ser = cudf.Series._from_column(col_empty_offset) - assert ser.memory_usage() == 8 - - -def test_list_methods_setattr(): - ser = cudf.Series([["a", "b", "c"], ["d", "e", "f"]]) - - with pytest.raises(AttributeError): - ser.list.a = "b" - - -def test_dataframe_list_round_trip(): - data = [{"text": "hello", "list_col": np.asarray([1, 2], dtype="uint32")}] - cudf_arrow = cudf.DataFrame(data).to_arrow() - pdf_arrow = pa.Table.from_pandas(pd.DataFrame(data)) - - for metadata in [ - None, - pdf_arrow.schema.metadata, - cudf_arrow.schema.metadata, - ]: - schema = pa.schema( - [ - pa.field("text", pa.string()), - pa.field("list_col", pa.list_(pa.uint32())), - ], - metadata=metadata, - ) - - data = {"text": ["asd", "pqr"], "list_col": [[1, 2, 3], [4, 5]]} - - table = pa.Table.from_pydict(data, schema=schema) - assert_eq(table.to_pandas(), pd.DataFrame(data)) From 783feb44dc3399da07d2a6e3d95668e6afbdb56e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 19 Aug 2025 14:17:28 -0700 Subject: [PATCH 166/366] Add streams to column APIs (#19726) Contributes to #15163 Authors: - Vyas Ramasubramani 
(https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19726 --- python/pylibcudf/pylibcudf/column.pxd | 2 +- python/pylibcudf/pylibcudf/column.pyx | 84 ++++++++++++++----- .../libcudf/column/column_factories.pxd | 6 ++ 3 files changed, 69 insertions(+), 23 deletions(-) diff --git a/python/pylibcudf/pylibcudf/column.pxd b/python/pylibcudf/pylibcudf/column.pxd index f209b6c3178..51fa36f8913 100644 --- a/python/pylibcudf/pylibcudf/column.pxd +++ b/python/pylibcudf/pylibcudf/column.pxd @@ -72,7 +72,7 @@ cdef class Column: Column base=*, ) - cpdef Scalar to_scalar(self) + cpdef Scalar to_scalar(self, Stream stream=*) cpdef DataType type(self) cpdef Column child(self, size_type index) cpdef size_type num_children(self) diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index 40ca7b0e4c3..8773c123d75 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -32,7 +32,6 @@ from pylibcudf.libcudf.strings.strings_column_view cimport strings_column_view from pylibcudf.libcudf.types cimport size_type, size_of as cpp_size_of, bitmask_type from pylibcudf.libcudf.utilities.traits cimport is_fixed_width from pylibcudf.libcudf.copying cimport get_element -from pylibcudf.libcudf.utilities.default_stream cimport get_default_stream from rmm.pylibrmm.device_buffer cimport DeviceBuffer @@ -330,7 +329,11 @@ cdef class Column: self._num_children = len(children) @staticmethod - def from_arrow(obj: ArrowLike, dtype: DataType | None = None) -> Column: + def from_arrow( + obj: ArrowLike, + dtype: DataType | None = None, + Stream stream=None + ) -> Column: """ Create a Column from an Arrow-like object using the Arrow C Data Interface. @@ -348,6 +351,8 @@ cdef class Column: - `__arrow_c_device_stream__` (device Arrow stream) dtype : DataType | None The pylibcudf data type. + stream : Stream | None + CUDA stream on which to perform the operation. 
Returns ------- @@ -375,6 +380,8 @@ cdef class Column: cdef ArrowDeviceArray* c_device_array cdef _ArrowColumnHolder result cdef unique_ptr[arrow_column] c_result + + stream = _get_stream(stream) if hasattr(obj, "__arrow_c_device_array__"): schema, d_array = obj.__arrow_c_device_array__() c_schema = PyCapsule_GetPointer(schema, "arrow_schema") @@ -385,7 +392,9 @@ cdef class Column: result = _ArrowColumnHolder() with nogil: c_result = make_unique[arrow_column]( - move(dereference(c_schema)), move(dereference(c_device_array)) + move(dereference(c_schema)), + move(dereference(c_device_array)), + stream.view(), ) result.col.swap(c_result) @@ -398,21 +407,26 @@ cdef class Column: result = _ArrowColumnHolder() with nogil: c_result = make_unique[arrow_column]( - move(dereference(c_schema)), move(dereference(c_array)) + move(dereference(c_schema)), + move(dereference(c_array)), + stream.view(), ) result.col.swap(c_result) return Column.from_column_view_of_arbitrary(result.col.get().view(), result) elif hasattr(obj, "__arrow_c_stream__"): - stream = obj.__arrow_c_stream__() - c_stream = ( - PyCapsule_GetPointer(stream, "arrow_array_stream") + arrow_stream = obj.__arrow_c_stream__() + c_arrow_stream = ( + PyCapsule_GetPointer( + arrow_stream, + "arrow_array_stream", + ) ) result = _ArrowColumnHolder() with nogil: c_result = make_unique[arrow_column]( - move(dereference(c_stream)) + move(dereference(c_arrow_stream)), stream.view() ) result.col.swap(c_result) @@ -561,7 +575,7 @@ cdef class Column: cdef gpumemoryview mask = None if null_count > 0: mask = gpumemoryview( - DeviceBuffer.c_from_unique_ptr(move(contents.null_mask)) + DeviceBuffer.c_from_unique_ptr(move(contents.null_mask), stream) ) children = [] @@ -678,7 +692,7 @@ cdef class Column: ) @staticmethod - def from_scalar(Scalar slr, size_type size): + def from_scalar(Scalar slr, size_type size, Stream stream=None): """Create a Column from a Scalar. Parameters @@ -687,6 +701,8 @@ cdef class Column: The scalar to create a column from. size : size_type The number of elements in the column. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -695,11 +711,16 @@ cdef class Column: """ cdef const scalar* c_scalar = slr.get() cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = make_column_from_scalar(dereference(c_scalar), size) - return Column.from_libcudf(move(c_result)) + c_result = make_column_from_scalar( + dereference(c_scalar), + size, + stream.view() + ) + return Column.from_libcudf(move(c_result), stream) - cpdef Scalar to_scalar(self): + cpdef Scalar to_scalar(self, Stream stream=None): """ Return the first value of 1-element column as a Scalar. @@ -707,6 +728,8 @@ cdef class Column: ------ ValueError If the column has more than one row. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- @@ -718,14 +741,15 @@ cdef class Column: cdef column_view cv = self.view() cdef unique_ptr[scalar] result + stream = _get_stream(stream) with nogil: - result = get_element(cv, 0, get_default_stream()) + result = get_element(cv, 0, stream.view()) return Scalar.from_libcudf(move(result)) @staticmethod - def all_null_like(Column like, size_type size): + def all_null_like(Column like, size_type size, Stream stream=None): """Create an all null column from a template. Parameters @@ -734,6 +758,8 @@ cdef class Column: Column whose type we should mimic size : int Number of rows in the resulting column. 
+        stream : Stream | None
+            CUDA stream on which to perform the operation.
 
         Returns
         -------
@@ -742,9 +768,14 @@ cdef class Column:
         """
         cdef Scalar slr = Scalar.empty_like(like)
         cdef unique_ptr[column] c_result
+        stream = _get_stream(stream)
         with nogil:
-            c_result = make_column_from_scalar(dereference(slr.get()), size)
-        return Column.from_libcudf(move(c_result))
+            c_result = make_column_from_scalar(
+                dereference(slr.get()),
+                size,
+                stream.view()
+            )
+        return Column.from_libcudf(move(c_result), stream)
 
     @staticmethod
     cdef Column _wrap_nested_list_column(
@@ -802,7 +833,7 @@ cdef class Column:
         return nested
 
     @classmethod
-    def from_array_interface(cls, obj):
+    def from_array_interface(cls, obj, Stream stream=None):
         """
         Create a Column from an object implementing the NumPy Array Interface.
 
@@ -814,6 +845,8 @@ cdef class Column:
         ----------
         obj : Any
             Must implement the ``__array_interface__`` protocol.
+        stream : Stream | None
+            CUDA stream on which to perform the operation.
 
         Returns
         -------
@@ -840,13 +873,14 @@ cdef class Column:
         cdef const unsigned char* ptr
         cdef const unsigned char[:] view
+        stream = _get_stream(stream)
         if nbytes > 0:
             ptr = data_ptr
             view = ( ptr)[:nbytes]
-            dbuf = DeviceBuffer.to_device(view)
+            dbuf = DeviceBuffer.to_device(view, stream)
         else:
-            dbuf = DeviceBuffer(size=0)
+            dbuf = DeviceBuffer(size=0, stream=stream)
 
         return Column._wrap_nested_list_column(gpumemoryview(dbuf), shape, dtype)
 
@@ -926,7 +960,11 @@ cdef class Column:
         )
 
     @staticmethod
-    def from_iterable_of_py(obj: Iterable, dtype: DataType | None = None) -> Column:
+    def from_iterable_of_py(
+        obj: Iterable,
+        dtype: DataType | None = None,
+        Stream stream=None
+    ) -> Column:
         """
         Create a Column from a Python iterable of scalar values
         or nested iterables.
@@ -936,6 +974,8 @@ cdef class Column:
            An iterable of Python scalar values (int, float, bool, str) or nested lists.
        dtype : DataType | None
            The type of the leaf elements. If not specified, the type is inferred.
+        stream : Stream | None
+            CUDA stream on which to perform the operation.
 
         Returns
         -------
@@ -1027,7 +1067,7 @@ cdef class Column:
             "version": 3,
         }
 
-        return Column.from_array_interface(ArrayInterfaceWrapper(iface))
+        return Column.from_array_interface(ArrayInterfaceWrapper(iface), stream)
 
     @classmethod
     def struct_from_children(cls, children: Iterable[Column]):
diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd
index 60b48a7bb6a..5abc0523f7e 100644
--- a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd
@@ -88,6 +88,12 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil:
         size_type size
     ) except +libcudf_exception_handler
 
+    cdef unique_ptr[column] make_column_from_scalar(
+        const scalar& s,
+        size_type size,
+        cuda_stream_view stream
+    ) except +libcudf_exception_handler
+
     cdef unique_ptr[column] make_dictionary_from_scalar(
         const scalar& s,
         size_type size

From ba64909422016ba389ab06ed01d7578336c19e8e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 19 Aug 2025 17:04:20 -0500
Subject: [PATCH 167/366] Require `numba-cuda>=0.19.0,<0.20.0a0` (#19711)

Updates the `numba-cuda` dependency to `>=0.19.0,<0.20.0a0`. This adds
CUDA 13 support, constrains the `cuda-bindings` version to avoid recent
Cython issues, and brings in several other important API fixes and updates.
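As an illustrative aside (not part of this change), a downstream environment
can sanity-check that its installed `numba-cuda` satisfies the new pinning.
This is a hypothetical snippet, assuming `packaging` is available (it is
already a cudf dependency):

```python
# Hypothetical runtime check of the new pinning; assumes `packaging`
# and `numba-cuda` are installed in the environment.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("numba-cuda"))
assert Version("0.19.0") <= installed < Version("0.20.0a0"), installed
```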
Authors: - https://github.com/brandon-b-miller - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19711 --- .../environments/all_cuda-129_arch-aarch64.yaml | 4 ++-- .../environments/all_cuda-129_arch-x86_64.yaml | 4 ++-- conda/recipes/cudf/recipe.yaml | 7 ++++--- dependencies.yaml | 17 +++++++++++------ python/cudf/pyproject.toml | 6 +++--- python/pylibcudf/pyproject.toml | 4 ++-- 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index bd9e288cb43..e3d1d9c3b9b 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -55,8 +55,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.18.0,<0.19.0a0 -- numba>=0.60.0,<0.62.0a0 +- numba-cuda>=0.19.0,<0.20.0a0 +- numba>=0.61.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 7d3b41c97e5..1b536f03c29 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -56,8 +56,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.18.0,<0.19.0a0 -- numba>=0.60.0,<0.62.0a0 +- numba-cuda>=0.19.0,<0.20.0a0 +- numba>=0.61.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index 3f69e8dcd2e..4b787a8178a 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -55,7 +55,7 @@ requirements: - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - - numba-cuda >=0.18.0,<0.19.0a0 + - numba-cuda >=0.19.0,<0.20.0a0 - libcudf =${{ version }} - pylibcudf =${{ version }} - rmm =${{ minor_version }} @@ -70,8 +70,9 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.4.0dev0 - cupy >=12.0.0 - - numba-cuda >=0.18.0,<0.19.0a0 - - numba >=0.60.0,<0.62.0a0 + - numba-cuda >=0.19.0,<0.20.0a0 + # TODO: Revert to numba>=0.60.0,<0.62.0a0 once https://github.com/NVIDIA/numba-cuda/pull/403 is released. + - numba >=0.61.0,<0.62.0a0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<20.0.0a0 - libcudf =${{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index d28f2b5e5fb..0c6c1f19900 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -504,14 +504,14 @@ dependencies: common: - output_types: [conda] packages: - - &numba_cuda numba-cuda>=0.18.0,<0.19.0a0 + - &numba_cuda numba-cuda>=0.19.0,<0.20.0a0 specific: - output_types: [requirements, pyproject] matrices: - matrix: cuda: "12.*" packages: - - &numba_cuda_cu12 numba-cuda[cu12]>=0.18.0,<0.19.0a0 + - &numba_cuda_cu12 numba-cuda[cu12]>=0.19.0,<0.20.0a0 - matrix: # Fallback for no matrix packages: - *numba_cuda_cu12 @@ -684,7 +684,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba numba>=0.60.0,<0.62.0a0 + # TODO: Revert to numba>=0.60.0,<0.62.0a0 once https://github.com/NVIDIA/numba-cuda/pull/403 is released. + - &numba numba>=0.61.0,<0.62.0a0 - nvtx>=0.2.1 - packaging - rich @@ -803,9 +804,10 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numba==0.60.0 + # TODO: Revert to numba==0.60.0 once https://github.com/NVIDIA/numba-cuda/pull/403 is released. 
+ - numba==0.61.0 - pandas==2.0.* - - numba-cuda==0.18.0 + - numba-cuda==0.19.0 - matrix: {dependencies: "latest"} packages: - pandas==2.3.1 @@ -875,7 +877,10 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numpy==1.23.* + # TODO: Revert to numpy==1.23.* once + # https://github.com/NVIDIA/numba-cuda/pull/403 is released and + # we revert to an oldest pinning of numba==0.60.0. + - numpy==1.24.* # pyarrow 14 is fine in some circumstances but we require pyarrow # 15 in our CI tests in order to get a lz4-c that is compatible # with cudf_kafka's dependencies. diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 6cb02397aed..56915721e37 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -23,8 +23,8 @@ dependencies = [ "cupy-cuda12x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.10.*,>=0.0.0a0", - "numba-cuda[cu12]>=0.18.0,<0.19.0a0", - "numba>=0.60.0,<0.62.0a0", + "numba-cuda[cu12]>=0.19.0,<0.20.0a0", + "numba>=0.61.0,<0.62.0a0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", @@ -125,7 +125,7 @@ requires = [ "libcudf==25.10.*,>=0.0.0a0", "librmm==25.10.*,>=0.0.0a0", "ninja", - "numba-cuda[cu12]>=0.18.0,<0.19.0a0", + "numba-cuda[cu12]>=0.19.0,<0.20.0a0", "pylibcudf==25.10.*,>=0.0.0a0", "rmm==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 081d4e54a4c..dd871c4bbdb 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -44,8 +44,8 @@ test = [ "hypothesis>=6.131.7", "mmh3", "nanoarrow", - "numba-cuda[cu12]>=0.18.0,<0.19.0a0", - "numba>=0.60.0,<0.62.0a0", + "numba-cuda[cu12]>=0.19.0,<0.20.0a0", + "numba>=0.61.0,<0.62.0a0", "pandas", "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", From 18530481b655181b659d9281fb7568a1f7d7c143 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 19 Aug 2025 19:21:37 -0400 Subject: [PATCH 168/366] Support null_count in groupby/rolling context (#19739) Contributes to #19200. 
Follows up #19314 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19739 --- .../cudf_polars/dsl/utils/aggregations.py | 18 +++++++++++++++++- python/cudf_polars/tests/test_groupby.py | 6 +++--- python/cudf_polars/tests/test_rolling.py | 10 ++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py index ebebc9bc361..00145e1d533 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py @@ -90,7 +90,23 @@ def decompose_single_agg( agg = named_expr.value name = named_expr.name if isinstance(agg, expr.UnaryFunction) and agg.name == "null_count": - raise NotImplementedError("null_count is not supported inside groupby context") + (child,) = agg.children + + is_null_bool = expr.BooleanFunction( + DataType(pl.Boolean()), + expr.BooleanFunction.Name.IsNull, + (), + child, + ) + u32 = DataType(pl.UInt32()) + sum_name = next(name_generator) + sum_agg = expr.NamedExpr( + sum_name, + expr.Agg(u32, "sum", (), expr.Cast(u32, is_null_bool)), + ) + return [(sum_agg, True)], named_expr.reconstruct( + expr.Cast(u32, expr.Col(u32, sum_name)) + ) if isinstance(agg, expr.Col): # TODO: collect_list produces null for empty group in libcudf, empty list in polars. # But we need the nested value type, so need to track proper dtypes in our DSL. diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 38ece61457e..18ff5405505 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -285,10 +285,10 @@ def test_groupby_nunique(df: pl.LazyFrame, column): assert_gpu_result_equal(q, check_row_order=False) -def test_groupby_null_count_raises(df: pl.LazyFrame): - q = df.group_by("key1").agg(pl.col("int") + pl.col("uint16_with_null").null_count()) +def test_groupby_null_count(df: pl.LazyFrame): + q = df.group_by("key1").agg(pl.col("uint16_with_null").null_count()) - assert_ir_translation_raises(q, NotImplementedError) + assert_gpu_result_equal(q, check_row_order=False) @pytest.mark.parametrize( diff --git a/python/cudf_polars/tests/test_rolling.py b/python/cudf_polars/tests/test_rolling.py index dd473078fbd..89a4dc3b083 100644 --- a/python/cudf_polars/tests/test_rolling.py +++ b/python/cudf_polars/tests/test_rolling.py @@ -216,3 +216,13 @@ def test_rolling_sum_all_null_window_returns_null(): ) # Expected: [0, 0, 5, 5, 5, 1] assert_gpu_result_equal(q) + + +def test_rolling_null_count(df): + lf = df.with_columns( + null=pl.when(pl.col("values") % 2 == 0).then(None).otherwise(pl.col("values")) + ) + q = lf.rolling("dt", period="48h", closed="both").agg( + nc=pl.col("null").null_count() + ) + assert_gpu_result_equal(q) From 1d8c937b955f9e2893b055cd0617c33d8eb59301 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Aug 2025 18:22:39 -0500 Subject: [PATCH 169/366] Remove outdated numba workarounds (#19738) cuDF now requires numba>=0.60. We can remove these workarounds for bugs in earlier numba versions. 
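For reference, the version gate that motivated these workarounds looked like
the sketch below; with `numba>=0.60` required, the condition can never be true,
so the `skipif`/`filterwarnings` entries it guarded are dead code. This is a
minimal sketch, assuming `numba` and `packaging` are importable:

```python
# Minimal sketch of the now-unreachable version gate that the removed
# workarounds relied on; always False once numba>=0.60 is required.
from packaging import version
import numba

needs_arm_workaround = version.parse(numba.__version__) < version.parse("0.59")
assert not needs_arm_workaround  # guaranteed by the numba>=0.60 requirement
```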
Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19738
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 4 ----
 python/cudf/cudf_pandas_tests/test_cudf_pandas.py   | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 7ceff137df7..1ec4c1cb844 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -51,10 +51,6 @@ if [ ! -d "pandas-tests" ]; then
     cat > pandas-tests/pyproject.toml << \EOF
 [tool.pytest.ini_options]
 xfail_strict = true
-filterwarnings = [
-    # Will be fixed in numba 0.56: https://github.com/numba/numba/issues/7758
-    "ignore:`np.MachAr` is deprecated:DeprecationWarning:numba",
-]
 markers = [
     "single_cpu: tests that should run on a single cpu only",
     "slow: mark a test as slow",
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 4db0129bbae..18bdf427a04 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -719,10 +719,6 @@ def test_rolling_win_type():
     tm.assert_equal(result, expected)
 
 
-@pytest.mark.skipif(
-    version.parse(numba_version) < version.parse("0.59"),
-    reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009",
-)
 @pytest.mark.xfail(
     version.parse(numba_version) >= version.parse("0.59")
     and PANDAS_VERSION < version.parse("2.1"),

From 9f0d3f0bea88cc7522017489418f5de644965290 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 19 Aug 2025 17:55:07 -0700
Subject: [PATCH 170/366] Revert "Support decimal columns in cudf_polars" (#19746)

Reverts rapidsai/cudf#19589

Currently CI is failing because the original PR did not sync with the HEAD of
`branch-25.10` and so did not catch, presumably, new groupby tests that are
failing. Reverting to unblock CI for now; I will reopen the PR once I can
investigate why those tests are failing.
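After the revert, `Decimal` is once again rejected during GPU IR translation.
A minimal reproduction, mirroring the restored `test_unsupported_literal_raises`
case (assumes `polars` is installed):

```python
# A Decimal literal should fail GPU IR translation again after this
# revert; the test suite asserts this via assert_ir_translation_raises.
import polars as pl

q = pl.LazyFrame({}).select(pl.lit(10, dtype=pl.Decimal()))
# Expected in cudf_polars tests: NotImplementedError during IR translation.
```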
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19746 --- .../cudf_polars/containers/dataframe.py | 19 ++++---------- .../cudf_polars/containers/datatype.py | 2 -- python/cudf_polars/cudf_polars/dsl/ir.py | 10 -------- .../cudf_polars/cudf_polars/testing/plugin.py | 2 -- .../tests/expressions/test_literal.py | 6 +---- python/cudf_polars/tests/test_groupby.py | 3 --- python/cudf_polars/tests/test_scan.py | 9 ------- python/cudf_polars/tests/test_select.py | 25 ------------------- python/pylibcudf/pylibcudf/scalar.pyx | 8 +++--- python/pylibcudf/tests/test_interop.py | 2 +- 10 files changed, 11 insertions(+), 75 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 3a095be3cfe..43ec63738b2 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -29,26 +29,17 @@ def _create_polars_column_metadata( name: str, dtype: PolarsDataType ) -> plc.interop.ColumnMetadata: - """Create ColumnMetadata preserving dtype attributes not supported by libcudf.""" - children_meta = [] - timezone = "" - precision: int | None = None - + """Create ColumnMetadata preserving pl.Struct field names.""" if isinstance(dtype, pl.Struct): children_meta = [ _create_polars_column_metadata(field.name, field.dtype) for field in dtype.fields ] - elif isinstance(dtype, pl.Datetime): - timezone = dtype.time_zone or timezone - elif isinstance(dtype, pl.Decimal): - precision = dtype.precision - + else: + children_meta = [] + timezone = dtype.time_zone if isinstance(dtype, pl.Datetime) else None return plc.interop.ColumnMetadata( - name=name, - timezone=timezone, - precision=precision, - children_meta=children_meta, + name=name, timezone=timezone or "", children_meta=children_meta ) diff --git a/python/cudf_polars/cudf_polars/containers/datatype.py b/python/cudf_polars/cudf_polars/containers/datatype.py index 50a5352612a..5de610425ed 100644 --- a/python/cudf_polars/cudf_polars/containers/datatype.py +++ b/python/cudf_polars/cudf_polars/containers/datatype.py @@ -81,8 +81,6 @@ def _from_polars(dtype: pl.DataType) -> plc.DataType: assert_never(dtype.time_unit) elif isinstance(dtype, pl.String): return plc.DataType(plc.TypeId.STRING) - elif isinstance(dtype, pl.Decimal): - return plc.DataType(plc.TypeId.DECIMAL128, scale=-dtype.scale) elif isinstance(dtype, pl.Null): # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 7783b15a207..161c8f4a576 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1467,16 +1467,6 @@ def do_evaluate( else: (child,) = value.children col = child.evaluate(df, context=ExecutionContext.GROUPBY).obj - - if value.name == "median" and col.type().id() in { - plc.TypeId.DECIMAL128, - plc.TypeId.DECIMAL64, - plc.TypeId.DECIMAL32, - }: - # libcudf doesn't support median (quantile) with decimal types, - # but Polars returns a float result, so just cast the input. 
- assert isinstance(child, expr.Col) - col = plc.unary.cast(col, schema[child.name].plc) else: # Anything else, we pre-evaluate col = value.evaluate(df, context=ExecutionContext.GROUPBY).obj diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 5038a0d5690..14956d2cfbc 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -175,8 +175,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "allow_missing_columns argument in read_parquet not translated in IR", - "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/pola-rs/polars/issues/23899", - "tests/unit/datatypes/test_decimal.py::test_decimal_arithmetic_schema": "https://github.com/pola-rs/polars/issues/23899", } diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index 1c2eb05ebfe..69ee80da82e 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -2,8 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import datetime - import pytest import polars as pl @@ -97,9 +95,7 @@ def test_select_literal_series(): assert_gpu_result_equal(q) -@pytest.mark.parametrize( - "expr", [pl.lit(None), pl.lit(datetime.time(12, 0), dtype=pl.Time())] -) +@pytest.mark.parametrize("expr", [pl.lit(None), pl.lit(10, dtype=pl.Decimal())]) def test_unsupported_literal_raises(expr): df = pl.LazyFrame({}) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 18ff5405505..bf97ec41cde 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import decimal import itertools import random from datetime import date @@ -26,7 +25,6 @@ def df(): "key2": [2, 2, 2, 2, 6, 1, 4, 6, 8], "int": [1, 2, 3, 4, 5, 6, 7, 8, 9], "int32": pl.Series([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=pl.Int32()), - "decimal": [decimal.Decimal("1.23"), None, decimal.Decimal("-0.23")] * 3, "uint16_with_null": pl.Series( [1, None, 2, None, None, None, 4, 5, 6], dtype=pl.UInt16() ), @@ -91,7 +89,6 @@ def keys(request): [pl.col("float").quantile(0.3, interpolation="lower")], [pl.col("float").quantile(0.3, interpolation="midpoint")], [pl.col("float").quantile(0.3, interpolation="linear")], - [pl.col("decimal").median()], [ pl.col("datetime").max(), pl.col("datetime").max().dt.is_leap_year().alias("leapyear"), diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 7510fe833be..8481105baad 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import decimal from typing import TYPE_CHECKING import pytest @@ -46,14 +45,6 @@ def df(): "a": [1, 2, 3, None, 4, 5], "b": ["ẅ", "x", "y", "z", "123", "abcd"], "c": [None, None, 4, 5, -1, 0], - "d": [ - 
decimal.Decimal("1.23"),
-                None,
-                decimal.Decimal("0.00"),
-                None,
-                decimal.Decimal("-5.67"),
-                None,
-            ],
         }
     )

diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py
index 10fcf9f660d..da3f519783b 100644
--- a/python/cudf_polars/tests/test_select.py
+++ b/python/cudf_polars/tests/test_select.py
@@ -2,8 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations

-import decimal
-
 import pytest

 import polars as pl
@@ -29,29 +27,6 @@ def test_select():
     assert_gpu_result_equal(query)


-def test_select_decimal():
-    ldf = pl.LazyFrame(
-        {"a": pl.Series(values=[decimal.Decimal("1.0"), None], dtype=pl.Decimal(3, 1))}
-    )
-    query = ldf.select(pl.col("a"))
-    assert_gpu_result_equal(query)
-
-
-def test_select_decimal_precision_none_result_max_precision():
-    ldf = pl.LazyFrame(
-        {
-            "a": pl.Series(
-                values=[decimal.Decimal("1.0"), None], dtype=pl.Decimal(None, 1)
-            )
-        }
-    )
-    query = ldf.select(pl.col("a"))
-    cpu_result = query.collect()
-    gpu_result = query.collect(engine="gpu")
-    assert cpu_result.schema["a"].precision is None
-    assert gpu_result.schema["a"].precision == 38
-
-
 def test_select_reduce():
     ldf = pl.DataFrame(
         {
diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx
index 31a93abd6fb..0d533c960a4 100644
--- a/python/pylibcudf/pylibcudf/scalar.pyx
+++ b/python/pylibcudf/pylibcudf/scalar.pyx
@@ -275,7 +275,7 @@ cdef class Scalar:
             return decimal.Decimal(
                 (<fixed_point_scalar[decimal128]*>slr).value().value()
             ).scaleb(
-                (<fixed_point_scalar[decimal128]*>slr).type().scale()
+                -(<fixed_point_scalar[decimal128]*>slr).type().scale()
             )
         else:
             raise NotImplementedError(
@@ -647,12 +647,12 @@ def _(py_val: datetime.date, dtype: DataType | None):

 @_from_py.register(decimal.Decimal)
 def _(py_val: decimal.Decimal, dtype: DataType | None):
-    scale = py_val.as_tuple().exponent
-    as_int = int(py_val.scaleb(-scale))
+    scale = -py_val.as_tuple().exponent
+    as_int = int(py_val.scaleb(scale))

     cdef int128_t val = as_int

-    dtype = DataType(type_id.DECIMAL128, scale)
+    dtype = DataType(type_id.DECIMAL128, -scale)
     if dtype.id() != type_id.DECIMAL128:
         raise TypeError("Expected dtype to be DECIMAL128")

diff --git a/python/pylibcudf/tests/test_interop.py b/python/pylibcudf/tests/test_interop.py
index b1a6e9f2c66..171d70c2496 100644
--- a/python/pylibcudf/tests/test_interop.py
+++ b/python/pylibcudf/tests/test_interop.py
@@ -105,7 +105,7 @@ def test_decimal_other(data_type):
     [plc.TypeId.DECIMAL128, plc.TypeId.DECIMAL64, plc.TypeId.DECIMAL32],
 )
 def test_decimal_respect_metadata_precision(plc_type, request):
-    request.applymarker(
+    request.node.add_marker(
         pytest.mark.xfail(
             parse(pa.__version__) < parse("19.0.0")
             and plc_type in {plc.TypeId.DECIMAL64, plc.TypeId.DECIMAL32},

From 7ea7c27c0d6971bf3d269cb8489a65efbcafeb12 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 19 Aug 2025 20:21:03 -0700
Subject: [PATCH 171/366] Update cuDF classic testing documentation regarding
 testing organization (#19745)

Updates the documented test organization conventions being worked towards in
https://github.com/rapidsai/cudf/issues/9999 and
https://github.com/rapidsai/cudf/issues/15723

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19745
---
 docs/cudf/source/developer_guide/testing.md | 30 ++++++++++++---------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md
index a31677e65af..732e7a3af8e 100644
--- a/docs/cudf/source/developer_guide/testing.md
+++ b/docs/cudf/source/developer_guide/testing.md
@@ -26,18 +26,24 @@ Where tests do not naturally belong to a project, for example the

 ## Test organization

-How tests are organized depends on which of the following two groups they fall into:
-
-1. Free functions such as `cudf.merge` that operate on classes like `DataFrame` or `Series`.
-2. Methods of the above classes.
-
-Tests of free functions should be grouped into files based on the
-[API sections in the documentation](https://docs.rapids.ai/api/cudf/latest/api_docs/index.html).
-This places tests of similar functionality in the same module.
-Tests of class methods should be organized in the same way, except that this organization should be within a subdirectory corresponding to the class.
-For instance, tests of `DataFrame` indexing should be placed into `dataframe/test_indexing.py`.
-In cases where tests may be shared by multiple classes sharing a common parent (e.g. `DataFrame` and `Series` both require `IndexedFrame` tests),
-the tests may be placed in a directory corresponding to the parent class.
+Generally, the directories under `cudf/tests` describe tests of a certain object (e.g. `series/`)
+or a general topic (e.g. `reshape/`, `series/methods`), and the test files are named according to an API's name
+(e.g. `series/methods/test_astype.py`, `reshape/test_concat.py`). Sometimes the tested operations and
+APIs do not have a singular name that corresponds to a file name; such files are instead named by a topic as well.
+Some common examples include:
+
+- `test_constructors.py`: `__init__` and `@classmethod` constructors for objects
+- `test_attributes.py`: `@property`s of objects
+- `test_binops.py`: Binary methods (e.g. `+`, `%`)
+- `test_reductions.py`: Reduction methods (e.g. `mean`, `quantile`)
+
+The organization aims to have many specific test files, each targeting a particular operation on a particular object;
+therefore, there may be test files with the same name that live in different directories, e.g.
+
+- `series/methods/test_astype.py`, `dataframe/methods/test_astype.py`, `indexes/index/methods/test_astype.py`
+- `series/methods/test_reductions.py`, `dataframe/methods/test_reductions.py`, `groupby/test_reductions.py`
+
+When adding new tests, make a best effort to place them in a file according to the tested object and API.

 ## Test contents

From aea777db5bbbcdc0b81c2adc7fe63f399c0fa3eb Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 19 Aug 2025 20:57:01 -0700
Subject: [PATCH 172/366] Upgrade to nvCOMP 5.0.0.6 (#19636)

Issue #19686

Update the nvCOMP version and the JNI code that directly uses nvCOMP.
The libcudf code is already compatible with this version.
Depends on https://github.com/rapidsai/rapids-cmake/pull/896 and https://github.com/rapidsai/kvikio/pull/800 Authors: - Vukasin Milovanovic (https://github.com/vuule) - Liangcai Li (https://github.com/firestarman) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) - Alessandro Bellina (https://github.com/abellina) URL: https://github.com/rapidsai/cudf/pull/19636 --- ci/build_wheel_libcudf.sh | 2 +- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- dependencies.yaml | 2 +- .../rapids/cudf/nvcomp/BatchedCompressor.java | 7 ++- .../cudf/nvcomp/BatchedDecompressor.java | 6 ++- .../cudf/nvcomp/BatchedLZ4Compressor.java | 4 +- .../cudf/nvcomp/BatchedLZ4Decompressor.java | 6 ++- .../cudf/nvcomp/BatchedZstdCompressor.java | 4 +- .../cudf/nvcomp/BatchedZstdDecompressor.java | 6 ++- .../java/ai/rapids/cudf/nvcomp/NvcompJni.java | 14 ++++-- java/src/main/native/src/NvcompJni.cpp | 48 ++++++++++++------- 13 files changed, 68 insertions(+), 37 deletions(-) diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index 768ee5c8c0b..ae0ab29c7f8 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -36,8 +36,8 @@ export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_FROM_LIBKVIKIO_WHEEL=ON" # repair wheels and write to the location that artifact-uploading code expects to find them python -m auditwheel repair \ - --exclude libnvcomp.so.4 \ --exclude libkvikio.so \ + --exclude libnvcomp.so.5 \ --exclude librapids_logger.so \ --exclude librmm.so \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index e3d1d9c3b9b..ccf44072677 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -40,7 +40,7 @@ dependencies: - jupyter_client - libcurand-dev - libkvikio==25.10.*,>=0.0.0a0 -- libnvcomp-dev==4.2.0.11 +- libnvcomp-dev==5.0.0.6 - libnvjitlink-dev - librdkafka>=2.8.0,<2.9.0a0 - librmm==25.10.*,>=0.0.0a0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 1b536f03c29..af6ca634e15 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -41,7 +41,7 @@ dependencies: - libcufile-dev - libcurand-dev - libkvikio==25.10.*,>=0.0.0a0 -- libnvcomp-dev==4.2.0.11 +- libnvcomp-dev==5.0.0.6 - libnvjitlink-dev - librdkafka>=2.8.0,<2.9.0a0 - librmm==25.10.*,>=0.0.0a0 diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 7ca20165585..7acdfdd9698 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -26,7 +26,7 @@ flatbuffers_version: - "=24.3.25" nvcomp_version: - - "=4.2.0.11" + - "=5.0.0.6" zlib_version: - ">=1.2.13" diff --git a/dependencies.yaml b/dependencies.yaml index 0c6c1f19900..6398698546d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -476,7 +476,7 @@ dependencies: - output_types: conda packages: # Align nvcomp version with rapids-cmake - - libnvcomp-dev==4.2.0.11 + - libnvcomp-dev==5.0.0.6 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] diff --git 
a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedCompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedCompressor.java index 72dfcdb3cb5..5a0ff574029 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedCompressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedCompressor.java @@ -91,7 +91,8 @@ public DeviceMemoryBuffer[] compress(BaseDeviceMemoryBuffer[] origInputs, Cuda.S buildAddrsAndSizes(inputs, inputChunkAddrs, inputChunkSizes, compressedBuffers, outputChunkAddrs); - final long tempBufferSize = batchedCompressGetTempSize(numChunks, chunkSize); + final long tempBufferSize = batchedCompressGetTempSize(numChunks, chunkSize, + numChunks * chunkSize); try (DeviceMemoryBuffer addrsAndSizes = putAddrsAndSizesOnDevice(inputChunkAddrs, inputChunkSizes, outputChunkAddrs, stream); DeviceMemoryBuffer tempBuffer = @@ -308,9 +309,11 @@ private long[] calcOutputBufferSizes(int[] chunksPerInput, long[] outputChunkSiz * Get the temporary workspace size required to perform compression of an entire batch. * @param batchSize number of chunks in the batch * @param maxChunkSize maximum size of an uncompressed chunk in bytes + * @param totalSize Upper bound on the total uncompressed size of all chunks * @return The size of required temporary workspace in bytes to compress the batch. */ - protected abstract long batchedCompressGetTempSize(long batchSize, long maxChunkSize); + protected abstract long batchedCompressGetTempSize(long batchSize, long maxChunkSize, + long totalSize); /** * Asynchronously compress a batch of buffers. Note that compressedSizesOutPtr must diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedDecompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedDecompressor.java index 5543d2dcb64..af195c84cc7 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedDecompressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedDecompressor.java @@ -71,7 +71,8 @@ public void decompressAsync(BaseDeviceMemoryBuffer[] origInputs, totalChunks += numBufferChunks; } - final long tempBufferSize = batchedDecompressGetTempSize(totalChunks, chunkSize); + final long tempBufferSize = batchedDecompressGetTempSize(totalChunks, chunkSize, + totalChunks * chunkSize); try (DeviceMemoryBuffer devAddrsSizes = buildAddrsSizesBuffer(chunkSize, totalChunks, inputs.getArray(), chunksPerInput, outputs, stream); DeviceMemoryBuffer devTemp = DeviceMemoryBuffer.allocate(tempBufferSize)) { @@ -198,10 +199,11 @@ private static HostMemoryBuffer fetchMetadata(long totalChunks, BaseDeviceMemory * Computes the temporary storage size in bytes needed to decompress a compressed batch. * @param numChunks number of chunks in the batch * @param maxUncompressedChunkBytes maximum uncompressed size of any chunk in bytes + * @param maxTotalSize Upper bound on the total uncompressed size of all chunks * @return number of temporary storage bytes needed to decompress the batch */ protected abstract long batchedDecompressGetTempSize(long numChunks, - long maxUncompressedChunkBytes); + long maxUncompressedChunkBytes, long maxTotalSize); /** * Asynchronously decompress a batch of compressed data buffers. 
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java index 58c0e7ee169..d8b8d4c616e 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java @@ -32,8 +32,8 @@ public BatchedLZ4Compressor(long chunkSize, long maxIntermediateBufferSize) { } @Override - protected long batchedCompressGetTempSize(long batchSize, long maxChunkSize) { - return NvcompJni.batchedLZ4CompressGetTempSize(batchSize, maxChunkSize); + protected long batchedCompressGetTempSize(long batchSize, long maxChunkSize, long totalSize) { + return NvcompJni.batchedLZ4CompressGetTempSize(batchSize, maxChunkSize, totalSize); } @Override diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java index d78d537ea13..82e425ae234 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java @@ -41,8 +41,10 @@ public static void decompressAsync(long chunkSize, BaseDeviceMemoryBuffer[] orig } @Override - protected long batchedDecompressGetTempSize(long numChunks, long maxUncompressedChunkBytes) { - return NvcompJni.batchedLZ4DecompressGetTempSize(numChunks, maxUncompressedChunkBytes); + protected long batchedDecompressGetTempSize(long numChunks, long maxUncompressedChunkBytes, + long maxTotalSize) { + return NvcompJni.batchedLZ4DecompressGetTempSize(numChunks, maxUncompressedChunkBytes, + maxTotalSize); } @Override diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdCompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdCompressor.java index 0532b4aa86d..9c4a8aca52e 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdCompressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdCompressor.java @@ -31,8 +31,8 @@ public BatchedZstdCompressor(long chunkSize, long maxIntermediateBufferSize) { } @Override - protected long batchedCompressGetTempSize(long batchSize, long maxChunkSize) { - return NvcompJni.batchedZstdCompressGetTempSize(batchSize, maxChunkSize); + protected long batchedCompressGetTempSize(long batchSize, long maxChunkSize, long totalSize) { + return NvcompJni.batchedZstdCompressGetTempSize(batchSize, maxChunkSize, totalSize); } @Override diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdDecompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdDecompressor.java index ba11a236834..3ca21ac0b7e 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdDecompressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdDecompressor.java @@ -23,8 +23,10 @@ public BatchedZstdDecompressor(long chunkSize) { } @Override - protected long batchedDecompressGetTempSize(long numChunks, long maxUncompressedChunkBytes) { - return NvcompJni.batchedZstdDecompressGetTempSize(numChunks, maxUncompressedChunkBytes); + protected long batchedDecompressGetTempSize(long numChunks, long maxUncompressedChunkBytes, + long maxTotalSize) { + return NvcompJni.batchedZstdDecompressGetTempSize(numChunks, maxUncompressedChunkBytes, + maxTotalSize); } @Override diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java index 1a21629a208..f940f781ffb 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java +++ 
b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java @@ -29,9 +29,10 @@ class NvcompJni { * Get the temporary workspace size required to perform compression of entire LZ4 batch. * @param batchSize number of chunks in the batch * @param maxChunkSize maximum size of an uncompressed chunk in bytes + * @param maxTotalSize Upper bound on the total uncompressed size of all chunks * @return The size of required temporary workspace in bytes to compress the batch. */ - static native long batchedLZ4CompressGetTempSize(long batchSize, long maxChunkSize); + static native long batchedLZ4CompressGetTempSize(long batchSize, long maxChunkSize, long maxTotalSize); /** * Get the maximum size any chunk could compress to in a LZ4 batch. This is the minimum amount of @@ -74,11 +75,13 @@ static native void batchedLZ4CompressAsync( * Computes the temporary storage size in bytes needed to decompress a LZ4-compressed batch. * @param numChunks number of chunks in the batch * @param maxUncompressedChunkBytes maximum uncompressed size of any chunk in bytes + * @param maxTotalSize Upper bound on the total uncompressed size of all chunks * @return number of temporary storage bytes needed to decompress the batch */ static native long batchedLZ4DecompressGetTempSize( long numChunks, - long maxUncompressedChunkBytes); + long maxUncompressedChunkBytes, + long maxTotalSize); /** * Asynchronously decompress a batch of LZ4-compressed data buffers. @@ -121,9 +124,10 @@ static native void batchedLZ4GetDecompressSizeAsync( * Get the temporary workspace size required to perform compression of entire zstd batch. * @param batchSize number of chunks in the batch * @param maxChunkSize maximum size of an uncompressed chunk in bytes + * @param maxTotalSize Upper bound on the total uncompressed size of all chunks * @return The size of required temporary workspace in bytes to compress the batch. */ - static native long batchedZstdCompressGetTempSize(long batchSize, long maxChunkSize); + static native long batchedZstdCompressGetTempSize(long batchSize, long maxChunkSize, long maxTotalSize); /** * Get the maximum size any chunk could compress to in a ZSTD batch. This is the minimum @@ -167,11 +171,13 @@ static native void batchedZstdCompressAsync( * ZSTD-compressed batch. * @param numChunks number of chunks in the batch * @param maxUncompressedChunkBytes maximum uncompressed size of any chunk in bytes + * @param maxTotalSize Upper bound on the total uncompressed size of all chunks * @return number of temporary storage bytes needed to decompress the batch */ static native long batchedZstdDecompressGetTempSize( long numChunks, - long maxUncompressedChunkBytes); + long maxUncompressedChunkBytes, + long maxTotalSize); /** * Asynchronously decompress a batch of ZSTD-compressed data buffers. 
diff --git a/java/src/main/native/src/NvcompJni.cpp b/java/src/main/native/src/NvcompJni.cpp
index 0b3bf1916b9..4e35f2f5688 100644
--- a/java/src/main/native/src/NvcompJni.cpp
+++ b/java/src/main/native/src/NvcompJni.cpp
@@ -63,15 +63,16 @@ extern "C" {
 // methods for lz4
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetTempSize(
-  JNIEnv* env, jclass, jlong j_batch_size, jlong j_max_chunk_size)
+  JNIEnv* env, jclass, jlong j_batch_size, jlong j_max_chunk_size, jlong j_max_total_size)
 {
   try {
     cudf::jni::auto_set_device(env);
     auto batch_size     = static_cast<std::size_t>(j_batch_size);
     auto max_chunk_size = static_cast<std::size_t>(j_max_chunk_size);
+    auto total_size     = static_cast<std::size_t>(j_max_total_size);
     std::size_t temp_size = 0;
-    auto status = nvcompBatchedLZ4CompressGetTempSize(
-      batch_size, max_chunk_size, nvcompBatchedLZ4DefaultOpts, &temp_size);
+    auto status = nvcompBatchedLZ4CompressGetTempSizeAsync(
+      batch_size, max_chunk_size, nvcompBatchedLZ4CompressDefaultOpts, &temp_size, total_size);
     check_nvcomp_status(env, status);
     return static_cast<jlong>(temp_size);
   }
@@ -88,7 +89,7 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetMaxOutputChunkSize(JNI
     auto max_chunk_size = static_cast<std::size_t>(j_max_chunk_size);
     std::size_t max_output_size = 0;
     auto status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize(
-      max_chunk_size, nvcompBatchedLZ4DefaultOpts, &max_output_size);
+      max_chunk_size, nvcompBatchedLZ4CompressDefaultOpts, &max_output_size);
     check_nvcomp_status(env, status);
     return static_cast<jlong>(max_output_size);
   }
@@ -119,7 +120,10 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressAsync(JNIEnv* env,
     auto out_ptrs             = reinterpret_cast<void* const*>(j_out_ptrs);
     auto compressed_out_sizes = reinterpret_cast<std::size_t*>(j_compressed_sizes_out_ptr);
     auto stream               = reinterpret_cast<cudaStream_t>(j_stream);
-    auto status = nvcompBatchedLZ4CompressAsync(in_ptrs,
+    // FIXME: how should these statuses be used? They are not used in the corresponding
+    // decompressor either.
+    auto comp_statuses = rmm::device_uvector<nvcompStatus_t>(batch_size, stream);
+    auto status = nvcompBatchedLZ4CompressAsync(in_ptrs,
                                                 in_sizes,
                                                 chunk_size,
                                                 batch_size,
@@ -127,7 +131,8 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressAsync(JNIEnv* env,
                                                 temp_size,
                                                 out_ptrs,
                                                 compressed_out_sizes,
-                                                nvcompBatchedLZ4DefaultOpts,
+                                                nvcompBatchedLZ4CompressDefaultOpts,
+                                                comp_statuses.data(),
                                                 stream);
     check_nvcomp_status(env, status);
   }
@@ -135,14 +140,16 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressAsync(JNIEnv* env,
 }

 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetTempSize(
-  JNIEnv* env, jclass, jlong j_batch_size, jlong j_chunk_size)
+  JNIEnv* env, jclass, jlong j_batch_size, jlong j_chunk_size, jlong j_max_total_size)
 {
   try {
     cudf::jni::auto_set_device(env);
     auto batch_size = static_cast<std::size_t>(j_batch_size);
     auto chunk_size = static_cast<std::size_t>(j_chunk_size);
+    auto total_size = static_cast<std::size_t>(j_max_total_size);
     std::size_t temp_size = 0;
-    auto status = nvcompBatchedLZ4DecompressGetTempSize(batch_size, chunk_size, &temp_size);
+    auto status = nvcompBatchedLZ4DecompressGetTempSizeAsync(
+      batch_size, chunk_size, nvcompBatchedLZ4DecompressDefaultOpts, &temp_size, total_size);
     check_nvcomp_status(env, status);
     return static_cast<jlong>(temp_size);
   }
@@ -181,6 +188,7 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressAsync(JNIEnv* env,
                                                   temp_ptr,
                                                   temp_size,
                                                   uncompressed_ptrs,
+                                                  nvcompBatchedLZ4DecompressDefaultOpts,
                                                   uncompressed_statuses.data(),
                                                   stream);
     check_nvcomp_status(env, status);
@@ -218,15 +226,16 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4GetDecompressSizeAsync(JNIEnv* en

 // methods for zstd
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdCompressGetTempSize(
-  JNIEnv* env, jclass, jlong j_batch_size, jlong j_max_chunk_size)
+  JNIEnv* env, jclass, jlong j_batch_size, jlong j_max_chunk_size, jlong j_max_total_size)
 {
   try {
     cudf::jni::auto_set_device(env);
     auto batch_size     = static_cast<std::size_t>(j_batch_size);
     auto max_chunk_size = static_cast<std::size_t>(j_max_chunk_size);
+    auto total_size     = static_cast<std::size_t>(j_max_total_size);
     std::size_t temp_size = 0;
-    auto status = nvcompBatchedZstdCompressGetTempSize(
-      batch_size, max_chunk_size, nvcompBatchedZstdDefaultOpts, &temp_size);
+    auto status = nvcompBatchedZstdCompressGetTempSizeAsync(
+      batch_size, max_chunk_size, nvcompBatchedZstdCompressDefaultOpts, &temp_size, total_size);
     check_nvcomp_status(env, status);
     return static_cast<jlong>(temp_size);
   }
@@ -242,7 +251,7 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdCompressGetMaxOutputChunkSize(
     auto max_chunk_size = static_cast<std::size_t>(j_max_chunk_size);
     std::size_t max_output_size = 0;
     auto status = nvcompBatchedZstdCompressGetMaxOutputChunkSize(
-      max_chunk_size, nvcompBatchedZstdDefaultOpts, &max_output_size);
+      max_chunk_size, nvcompBatchedZstdCompressDefaultOpts, &max_output_size);
     check_nvcomp_status(env, status);
     return static_cast<jlong>(max_output_size);
   }
@@ -273,7 +282,10 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdCompressAsync(JNIEnv* env,
     auto out_ptrs             = reinterpret_cast<void* const*>(j_out_ptrs);
     auto compressed_out_sizes = reinterpret_cast<std::size_t*>(j_compressed_sizes_out_ptr);
     auto stream               = reinterpret_cast<cudaStream_t>(j_stream);
-    auto status = nvcompBatchedZstdCompressAsync(in_ptrs,
+    // FIXME: how should these statuses be used? They are not used in the corresponding
+    // decompressor either.
+    auto comp_statuses = rmm::device_uvector<nvcompStatus_t>(batch_size, stream);
+    auto status = nvcompBatchedZstdCompressAsync(in_ptrs,
                                                  in_sizes,
                                                  chunk_size,
                                                  batch_size,
@@ -281,7 +293,8 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdCompressAsync(JNIEnv* env,
                                                  temp_size,
                                                  out_ptrs,
                                                  compressed_out_sizes,
-                                                 nvcompBatchedZstdDefaultOpts,
+                                                 nvcompBatchedZstdCompressDefaultOpts,
+                                                 comp_statuses.data(),
                                                  stream);
     check_nvcomp_status(env, status);
   }
@@ -289,14 +302,16 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdCompressAsync(JNIEnv* env,
 }

 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdDecompressGetTempSize(
-  JNIEnv* env, jclass, jlong j_batch_size, jlong j_chunk_size)
+  JNIEnv* env, jclass, jlong j_batch_size, jlong j_chunk_size, jlong j_max_total_size)
 {
   try {
     cudf::jni::auto_set_device(env);
     auto batch_size = static_cast<std::size_t>(j_batch_size);
    auto chunk_size = static_cast<std::size_t>(j_chunk_size);
+    auto total_size = static_cast<std::size_t>(j_max_total_size);
     std::size_t temp_size = 0;
-    auto status = nvcompBatchedZstdDecompressGetTempSize(batch_size, chunk_size, &temp_size);
+    auto status = nvcompBatchedZstdDecompressGetTempSizeAsync(
+      batch_size, chunk_size, nvcompBatchedZstdDecompressDefaultOpts, &temp_size, total_size);
     check_nvcomp_status(env, status);
     return static_cast<jlong>(temp_size);
   }
@@ -335,6 +350,7 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdDecompressAsync(JNIEnv* env,
                                                   temp_ptr,
                                                   temp_size,
                                                   uncompressed_ptrs,
+                                                  nvcompBatchedZstdDecompressDefaultOpts,
                                                   uncompressed_statuses.data(),
                                                   stream);
     check_nvcomp_status(env, status);

From f91e146a4442e44c4661830d08db229ca60c2fd4 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 20 Aug 2025 00:34:34 -0400
Subject: [PATCH 173/366] Suppress NVRTC warning from stdint.h (#19712)

Fixes a warning that is emitted repeatedly by the jitify compile with NVRTC
when including the C++ stdlib stdint.h or cstdint:

```
cuda/std/cstdint(105): warning #47-D: incompatible redefinition of macro "INTPTR_MIN" (declared at line 16 of stdint.h)
 # define INTPTR_MIN INT64_MIN
 ^
cuda/std/cstdint(106): warning #47-D: incompatible redefinition of macro "INTPTR_MAX" (declared at line 18 of stdint.h)
 # define INTPTR_MAX INT64_MAX
 ^
cuda/std/cstdint(107): warning #47-D: incompatible redefinition of macro "UINTPTR_MAX" (declared at line 20 of stdint.h)
 # define UINTPTR_MAX UINT64_MAX
 ^
cuda/std/cstdint(109): warning #47-D: incompatible redefinition of macro "INTMAX_MIN" (declared at line 17 of stdint.h)
 # define INTMAX_MIN INT64_MIN
 ^
cuda/std/cstdint(110): warning #47-D: incompatible redefinition of macro "INTMAX_MAX" (declared at line 19 of stdint.h)
 # define INTMAX_MAX INT64_MAX
 ^
cuda/std/cstdint(111): warning #47-D: incompatible redefinition of macro "UINTMAX_MAX" (declared at line 21 of stdint.h)
 # define UINTMAX_MAX UINT64_MAX
 ^
cuda/std/cstdint(113): warning #47-D: incompatible redefinition of macro "PTRDIFF_MIN" (declared at line 22 of stdint.h)
 # define PTRDIFF_MIN INT64_MIN
 ^
cuda/std/cstdint(114): warning #47-D: incompatible redefinition of macro "PTRDIFF_MAX" (declared at line 23 of stdint.h)
 # define PTRDIFF_MAX INT64_MAX
 ^
D22-125 [1265+2+10=1276] Custom command to JIT-compile files.
```

This suppresses the warning by adding the compile flag `--diag-suppress=47`
to `JitifyPreprocessKernels.cmake`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/19712
---
 cpp/cmake/Modules/JitifyPreprocessKernels.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
index 7c27920ee28..1e363a3e55b 100644
--- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -48,7 +48,7 @@ function(jit_preprocess_files)
       $ ${ARG_FILE} -o ${ARG_OUTPUT_DIR} -i -std=c++20 -remove-unused-globals
       -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -DCUDF_RUNTIME_JIT -I${CUDF_SOURCE_DIR}/include
       -I${CUDF_SOURCE_DIR}/src ${includes}
-      --no-preinclude-workarounds --no-replace-pragma-once
+      --no-preinclude-workarounds --no-replace-pragma-once --diag-suppress=47
       COMMENT "Custom command to JIT-compile files."
     )
   endforeach()

From 965b4a10416645d12ac8927bd8e35b05d5973c35 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Wed, 20 Aug 2025 01:34:28 -0500
Subject: [PATCH 174/366] Optionally print shuffle stats in pdsh benchmarks
 (#19719)

This adds an option to the pdsh benchmarks to control whether the rapidsmpf
Shuffler statistics are printed upon completion. Previously this was tied to
the `rapidsmpf-oom-protection` flag, but these are two different things.

Authors:
  - Tom Augspurger (https://github.com/TomAugspurger)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: https://github.com/rapidsai/cudf/pull/19719
---
 .../cudf_polars/experimental/benchmarks/utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py
index c85fbf8319e..8074488024c 100644
--- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py
+++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py
@@ -413,7 +413,8 @@ def initialize_dask_cluster(run_config: RunConfig, args: argparse.Namespace): #
             options=Options(
                 {
                     "dask_spill_device": str(run_config.spill_device),
-                    "dask_statistics": str(args.rapidsmpf_oom_protection),
+                    "dask_statistics": str(args.rapidsmpf_dask_statistics),
+                    "oom_protection": str(args.rapidsmpf_oom_protection),
                 }
             ),
         )
@@ -625,6 +626,12 @@ def parse_args(
         default=False,
         help="Use rapidsmpf CUDA managed memory-based OOM protection.",
     )
+    parser.add_argument(
+        "--rapidsmpf-dask-statistics",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Print rapidsmpf shuffle statistics on each Dask worker upon completion.",
+    )
     parser.add_argument(
         "--rapidsmpf-spill",
         action=argparse.BooleanOptionalAction,

From da2a13d77744cb77d0ebcbc102498d7fd2dda579 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Wed, 20 Aug 2025 10:22:55 -0500
Subject: [PATCH 175/366] Remove unreachable code in rapidsmpf shuffle (#19704)

https://github.com/rapidsai/rapidsmpf/pull/417 changed rapidsmpf's worker
initialization so that workers are never in a partially initialized state,
so we can safely remove the code blocks guarding against that.

Authors:
  - Tom Augspurger (https://github.com/TomAugspurger)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: https://github.com/rapidsai/cudf/pull/19704
---
 python/cudf_polars/cudf_polars/experimental/shuffle.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/experimental/shuffle.py b/python/cudf_polars/cudf_polars/experimental/shuffle.py
index 9a4e63e4b76..3571ee67e7a 100644
--- a/python/cudf_polars/cudf_polars/experimental/shuffle.py
+++ b/python/cudf_polars/cudf_polars/experimental/shuffle.py
@@ -67,11 +67,6 @@ def insert_partition(

     context = get_worker_context()

-    if context.br is None:  # pragma: no cover
-        raise ValueError(
-            "rapidsmpf insert_partition called on an uninitialized worker."
- ) - on = options["on"] assert not other, f"Unexpected arguments: {other}" columns_to_hash = tuple(df.column_names.index(val) for val in on) @@ -104,10 +99,6 @@ def extract_partition( from rapidsmpf.integrations.single import get_worker_context context = get_worker_context() - if context.br is None: # pragma: no cover - raise ValueError( - "rapidsmpf extract_partition called on an uninitialized worker." - ) shuffler.wait_on(partition_id) column_names = options["column_names"] From b33b7946aa150d51a67af799af94548cdbcb6a86 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 20 Aug 2025 09:06:40 -0700 Subject: [PATCH 176/366] Add streams to all scalar factories (#19729) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/19729 --- .../libcudf/scalar/scalar_factories.pxd | 24 +- .../pylibcudf/nvtext/byte_pair_encode.pyx | 8 +- python/pylibcudf/pylibcudf/nvtext/replace.pyx | 15 +- .../pylibcudf/pylibcudf/nvtext/tokenize.pyx | 17 +- python/pylibcudf/pylibcudf/scalar.pxd | 5 +- python/pylibcudf/pylibcudf/scalar.pyi | 21 +- python/pylibcudf/pylibcudf/scalar.pyx | 213 +++++++++++------- .../pylibcudf/strings/capitalize.pyx | 8 +- .../pylibcudf/pylibcudf/strings/combine.pyx | 9 +- .../pylibcudf/pylibcudf/strings/contains.pyx | 9 +- .../strings/convert/convert_lists.pyx | 8 +- .../pylibcudf/pylibcudf/strings/replace.pyx | 8 +- .../pylibcudf/strings/replace_re.pyx | 8 +- python/pylibcudf/pylibcudf/strings/slice.pyx | 9 +- .../pylibcudf/strings/split/partition.pyx | 12 +- python/pylibcudf/pylibcudf/strings/strip.pyx | 8 +- 16 files changed, 263 insertions(+), 119 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd index 70c9067288e..f29ed15b4dd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -8,30 +8,40 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.fixed_point.fixed_point cimport scale_type from pylibcudf.libcudf.types cimport int128 as int128_t +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: cdef unique_ptr[scalar] make_string_scalar( - const string & _string + const string & _string, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_fixed_width_scalar[T]( - T value + T value, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_fixed_point_scalar[T]( int128_t value, scale_type scale, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_numeric_scalar( - data_type type_ + data_type type_, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_timestamp_scalar( - data_type type_ + data_type type_, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_empty_scalar_like( - const column_view & + const column_view &, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_duration_scalar( - data_type type_ + data_type type_, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_default_constructed_scalar( - data_type type_ + data_type type_, + cuda_stream_view stream ) except +libcudf_exception_handler 
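For context, a minimal sketch of how these stream-aware factories surface in
the Python API. This is not part of the patch: it assumes the post-patch
`Scalar.from_py` signature shown in this diff and rmm's `DEFAULT_STREAM`; any
rmm `Stream` may be passed, and omitting `stream` falls back to a default
stream:

```
# A minimal sketch (not part of this patch) of constructing scalars on an
# explicit CUDA stream with the stream-aware factories added by this PR.
import pylibcudf as plc
from rmm.pylibrmm.stream import DEFAULT_STREAM

# Build an INT64 scalar and a STRING scalar on an explicit stream.
int_scalar = plc.Scalar.from_py(
    42, plc.DataType(plc.types.TypeId.INT64), stream=DEFAULT_STREAM
)
str_scalar = plc.Scalar.from_py("hello", stream=DEFAULT_STREAM)

# Round-trip back to Python objects.
assert int_scalar.to_py() == 42
assert str_scalar.to_py() == "hello"
```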
diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx index 7565b21084f..ed962f76dc3 100644 --- a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport unique_ptr @@ -15,6 +15,8 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( make_string_scalar as cpp_make_string_scalar, ) from pylibcudf.scalar cimport Scalar +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream __all__ = ["BPEMergePairs", "byte_pair_encoding"] @@ -55,10 +57,12 @@ cpdef Column byte_pair_encoding( An encoded column of strings. """ cdef unique_ptr[column] c_result + cdef Stream stream if separator is None: + stream = _get_stream(None) separator = Scalar.from_libcudf( - cpp_make_string_scalar(" ".encode()) + cpp_make_string_scalar(" ".encode(), stream.view()) ) with nogil: diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx index a27592fb434..9464ef7324f 100644 --- a/python/pylibcudf/pylibcudf/nvtext/replace.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport unique_ptr @@ -15,6 +15,8 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream __all__ = ["filter_tokens", "replace_tokens"] @@ -47,9 +49,11 @@ cpdef Column replace_tokens( New strings column with replaced strings """ cdef unique_ptr[column] c_result + cdef Stream stream if delimiter is None: + stream = _get_stream(None) delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) with nogil: c_result = cpp_replace_tokens( @@ -90,13 +94,16 @@ cpdef Column filter_tokens( New strings column of filtered strings """ cdef unique_ptr[column] c_result + cdef Stream stream if delimiter is None: + stream = _get_stream(None) delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) if replacement is None: + stream = _get_stream(None) replacement = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) with nogil: diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx index 43d426489b4..1d49c828df7 100644 --- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from cython.operator cimport dereference from libcpp.memory cimport unique_ptr @@ -19,6 +19,9 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( make_string_scalar as cpp_make_string_scalar, ) from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream __all__ = [ "TokenizeVocabulary", @@ -63,10 +66,12 @@ cpdef Column tokenize_scalar(Column input, Scalar delimiter=None): New strings columns of tokens """ cdef unique_ptr[column] c_result + cdef Stream stream if delimiter is None: + stream = _get_stream(None) delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) with nogil: @@ -126,10 +131,12 @@ cpdef Column count_tokens_scalar(Column input, Scalar delimiter=None): New column of token counts """ cdef unique_ptr[column] c_result + cdef Stream stream if delimiter is None: + stream = _get_stream(None) delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) with nogil: @@ -218,10 +225,12 @@ cpdef Column detokenize( New strings columns of tokens """ cdef unique_ptr[column] c_result + cdef Stream stream if separator is None: + stream = _get_stream(None) separator = Scalar.from_libcudf( - cpp_make_string_scalar(" ".encode()) + cpp_make_string_scalar(" ".encode(), stream.view()) ) with nogil: diff --git a/python/pylibcudf/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd index a273647c98d..34b8c060377 100644 --- a/python/pylibcudf/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -1,10 +1,11 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .types cimport DataType @@ -25,7 +26,7 @@ cdef class Scalar: cpdef bool is_valid(self) @staticmethod - cdef Scalar empty_like(Column column) + cdef Scalar empty_like(Column column, Stream stream=*) @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) diff --git a/python/pylibcudf/pylibcudf/scalar.pyi b/python/pylibcudf/pylibcudf/scalar.pyi index 09c2e1c584c..be84726ef18 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyi +++ b/python/pylibcudf/pylibcudf/scalar.pyi @@ -2,6 +2,8 @@ from typing import Any +from rmm.pylibrmm.stream import Stream + from pylibcudf.column import Column from pylibcudf.types import DataType @@ -12,11 +14,22 @@ class Scalar: def type(self) -> DataType: ... def is_valid(self) -> bool: ... @staticmethod - def empty_like(column: Column) -> Scalar: ... + def empty_like(column: Column, stream: Stream | None = None) -> Scalar: ... @staticmethod - def from_arrow(pa_val: Any, dtype: DataType | None = None) -> Scalar: ... + def from_arrow( + pa_val: Any, + dtype: DataType | None = None, + stream: Stream | None = None, + ) -> Scalar: ... @classmethod - def from_py(cls, py_val: Any, dtype: DataType | None = None) -> Scalar: ... + def from_py( + cls, + py_val: Any, + dtype: DataType | None = None, + stream: Stream | None = None, + ) -> Scalar: ... @classmethod - def from_numpy(cls, np_val: NpGeneric) -> Scalar: ... + def from_numpy( + cls, np_val: NpGeneric, stream: Stream | None = None + ) -> Scalar: ... 
def to_py(self) -> None | int | float | str | bool: ... diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 0d533c960a4..57ea17d2921 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -52,10 +52,12 @@ from pylibcudf.libcudf.wrappers.timestamps cimport ( ) from rmm.pylibrmm.memory_resource cimport get_current_device_resource +from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .traits cimport is_floating_point from .types cimport DataType +from .utils cimport _get_stream from functools import singledispatch try: @@ -149,7 +151,11 @@ cdef class Scalar: return self.get().is_valid() @staticmethod - def from_arrow(pa_val, dtype: DataType | None = None) -> Scalar: + def from_arrow( + pa_val, + dtype: DataType | None = None, + stream: Stream | None = None + ) -> Scalar: """ Convert a pyarrow scalar to a pylibcudf.Scalar. @@ -160,28 +166,35 @@ cdef class Scalar: dtype: DataType | None The datatype to cast the value to. If None, the type is inferred from the pyarrow scalar. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- Scalar New pylibcudf.Scalar """ - return _from_arrow(pa_val, dtype) + return _from_arrow(pa_val, dtype, stream) @staticmethod - cdef Scalar empty_like(Column column): + cdef Scalar empty_like(Column column, Stream stream=None): """Construct a null scalar with the same type as column. Parameters ---------- column Column to take type from + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- New empty (null) scalar of the given type. """ - return Scalar.from_libcudf(move(make_empty_scalar_like(column.view()))) + stream = _get_stream(stream) + return Scalar.from_libcudf( + move(make_empty_scalar_like(column.view(), stream.view())) + ) @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None): @@ -197,7 +210,12 @@ cdef class Scalar: return s @classmethod - def from_py(cls, py_val, dtype: DataType | None = None): + def from_py( + cls, + py_val, + dtype: DataType | None = None, + stream: Stream | None = None + ): """ Convert a Python standard library object to a Scalar. @@ -208,16 +226,18 @@ cdef class Scalar: dtype: DataType | None The datatype to cast the value to. If None, the type is inferred from `py_val`. + stream : Stream | None + CUDA stream on which to perform the operation. Returns ------- Scalar New pylibcudf.Scalar """ - return _from_py(py_val, dtype) + return _from_py(py_val, dtype, stream) @classmethod - def from_numpy(cls, np_val): + def from_numpy(cls, np_val, stream: Stream | None = None): """ Convert a NumPy scalar to a Scalar. @@ -225,13 +245,15 @@ cdef class Scalar: ---------- np_val: numpy.generic Value to convert to a pylibcudf.Scalar + stream : Stream | None + CUDA stream on which to perform the operation. 
Returns ------- Scalar New pylibcudf.Scalar """ - return _from_numpy(np_val) + return _from_numpy(np_val, stream) def to_py(self): """ @@ -292,31 +314,35 @@ cdef Scalar _new_scalar(unique_ptr[scalar] c_obj, DataType dtype): @singledispatch -def _from_py(py_val, dtype: DataType | None): +def _from_py(py_val, dtype: DataType | None, stream: Stream | None): raise TypeError(f"{type(py_val).__name__} cannot be converted to pylibcudf.Scalar") @_from_py.register(type(None)) -def _(py_val, dtype: DataType | None): +def _(py_val, dtype: DataType | None, stream: Stream | None): cdef DataType c_dtype if dtype is None: raise ValueError("Must specify a dtype for a None value.") else: c_dtype = dtype - cdef unique_ptr[scalar] c_obj = make_default_constructed_scalar(c_dtype.c_obj) + stream = _get_stream(stream) + cdef unique_ptr[scalar] c_obj = make_default_constructed_scalar( + c_dtype.c_obj, + stream.view() + ) return _new_scalar(move(c_obj), dtype) @_from_py.register(dict) @_from_py.register(list) -def _(py_val, dtype: DataType | None): +def _(py_val, dtype: DataType | None, stream: Stream | None): raise NotImplementedError( f"Conversion from {type(py_val).__name__} is currently not supported." ) @_from_py.register(float) -def _(py_val: float, dtype: DataType | None): +def _(py_val: float, dtype: DataType | None, stream: Stream | None): cdef unique_ptr[scalar] c_obj cdef DataType c_dtype if dtype is None: @@ -324,15 +350,16 @@ def _(py_val: float, dtype: DataType | None): else: c_dtype = dtype + stream = _get_stream(stream) cdef type_id tid = c_dtype.id() if tid == type_id.FLOAT32: if abs(py_val) > numeric_limits[float].max(): raise OverflowError(f"{py_val} out of range for FLOAT32 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) elif tid == type_id.FLOAT64: - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) else: typ = c_dtype.id() @@ -342,7 +369,7 @@ def _(py_val: float, dtype: DataType | None): @_from_py.register(int) -def _(py_val: int, dtype: DataType | None): +def _(py_val: int, dtype: DataType | None, stream: Stream | None): cdef unique_ptr[scalar] c_obj cdef DataType c_dtype cdef duration_ns c_duration_ns @@ -353,9 +380,10 @@ def _(py_val: int, dtype: DataType | None): if dtype is None: c_dtype = dtype = DataType(type_id.INT64) elif is_floating_point(dtype): - return _from_py(float(py_val), dtype) + return _from_py(float(py_val), dtype, stream) else: c_dtype = dtype + stream = _get_stream(stream) cdef type_id tid = c_dtype.id() if tid == type_id.INT8: @@ -363,7 +391,7 @@ def _(py_val: int, dtype: DataType | None): numeric_limits[int8_t].min() <= py_val <= numeric_limits[int8_t].max() ): raise OverflowError(f"{py_val} out of range for INT8 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) elif tid == type_id.INT16: @@ -371,7 +399,7 @@ def _(py_val: int, dtype: DataType | None): numeric_limits[int16_t].min() <= py_val <= numeric_limits[int16_t].max() ): raise OverflowError(f"{py_val} out of range for INT16 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) elif tid == type_id.INT32: @@ -379,7 +407,7 @@ def _(py_val: int, dtype: DataType | None): numeric_limits[int32_t].min() <= py_val <= numeric_limits[int32_t].max() ): raise 
OverflowError(f"{py_val} out of range for INT32 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) elif tid == type_id.INT64: @@ -387,7 +415,7 @@ def _(py_val: int, dtype: DataType | None): numeric_limits[int64_t].min() <= py_val <= numeric_limits[int64_t].max() ): raise OverflowError(f"{py_val} out of range for INT64 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) elif tid == type_id.UINT8: @@ -395,7 +423,7 @@ def _(py_val: int, dtype: DataType | None): raise ValueError("Cannot assign negative value to UINT8 scalar") if py_val > numeric_limits[uint8_t].max(): raise OverflowError(f"{py_val} out of range for UINT8 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) elif tid == type_id.UINT16: @@ -403,7 +431,7 @@ def _(py_val: int, dtype: DataType | None): raise ValueError("Cannot assign negative value to UINT16 scalar") if py_val > numeric_limits[uint16_t].max(): raise OverflowError(f"{py_val} out of range for UINT16 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) elif tid == type_id.UINT32: @@ -411,7 +439,7 @@ def _(py_val: int, dtype: DataType | None): raise ValueError("Cannot assign negative value to UINT32 scalar") if py_val > numeric_limits[uint32_t].max(): raise OverflowError(f"{py_val} out of range for UINT32 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) elif tid == type_id.UINT64: @@ -419,7 +447,7 @@ def _(py_val: int, dtype: DataType | None): raise ValueError("Cannot assign negative value to UINT64 scalar") if py_val > numeric_limits[uint64_t].max(): raise OverflowError(f"{py_val} out of range for UINT64 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj) + c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view()) (c_obj.get()).set_value(py_val) elif tid == type_id.DURATION_NANOSECONDS: @@ -427,7 +455,7 @@ def _(py_val: int, dtype: DataType | None): raise OverflowError( f"{py_val} nanoseconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_ns = duration_ns(py_val) (c_obj.get()).set_value(c_duration_ns) @@ -436,7 +464,7 @@ def _(py_val: int, dtype: DataType | None): raise OverflowError( f"{py_val} microseconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_us = duration_us(py_val) (c_obj.get()).set_value(c_duration_us) @@ -445,7 +473,7 @@ def _(py_val: int, dtype: DataType | None): raise OverflowError( f"{py_val} milliseconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_ms = duration_ms(py_val) (c_obj.get()).set_value(c_duration_ms) @@ -454,7 +482,7 @@ def _(py_val: int, dtype: DataType | None): raise OverflowError( f"{py_val} seconds out of range for INT64 limit." 
) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_s = duration_s(py_val) (c_obj.get()).set_value(c_duration_s) @@ -463,7 +491,7 @@ def _(py_val: int, dtype: DataType | None): raise OverflowError( f"{py_val} days out of range for INT32 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_D = duration_D(py_val) (c_obj.get()).set_value(c_duration_D) @@ -475,8 +503,9 @@ def _(py_val: int, dtype: DataType | None): @_from_py.register(py_bool) -def _(py_val: py_bool, dtype: DataType | None): +def _(py_val: py_bool, dtype: DataType | None, stream: Stream | None): if dtype is None: + stream = _get_stream(stream) dtype = DataType(type_id.BOOL8) elif dtype.id() != type_id.BOOL8: tid = (dtype).id() @@ -484,26 +513,32 @@ def _(py_val: py_bool, dtype: DataType | None): f"Cannot convert bool to Scalar with dtype {tid.name}" ) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar((dtype).c_obj) + stream = _get_stream(stream) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar( + (dtype).c_obj, + stream.view() + ) (c_obj.get()).set_value(py_val) return _new_scalar(move(c_obj), dtype) @_from_py.register(str) -def _(py_val: str, dtype: DataType | None): +def _(py_val: str, dtype: DataType | None, stream: Stream | None): if dtype is None: + stream = _get_stream(stream) dtype = DataType(type_id.STRING) elif dtype.id() != type_id.STRING: tid = (dtype).id() raise TypeError( f"Cannot convert str to Scalar with dtype {tid.name}" ) - cdef unique_ptr[scalar] c_obj = make_string_scalar(py_val.encode()) + stream = _get_stream(stream) + cdef unique_ptr[scalar] c_obj = make_string_scalar(py_val.encode(), stream.view()) return _new_scalar(move(c_obj), dtype) @_from_py.register(datetime.timedelta) -def _(py_val: datetime.timedelta, dtype: DataType | None): +def _(py_val: datetime.timedelta, dtype: DataType | None, stream: Stream | None): cdef unique_ptr[scalar] c_obj cdef duration_us c_duration_us cdef duration_ns c_duration_ns @@ -511,8 +546,10 @@ def _(py_val: datetime.timedelta, dtype: DataType | None): cdef duration_s c_duration_s cdef duration_D c_duration_D if dtype is None: + stream = _get_stream(stream) dtype = DataType(type_id.DURATION_MICROSECONDS) + stream = _get_stream(stream) cdef DataType c_dtype = dtype cdef type_id tid = c_dtype.id() total_seconds = py_val.total_seconds() @@ -522,7 +559,7 @@ def _(py_val: datetime.timedelta, dtype: DataType | None): raise OverflowError( f"{total_nanoseconds} nanoseconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_ns = duration_ns(total_nanoseconds) (c_obj.get()).set_value(c_duration_ns) elif tid == type_id.DURATION_MICROSECONDS: @@ -531,7 +568,7 @@ def _(py_val: datetime.timedelta, dtype: DataType | None): raise OverflowError( f"{total_microseconds} microseconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_us = duration_us(total_microseconds) (c_obj.get()).set_value(c_duration_us) elif tid == type_id.DURATION_MILLISECONDS: @@ -540,7 +577,7 @@ def _(py_val: datetime.timedelta, dtype: DataType | None): raise OverflowError( f"{total_milliseconds} milliseconds out of range for INT64 limit." 
) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_ms = duration_ms(total_milliseconds) (c_obj.get()).set_value(c_duration_ms) elif tid == type_id.DURATION_SECONDS: @@ -549,7 +586,7 @@ def _(py_val: datetime.timedelta, dtype: DataType | None): raise OverflowError( f"{total_seconds} seconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_s = duration_s(total_seconds) (c_obj.get()).set_value(c_duration_s) elif tid == type_id.DURATION_DAYS: @@ -558,7 +595,7 @@ def _(py_val: datetime.timedelta, dtype: DataType | None): raise OverflowError( f"{total_days} days out of range for INT32 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj) + c_obj = make_duration_scalar(c_dtype.c_obj, stream.view()) c_duration_D = duration_D(total_days) (c_obj.get()).set_value(c_duration_D) else: @@ -568,7 +605,7 @@ def _(py_val: datetime.timedelta, dtype: DataType | None): @_from_py.register(datetime.date) -def _(py_val: datetime.date, dtype: DataType | None): +def _(py_val: datetime.date, dtype: DataType | None, stream: Stream | None): cdef unique_ptr[scalar] c_obj cdef duration_us c_duration_us cdef duration_ns c_duration_ns @@ -581,8 +618,10 @@ def _(py_val: datetime.date, dtype: DataType | None): cdef timestamp_ns c_timestamp_ns cdef timestamp_D c_timestamp_D if dtype is None: + stream = _get_stream(stream) dtype = DataType(type_id.TIMESTAMP_MICROSECONDS) + stream = _get_stream(stream) cdef DataType c_dtype = dtype cdef type_id tid = c_dtype.id() if isinstance(py_val, datetime.datetime): @@ -595,7 +634,7 @@ def _(py_val: datetime.date, dtype: DataType | None): raise OverflowError( f"{epoch_nanoseconds} nanoseconds out of range for INT64 limit." ) - c_obj = make_timestamp_scalar(c_dtype.c_obj) + c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view()) c_duration_ns = duration_ns(epoch_nanoseconds) c_timestamp_ns = timestamp_ns(c_duration_ns) (c_obj.get()).set_value(c_timestamp_ns) @@ -605,7 +644,7 @@ def _(py_val: datetime.date, dtype: DataType | None): raise OverflowError( f"{epoch_microseconds} microseconds out of range for INT64 limit." ) - c_obj = make_timestamp_scalar(c_dtype.c_obj) + c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view()) c_duration_us = duration_us(epoch_microseconds) c_timestamp_us = timestamp_us(c_duration_us) (c_obj.get()).set_value(c_timestamp_us) @@ -615,7 +654,7 @@ def _(py_val: datetime.date, dtype: DataType | None): raise OverflowError( f"{epoch_milliseconds} milliseconds out of range for INT64 limit." ) - c_obj = make_timestamp_scalar(c_dtype.c_obj) + c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view()) c_duration_ms = duration_ms(epoch_milliseconds) c_timestamp_ms = timestamp_ms(c_duration_ms) (c_obj.get()).set_value(c_timestamp_ms) @@ -625,7 +664,7 @@ def _(py_val: datetime.date, dtype: DataType | None): raise OverflowError( f"{epoch_seconds} seconds out of range for INT64 limit." ) - c_obj = make_timestamp_scalar(c_dtype.c_obj) + c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view()) c_duration_s = duration_s(epoch_seconds) c_timestamp_s = timestamp_s(c_duration_s) (c_obj.get()).set_value(c_timestamp_s) @@ -635,7 +674,7 @@ def _(py_val: datetime.date, dtype: DataType | None): raise OverflowError( f"{epoch_days} days out of range for INT32 limit." 
) - c_obj = make_timestamp_scalar(c_dtype.c_obj) + c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view()) c_duration_D = duration_D(epoch_days) c_timestamp_D = timestamp_D(c_duration_D) (c_obj.get()).set_value(c_timestamp_D) @@ -646,7 +685,7 @@ def _(py_val: datetime.date, dtype: DataType | None): @_from_py.register(decimal.Decimal) -def _(py_val: decimal.Decimal, dtype: DataType | None): +def _(py_val: decimal.Decimal, dtype: DataType | None, stream: Stream | None): scale = -py_val.as_tuple().exponent as_int = int(py_val.scaleb(scale)) @@ -657,15 +696,17 @@ def _(py_val: decimal.Decimal, dtype: DataType | None): if dtype.id() != type_id.DECIMAL128: raise TypeError("Expected dtype to be DECIMAL128") + stream = _get_stream(stream) cdef unique_ptr[scalar] c_obj = make_fixed_point_scalar[decimal128]( val, - scale_type(scale) + scale_type(scale), + stream.view() ) return _new_scalar(move(c_obj), dtype) @singledispatch -def _from_numpy(np_val): +def _from_numpy(np_val, stream: Stream | None): if np_error is not None: raise np_error raise TypeError(f"{type(np_val).__name__} cannot be converted to pylibcudf.Scalar") @@ -674,109 +715,129 @@ def _from_numpy(np_val): if np is not None: @_from_numpy.register(np.datetime64) @_from_numpy.register(np.timedelta64) - def _(np_val): + def _(np_val, stream: Stream | None): raise NotImplementedError( f"{type(np_val).__name__} is currently not supported." ) @_from_numpy.register(np.bool_) - def _(np_val): + def _(np_val, stream: Stream | None): cdef DataType dtype = DataType(type_id.BOOL8) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + stream = _get_stream(stream) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) cdef cbool c_val = np_val (c_obj.get()).set_value(c_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.str_) - def _(np_val): + def _(np_val, stream: Stream | None): cdef DataType dtype = DataType(type_id.STRING) - cdef unique_ptr[scalar] c_obj = make_string_scalar(np_val.item().encode()) + stream = _get_stream(stream) + cdef unique_ptr[scalar] c_obj = make_string_scalar( + np_val.item().encode(), + stream.view() + ) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.int8) - def _(np_val): + def _(np_val, stream: Stream | None): + stream = _get_stream(stream) dtype = DataType(type_id.INT8) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + stream = _get_stream(stream) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.int16) - def _(np_val): + def _(np_val, stream: Stream | None): + stream = _get_stream(stream) dtype = DataType(type_id.INT16) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.int32) - def _(np_val): + def _(np_val, stream: Stream | None): + stream = _get_stream(stream) dtype = DataType(type_id.INT32) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.int64) - def _(np_val): + def _(np_val, stream: Stream | None): + 
stream = _get_stream(stream) dtype = DataType(type_id.INT64) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.uint8) - def _(np_val): + def _(np_val, stream: Stream | None): + stream = _get_stream(stream) dtype = DataType(type_id.UINT8) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.uint16) - def _(np_val): + def _(np_val, stream: Stream | None): + stream = _get_stream(stream) dtype = DataType(type_id.UINT16) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.uint32) - def _(np_val): + def _(np_val, stream: Stream | None): + stream = _get_stream(stream) dtype = DataType(type_id.UINT32) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.uint64) - def _(np_val): + def _(np_val, stream: Stream | None): + stream = _get_stream(stream) dtype = DataType(type_id.UINT64) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.float32) - def _(np_val): + def _(np_val, stream: Stream | None): + stream = _get_stream(stream) dtype = DataType(type_id.FLOAT32) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.float64) - def _(np_val): + def _(np_val, stream: Stream | None): + stream = _get_stream(stream) dtype = DataType(type_id.FLOAT64) - cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj, stream.view()) (c_obj.get()).set_value(np_val) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr -def _from_arrow(obj: pa.Scalar, dtype: DataType | None = None) -> Scalar: +def _from_arrow( + obj: pa.Scalar, + dtype: DataType | None = None, + stream: Stream | None = None +) -> Scalar: if pa_err is not None: raise RuntimeError( "pyarrow was not found on your system. 
Please " @@ -790,4 +851,4 @@ def _from_arrow(obj: pa.Scalar, dtype: DataType | None = None) -> Scalar: pa_array = pa.array([None], type=obj.type) else: pa_array = pa.array([obj]) - return Column.from_arrow(pa_array, dtype=dtype).to_scalar() + return Column.from_arrow(pa_array, dtype=dtype, stream=stream).to_scalar() diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx index a54480b8e4a..5297696145b 100644 --- a/python/pylibcudf/pylibcudf/strings/capitalize.pyx +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -11,6 +11,8 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.strings cimport capitalize as cpp_capitalize from pylibcudf.scalar cimport Scalar from pylibcudf.strings.char_types cimport string_character_types +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference @@ -39,10 +41,12 @@ cpdef Column capitalize( Column of strings capitalized from the input column """ cdef unique_ptr[column] c_result + cdef Stream stream if delimiters is None: + stream = _get_stream(None) delimiters = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) cdef const string_scalar* cpp_delimiters = ( diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index da78c81c0c0..87650c3bcbd 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.strings cimport combine as cpp_combine from pylibcudf.scalar cimport Scalar from pylibcudf.table cimport Table +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference from pylibcudf.libcudf.strings.combine import \ @@ -62,10 +64,12 @@ cpdef Column concatenate( cdef unique_ptr[column] c_result cdef const string_scalar* c_col_narep cdef const string_scalar* c_separator + cdef Stream stream if narep is None: + stream = _get_stream(None) narep = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) cdef const string_scalar* c_narep = ( narep.c_obj.get() @@ -73,8 +77,9 @@ cpdef Column concatenate( if ColumnOrScalar is Column: if col_narep is None: + stream = _get_stream(None) col_narep = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) c_col_narep = ( col_narep.c_obj.get() diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 7b4c53ed853..7773520d7b3 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from cython.operator import dereference @@ -11,6 +11,9 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( ) from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.scalar cimport Scalar +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream __all__ = ["contains_re", "count_re", "like", "matches_re"] @@ -137,10 +140,12 @@ cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character= New column of boolean results for each string """ cdef unique_ptr[column] result + cdef Stream stream if escape_character is None: + stream = _get_stream(None) escape_character = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) cdef const string_scalar* c_escape_character = ( diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx index 518f72f6644..e2abe69e519 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -14,6 +14,8 @@ from pylibcudf.libcudf.strings.convert cimport ( ) from pylibcudf.scalar cimport Scalar from pylibcudf.types cimport type_id +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference @@ -48,10 +50,12 @@ cpdef Column format_list_column( New strings column """ cdef unique_ptr[column] c_result + cdef Stream stream if na_rep is None: + stream = _get_stream(None) na_rep = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) cdef const string_scalar* c_na_rep = ( diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 3ba6c1b5530..f02dd1c2cdb 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -15,6 +15,8 @@ from pylibcudf.libcudf.strings.replace cimport ( ) from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream __all__ = ["replace", "replace_multiple", "replace_slice"] @@ -144,10 +146,12 @@ cpdef Column replace_slice( New string column """ cdef unique_ptr[column] c_result + cdef Stream stream if repl is None: + stream = _get_stream(None) repl = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) cdef const string_scalar* scalar_str = (repl.c_obj.get()) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx index bdabc779ddf..fd4df37dbac 100644 --- a/python/pylibcudf/pylibcudf/strings/replace_re.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from cython.operator cimport dereference from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -15,6 +15,8 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_flags cimport regex_flags from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream __all__ = ["replace_re", "replace_with_backrefs"] @@ -58,11 +60,13 @@ cpdef Column replace_re( """ cdef unique_ptr[column] c_result cdef vector[string] c_patterns + cdef Stream stream if Patterns is RegexProgram and Replacement is Scalar: if replacement is None: + stream = _get_stream(None) replacement = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) with nogil: c_result = move( diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx index bf09d3963ff..eb6ae8505eb 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyx +++ b/python/pylibcudf/pylibcudf/strings/slice.pyx @@ -81,16 +81,19 @@ cpdef Column slice_strings( elif ColumnOrScalar is Scalar: if start is None: + stream = _get_stream(None) start = Scalar.from_libcudf( - cpp_make_fixed_width_scalar(0) + cpp_make_fixed_width_scalar(0, stream.view()) ) if stop is None: + stream = _get_stream(None) stop = Scalar.from_libcudf( - cpp_make_fixed_width_scalar(0) + cpp_make_fixed_width_scalar(0, stream.view()) ) if step is None: + stream = _get_stream(None) step = Scalar.from_libcudf( - cpp_make_fixed_width_scalar(1) + cpp_make_fixed_width_scalar(1, stream.view()) ) cpp_start = start.c_obj.get() diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx index 75537ea46d3..36337e82102 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column @@ -10,6 +10,8 @@ from pylibcudf.libcudf.strings.split cimport partition as cpp_partition from pylibcudf.libcudf.table.table cimport table from pylibcudf.scalar cimport Scalar from pylibcudf.table cimport Table +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference @@ -39,10 +41,12 @@ cpdef Table partition(Column input, Scalar delimiter=None): cdef const string_scalar* c_delimiter = ( delimiter.c_obj.get() ) + cdef Stream stream if delimiter is None: + stream = _get_stream(None) delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) with nogil: @@ -77,10 +81,12 @@ cpdef Table rpartition(Column input, Scalar delimiter=None): cdef const string_scalar* c_delimiter = ( delimiter.c_obj.get() ) + cdef Stream stream if delimiter is None: + stream = _get_stream(None) delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) with nogil: diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx index 805d959891b..054bed6cd3c 100644 --- a/python/pylibcudf/pylibcudf/strings/strip.pyx +++ b/python/pylibcudf/pylibcudf/strings/strip.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
+# Copyright (c) 2024-2025, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport unique_ptr @@ -12,6 +12,8 @@ from pylibcudf.libcudf.scalar.scalar_factories cimport ( from pylibcudf.libcudf.strings cimport strip as cpp_strip from pylibcudf.scalar cimport Scalar from pylibcudf.strings.side_type cimport side_type +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream __all__ = ["strip"] @@ -41,10 +43,12 @@ cpdef Column strip( pylibcudf.Column New strings column. """ + cdef Stream stream if to_strip is None: + stream = _get_stream(None) to_strip = Scalar.from_libcudf( - cpp_make_string_scalar("".encode()) + cpp_make_string_scalar("".encode(), stream.view()) ) cdef unique_ptr[column] c_result
From c937657d384eced46fb8c75a57aaaca711c3cb4d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 20 Aug 2025 13:50:48 -0400 Subject: [PATCH 177/366] Split up rolling.cuh into separate headers (#19682) Splits `cpp/src/rolling/detail/rolling.cuh` into multiple headers. Moves the jit/udf code into `rolling_udf.cuh`, moves the device operators into `rolling_operators.cuh`, and adjusts the dependent source files. Also cleans up some unnecessary includes. No function or behavior has changed; this only moves internal code around for easier maintenance. Attempts to address https://github.com/rapidsai/cudf/issues/18568 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/19682 --- cpp/src/rolling/detail/rolling.cuh | 726 +----------------- .../rolling/detail/rolling_fixed_window.cu | 3 +- cpp/src/rolling/detail/rolling_operators.cuh | 682 ++++++++++++++++ cpp/src/rolling/detail/rolling_udf.cuh | 118 +++ cpp/src/rolling/detail/rolling_utils.cu | 2 +- .../rolling/detail/rolling_variable_window.cu | 5 +- cpp/src/rolling/grouped_rolling.cu | 13 +- 7 files changed, 806 insertions(+), 743 deletions(-) create mode 100644 cpp/src/rolling/detail/rolling_operators.cuh create mode 100644 cpp/src/rolling/detail/rolling_udf.cuh
diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index c32d831a4f1..e3cff58fac3 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -16,15 +16,12 @@ #pragma once -#include "jit/cache.hpp" -#include "jit/parser.hpp" -#include "jit/util.hpp" #include "lead_lag_nested.cuh" #include "nth_element.cuh" #include "reductions/nested_type_minmax_util.cuh" #include "rolling.hpp" #include "rolling_collect_list.cuh" -#include "rolling_jit.hpp" +#include "rolling_operators.cuh" #include #include @@ -32,20 +29,16 @@ #include #include #include -#include #include #include -#include #include #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -54,418 +47,12 @@ #include #include -#include -#include -#include -#include -#include -#include -#include - -#include - #include namespace cudf { namespace detail { -/** - * @brief Operator for applying a generic (non-specialized) rolling aggregation on a single window.
- */ -template -struct DeviceRolling { - size_type min_periods; - - // what operations do we support - template - static constexpr bool is_supported() - { - return cudf::detail::is_valid_aggregation() && has_corresponding_operator() && - // MIN/MAX only supports fixed width types - (((O == aggregation::MIN || O == aggregation::MAX) && cudf::is_fixed_width()) || - (O == aggregation::SUM) || (O == aggregation::MEAN)); - } - - // operations we do support - template - explicit DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) - : min_periods(_min_periods) - { - } - - // operations we don't support - template - explicit DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) - : min_periods(_min_periods) - { - CUDF_FAIL("Invalid aggregation/type pair"); - } - - // perform the windowing operation - template - bool __device__ operator()(column_device_view const& input, - column_device_view const&, - mutable_column_device_view& output, - size_type start_index, - size_type end_index, - size_type current_index) const - { - using AggOp = typename corresponding_operator::type; - AggOp agg_op; - - cudf::size_type count = 0; - OutputType val = AggOp::template identity(); - - for (size_type j = start_index; j < end_index; j++) { - if (!has_nulls || input.is_valid(j)) { - OutputType element = input.element>(j); - val = agg_op(element, val); - count++; - } - } - - bool output_is_valid = (count >= min_periods); - - if (output_is_valid) { - // store the output value, one per thread, but only if the - // output is valid. min_periods is required to be >= 1, and so - // here, count must be nonzero. We need to avoid storing if - // count is zero since this could cause UB in some aggregations, - // which may cause the compiler to deduce nonsense about the loop - // that increments count. - cudf::detail::rolling_store_output_functor{}( - output.element(current_index), val, count); - } - - return output_is_valid; - } -}; - -/** - * @brief The base struct used for checking if the combination of input type and aggregation op is - * supported. - */ -template -struct DeviceRollingArgMinMaxBase { - size_type min_periods; - explicit DeviceRollingArgMinMaxBase(size_type _min_periods) : min_periods(_min_periods) {} - - static constexpr bool is_supported() - { - // Right now only support ARGMIN/ARGMAX of strings and structs. - auto const type_supported = - std::is_same_v || std::is_same_v; - auto const op_supported = op == aggregation::Kind::ARGMIN || op == aggregation::Kind::ARGMAX; - - return type_supported && op_supported; - } -}; - -/** - * @brief Operator for applying an ARGMAX/ARGMIN rolling aggregation on a single window for string. - */ -template -struct DeviceRollingArgMinMaxString : DeviceRollingArgMinMaxBase { - explicit DeviceRollingArgMinMaxString(size_type _min_periods) - : DeviceRollingArgMinMaxBase(_min_periods) - { - } - using DeviceRollingArgMinMaxBase::min_periods; - - template - bool __device__ operator()(column_device_view const& input, - column_device_view const&, - mutable_column_device_view& output, - size_type start_index, - size_type end_index, - size_type current_index) - { - auto constexpr default_output = (op == aggregation::ARGMIN) ? 
ARGMIN_SENTINEL : ARGMAX_SENTINEL; - - using InputType = cudf::string_view; - using AggOp = typename corresponding_operator::type; - AggOp agg_op; - - cudf::size_type count = 0; - InputType val = AggOp::template identity(); - OutputType val_index = default_output; - - for (size_type j = start_index; j < end_index; j++) { - if (!has_nulls || input.is_valid(j)) { - InputType element = input.element(j); - val = agg_op(element, val); - if (val == element) { val_index = j; } - count++; - } - } - - bool output_is_valid = (count >= min_periods); - // Use the sentinel value (i.e., -1) for the output will help identify null elements while - // gathering for Min and Max. - output.element(current_index) = output_is_valid ? val_index : default_output; - - // The gather mask shouldn't contain null values, so - // always return zero - return true; - } -}; - -/** - * @brief Operator for applying an ARGMAX/ARGMIN rolling aggregation on a single window for struct. - */ -template -struct DeviceRollingArgMinMaxStruct : DeviceRollingArgMinMaxBase { - DeviceRollingArgMinMaxStruct(size_type _min_periods, Comparator const& _comp) - : DeviceRollingArgMinMaxBase(_min_periods), comp(_comp) - { - } - using DeviceRollingArgMinMaxBase::min_periods; - Comparator comp; - - template - bool __device__ operator()(column_device_view const& input, - column_device_view const&, - mutable_column_device_view& output, - size_type start_index, - size_type end_index, - size_type current_index) - { - auto constexpr default_output = (op == aggregation::ARGMIN) ? ARGMIN_SENTINEL : ARGMAX_SENTINEL; - - auto const valid_count = - has_nulls ? thrust::count_if(thrust::seq, - thrust::make_counting_iterator(start_index), - thrust::make_counting_iterator(end_index), - [&input](size_type idx) { return input.is_valid_nocheck(idx); }) - : end_index - start_index; - - // Use the sentinel value (i.e., -1) for the output will help identify null elements while - // gathering for Min and Max. - output.element(current_index) = - (valid_count >= min_periods) ? thrust::reduce(thrust::seq, - thrust::make_counting_iterator(start_index), - thrust::make_counting_iterator(end_index), - size_type{start_index}, - comp) - : default_output; - - // The gather mask shouldn't contain null values, so always return true. - return true; - } -}; - -/** - * @brief Operator for applying a COUNT_VALID rolling aggregation on a single window. - */ -template -struct DeviceRollingCountValid { - size_type min_periods; - - // what operations do we support - template - static constexpr bool is_supported() - { - return true; - } - - DeviceRollingCountValid(size_type _min_periods) : min_periods(_min_periods) {} - - template - bool __device__ operator()(column_device_view const& input, - column_device_view const&, - mutable_column_device_view& output, - size_type start_index, - size_type end_index, - size_type current_index) - { - bool output_is_valid = ((end_index - start_index) >= min_periods); - - if (output_is_valid) { - cudf::size_type count = 0; - - if (!has_nulls) { - count = end_index - start_index; - } else { - count = thrust::count_if(thrust::seq, - thrust::make_counting_iterator(start_index), - thrust::make_counting_iterator(end_index), - [&input](auto i) { return input.is_valid_nocheck(i); }); - } - output.element(current_index) = count; - } - - return output_is_valid; - } -}; - -/** - * @brief Operator for applying a COUNT_ALL rolling aggregation on a single window. 
- */ -template -struct DeviceRollingCountAll { - size_type min_periods; - - // what operations do we support - template - static constexpr bool is_supported() - { - return true; - } - - DeviceRollingCountAll(size_type _min_periods) : min_periods(_min_periods) {} - - template - bool __device__ operator()(column_device_view const&, - column_device_view const&, - mutable_column_device_view& output, - size_type start_index, - size_type end_index, - size_type current_index) - { - cudf::size_type count = end_index - start_index; - - bool output_is_valid = count >= min_periods; - output.element(current_index) = count; - - return output_is_valid; - } -}; - -/** - * @brief Operator for applying a VAR rolling aggregation on a single window. - */ -template -struct DeviceRollingVariance { - size_type const min_periods; - size_type const ddof; - - // what operations do we support - template - static constexpr bool is_supported() - { - return is_fixed_width() and not is_chrono(); - } - - DeviceRollingVariance(size_type _min_periods, size_type _ddof) - : min_periods(_min_periods), ddof{_ddof} - { - } - - template - bool __device__ operator()(column_device_view const& input, - column_device_view const&, - mutable_column_device_view& output, - size_type start_index, - size_type end_index, - size_type current_index) const - { - using DeviceInputType = device_storage_type_t; - - // valid counts in the window - cudf::size_type const count = - has_nulls ? thrust::count_if(thrust::seq, - thrust::make_counting_iterator(start_index), - thrust::make_counting_iterator(end_index), - [&input](auto i) { return input.is_valid_nocheck(i); }) - : end_index - start_index; - - // Result will be null if any of the following conditions are met: - // - All inputs are null - // - Number of valid inputs is less than `min_periods` - bool output_is_valid = count > 0 and (count >= min_periods); - - if (output_is_valid) { - if (count >= ddof) { - // Welford algorithm - // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - OutputType m{0}, m2{0}; - size_type running_count{0}; - - for (size_type i = start_index; i < end_index; i++) { - if (has_nulls and input.is_null_nocheck(i)) { continue; } - - OutputType const x = static_cast(input.element(i)); - - running_count++; - OutputType const tmp1 = x - m; - m += tmp1 / running_count; - OutputType const tmp2 = x - m; - m2 += tmp1 * tmp2; - } - if constexpr (is_fixed_point()) { - // For fixed_point types, the previous computed value used unscaled rep-value, - // the final result should be multiplied by the square of decimal `scale`. - OutputType scaleby = exp10(static_cast(input.type().scale())); - scaleby *= scaleby; - output.element(current_index) = m2 / (count - ddof) * scaleby; - } else { - output.element(current_index) = m2 / (count - ddof); - } - } else { - output.element(current_index) = - cuda::std::numeric_limits::signaling_NaN(); - } - } - - return output_is_valid; - } -}; - -/** - * @brief Operator for applying a ROW_NUMBER rolling aggregation on a single window. 
- */ -template -struct DeviceRollingRowNumber { - size_type min_periods; - - // what operations do we support - template - static constexpr bool is_supported() - { - return true; - } - - DeviceRollingRowNumber(size_type _min_periods) : min_periods(_min_periods) {} - - template - bool __device__ operator()(column_device_view const&, - column_device_view const&, - mutable_column_device_view& output, - size_type start_index, - size_type end_index, - size_type current_index) - { - bool output_is_valid = end_index - start_index >= min_periods; - output.element(current_index) = current_index - start_index + 1; - - return output_is_valid; - } -}; - -struct agg_specific_empty_output { - template - std::unique_ptr operator()(column_view const& input, rolling_aggregation const&) const - { - using target_type = cudf::detail::target_type_t; - - if constexpr (std::is_same_v, void>) { - CUDF_FAIL("Unsupported combination of column-type and aggregation."); - } - - if constexpr (cudf::is_fixed_width()) { - return cudf::make_empty_column(type_to_id()); - } - - if constexpr (op == aggregation::COLLECT_LIST) { - return cudf::make_lists_column( - 0, make_empty_column(type_to_id()), empty_like(input), 0, {}); - } - - return empty_like(input); - } -}; - static std::unique_ptr empty_output_for_rolling_aggregation(column_view const& input, rolling_aggregation const& agg) { @@ -485,241 +72,6 @@ static std::unique_ptr empty_output_for_rolling_aggregation(column_view input.type(), agg.kind, agg_specific_empty_output{}, input, agg); } -/** - * @brief Operator for applying a LEAD rolling aggregation on a single window. - */ -template -struct DeviceRollingLead { - size_type row_offset; - - // what operations do we support - template - static constexpr bool is_supported() - { - return cudf::is_fixed_width(); - } - - template - DeviceRollingLead(size_type _row_offset) - requires(is_supported()) - : row_offset(_row_offset) - { - } - - template - DeviceRollingLead(size_type _row_offset) - requires(!is_supported()) - : row_offset(_row_offset) - { - CUDF_FAIL("Invalid aggregation/type pair"); - } - - template - bool __device__ operator()(column_device_view const& input, - column_device_view const& default_outputs, - mutable_column_device_view& output, - size_type, - size_type end_index, - size_type current_index) - { - // Offsets have already been normalized. - - // Check if row is invalid. - if (row_offset > (end_index - current_index - 1)) { - // Invalid row marked. Use default value, if available. - if (default_outputs.size() == 0 || default_outputs.is_null(current_index)) { return false; } - - output.element(current_index) = - default_outputs.element(current_index); - return true; - } - - // Not an invalid row. - auto index = current_index + row_offset; - auto is_null = input.is_null(index); - if (!is_null) { - output.element(current_index) = - input.element>(index); - } - return !is_null; - } -}; - -/** - * @brief Operator for applying a LAG rolling aggregation on a single window. 
- */ -template -struct DeviceRollingLag { - size_type row_offset; - - // what operations do we support - template - static constexpr bool is_supported() - { - return cudf::is_fixed_width(); - } - - template - DeviceRollingLag(size_type _row_offset) - requires(is_supported()) - : row_offset(_row_offset) - { - } - - template - DeviceRollingLag(size_type _row_offset) - requires(!is_supported()) - : row_offset(_row_offset) - { - CUDF_FAIL("Invalid aggregation/type pair"); - } - - template - bool __device__ operator()(column_device_view const& input, - column_device_view const& default_outputs, - mutable_column_device_view& output, - size_type start_index, - size_type, - size_type current_index) - { - // Offsets have already been normalized. - - // Check if row is invalid. - if (row_offset > (current_index - start_index)) { - // Invalid row marked. Use default value, if available. - if (default_outputs.size() == 0 || default_outputs.is_null(current_index)) { return false; } - - output.element(current_index) = - default_outputs.element(current_index); - return true; - } - - // Not an invalid row. - auto index = current_index - row_offset; - auto is_null = input.is_null(index); - if (!is_null) { - output.element(current_index) = - input.element>(index); - } - return !is_null; - } -}; - -/** - * @brief Maps an `InputType and `aggregation::Kind` value to its corresponding - * rolling window operator. - * - * @tparam InputType The input type to map to its corresponding operator - * @tparam k The `aggregation::Kind` value to map to its corresponding operator - */ -template -struct corresponding_rolling_operator { - using type = DeviceRolling; -}; - -template -struct corresponding_rolling_operator { - using type = DeviceRollingArgMinMaxBase; -}; - -template -struct corresponding_rolling_operator { - using type = DeviceRollingArgMinMaxBase; -}; - -template -struct corresponding_rolling_operator { - using type = DeviceRollingCountValid; -}; - -template -struct corresponding_rolling_operator { - using type = DeviceRollingCountAll; -}; - -template -struct corresponding_rolling_operator { - using type = DeviceRollingRowNumber; -}; - -template -struct corresponding_rolling_operator { - using type = DeviceRollingVariance; -}; - -template -struct corresponding_rolling_operator { - using type = DeviceRollingLead; -}; - -template -struct corresponding_rolling_operator { - using type = DeviceRollingLag; -}; - -/** - * @brief Functor for creating a device rolling operator based on input type and aggregation type. 
- */ -template -struct create_rolling_operator { - auto operator()(size_type min_periods, rolling_aggregation const&) - { - return typename corresponding_rolling_operator::type(min_periods); - } -}; - -template -struct create_rolling_operator { - auto operator()(size_type min_periods, rolling_aggregation const& agg) - { - return DeviceRollingVariance{ - min_periods, dynamic_cast(agg)._ddof}; - } -}; - -template -struct create_rolling_operator { - auto operator()(size_type, rolling_aggregation const& agg) - { - return DeviceRollingLead{ - dynamic_cast(agg).row_offset}; - } -}; - -template -struct create_rolling_operator { - auto operator()(size_type, rolling_aggregation const& agg) - { - return DeviceRollingLag{ - dynamic_cast(agg).row_offset}; - } -}; - -template -struct create_rolling_operator< - InputType, - k, - typename std::enable_if_t && - (k == aggregation::Kind::ARGMIN || k == aggregation::Kind::ARGMAX)>> { - auto operator()(size_type min_periods, rolling_aggregation const&) - { - return DeviceRollingArgMinMaxString{min_periods}; - } -}; - -template -struct create_rolling_operator< - InputType, - k, - typename std::enable_if_t && - (k == aggregation::Kind::ARGMIN || k == aggregation::Kind::ARGMAX)>> { - template - auto operator()(size_type min_periods, Comparator const& comp) - { - return DeviceRollingArgMinMaxStruct{min_periods, comp}; - } -}; - /** * @brief Rolling window specific implementation of simple_aggregations_collector. * @@ -1229,82 +581,6 @@ struct dispatch_rolling { } }; -// Applies a user-defined rolling window function to the values in a column. -template -std::unique_ptr rolling_window_udf(column_view const& input, - PrecedingWindowIterator preceding_window, - std::string const& preceding_window_str, - FollowingWindowIterator following_window, - std::string const& following_window_str, - size_type min_periods, - rolling_aggregation const& agg, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - static_assert(warp_size == cudf::detail::size_in_bits(), - "bitmask_type size does not match CUDA warp size"); - - if (input.has_nulls()) { - CUDF_FAIL("Currently the UDF version of rolling window does NOT support inputs with nulls."); - } - - min_periods = std::max(min_periods, 0); - - auto& udf_agg = dynamic_cast(agg); - - std::string hash = "prog_rolling." 
+ std::to_string(std::hash{}(udf_agg._source)); - - std::string cuda_source; - switch (udf_agg.kind) { - case aggregation::Kind::PTX: - cuda_source += - cudf::jit::parse_single_function_ptx(udf_agg._source, - udf_agg._function_name, - {{0, cudf::type_to_name(udf_agg._output_type) + " *"}, - {5, "void const *"}}); // args 0 and 5 are pointers - break; - case aggregation::Kind::CUDA: - cuda_source += cudf::jit::parse_single_function_cuda(udf_agg._source, udf_agg._function_name); - break; - default: CUDF_FAIL("Unsupported UDF type."); - } - - std::unique_ptr output = make_numeric_column( - udf_agg._output_type, input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); - - auto output_view = output->mutable_view(); - cudf::detail::device_scalar device_valid_count{0, stream}; - - std::string kernel_name = - jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new") // - .instantiate(cudf::type_to_name(input.type()), // list of template arguments - cudf::type_to_name(output->type()), - udf_agg._operator_name, - preceding_window_str.c_str(), - following_window_str.c_str()); - - cudf::jit::get_program_cache(*rolling_jit_kernel_cu_jit) - .get_kernel( - kernel_name, {}, {{"rolling/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."}) // - ->configure_1d_max_occupancy(0, 0, nullptr, stream.value()) // - ->launch(input.size(), - cudf::jit::get_data_ptr(input), - input.null_mask(), - cudf::jit::get_data_ptr(output_view), - output_view.null_mask(), - device_valid_count.data(), - preceding_window, - following_window, - min_periods); - - output->set_null_count(output->size() - device_valid_count.value(stream)); - - // check the stream for debugging - CUDF_CHECK_CUDA(stream.value()); - - return output; -} - /** * @copydoc cudf::rolling_window(column_view const& input, * PrecedingWindowIterator preceding_window_begin, diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu index 7526c858899..806d67bb7bb 100644 --- a/cpp/src/rolling/detail/rolling_fixed_window.cu +++ b/cpp/src/rolling/detail/rolling_fixed_window.cu @@ -15,6 +15,7 @@ */ #include "rolling.cuh" +#include "rolling_udf.cuh" #include "rolling_utils.cuh" #include @@ -23,8 +24,6 @@ #include #include -#include - namespace cudf::detail { // Applies a fixed-size rolling window function to the values in a column. diff --git a/cpp/src/rolling/detail/rolling_operators.cuh b/cpp/src/rolling/detail/rolling_operators.cuh new file mode 100644 index 00000000000..dea196ab676 --- /dev/null +++ b/cpp/src/rolling/detail/rolling_operators.cuh @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "rolling.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace cudf { + +namespace detail { + +/** + * @brief Operator for applying a generic (non-specialized) rolling aggregation on a single window. + */ +template +struct DeviceRolling { + size_type min_periods; + + // what operations do we support + template + static constexpr bool is_supported() + { + return cudf::detail::is_valid_aggregation() && has_corresponding_operator() && + // MIN/MAX only supports fixed width types + (((O == aggregation::MIN || O == aggregation::MAX) && cudf::is_fixed_width()) || + (O == aggregation::SUM) || (O == aggregation::MEAN)); + } + + // operations we do support + template + explicit DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) + : min_periods(_min_periods) + { + } + + // operations we don't support + template + explicit DeviceRolling(size_type _min_periods, std::enable_if_t()>* = nullptr) + : min_periods(_min_periods) + { + CUDF_FAIL("Invalid aggregation/type pair"); + } + + // perform the windowing operation + template + bool __device__ operator()(column_device_view const& input, + column_device_view const&, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) const + { + using AggOp = typename corresponding_operator::type; + AggOp agg_op; + + cudf::size_type count = 0; + OutputType val = AggOp::template identity(); + + for (size_type j = start_index; j < end_index; j++) { + if (!has_nulls || input.is_valid(j)) { + OutputType element = input.element>(j); + val = agg_op(element, val); + count++; + } + } + + bool output_is_valid = (count >= min_periods); + + if (output_is_valid) { + // store the output value, one per thread, but only if the + // output is valid. min_periods is required to be >= 1, and so + // here, count must be nonzero. We need to avoid storing if + // count is zero since this could cause UB in some aggregations, + // which may cause the compiler to deduce nonsense about the loop + // that increments count. + cudf::detail::rolling_store_output_functor{}( + output.element(current_index), val, count); + } + + return output_is_valid; + } +}; + +/** + * @brief The base struct used for checking if the combination of input type and aggregation op is + * supported. + */ +template +struct DeviceRollingArgMinMaxBase { + size_type min_periods; + explicit DeviceRollingArgMinMaxBase(size_type _min_periods) : min_periods(_min_periods) {} + + static constexpr bool is_supported() + { + // Right now only support ARGMIN/ARGMAX of strings and structs. + auto const type_supported = + std::is_same_v || std::is_same_v; + auto const op_supported = op == aggregation::Kind::ARGMIN || op == aggregation::Kind::ARGMAX; + + return type_supported && op_supported; + } +}; + +/** + * @brief Operator for applying an ARGMAX/ARGMIN rolling aggregation on a single window for string. 
+ */ +template +struct DeviceRollingArgMinMaxString : DeviceRollingArgMinMaxBase { + explicit DeviceRollingArgMinMaxString(size_type _min_periods) + : DeviceRollingArgMinMaxBase(_min_periods) + { + } + using DeviceRollingArgMinMaxBase::min_periods; + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const&, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + auto constexpr default_output = (op == aggregation::ARGMIN) ? ARGMIN_SENTINEL : ARGMAX_SENTINEL; + + using InputType = cudf::string_view; + using AggOp = typename corresponding_operator::type; + AggOp agg_op; + + cudf::size_type count = 0; + InputType val = AggOp::template identity(); + OutputType val_index = default_output; + + for (size_type j = start_index; j < end_index; j++) { + if (!has_nulls || input.is_valid(j)) { + InputType element = input.element(j); + val = agg_op(element, val); + if (val == element) { val_index = j; } + count++; + } + } + + bool output_is_valid = (count >= min_periods); + // Use the sentinel value (i.e., -1) for the output will help identify null elements while + // gathering for Min and Max. + output.element(current_index) = output_is_valid ? val_index : default_output; + + // The gather mask shouldn't contain null values, so + // always return zero + return true; + } +}; + +/** + * @brief Operator for applying an ARGMAX/ARGMIN rolling aggregation on a single window for struct. + */ +template +struct DeviceRollingArgMinMaxStruct : DeviceRollingArgMinMaxBase { + DeviceRollingArgMinMaxStruct(size_type _min_periods, Comparator const& _comp) + : DeviceRollingArgMinMaxBase(_min_periods), comp(_comp) + { + } + using DeviceRollingArgMinMaxBase::min_periods; + Comparator comp; + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const&, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + auto constexpr default_output = (op == aggregation::ARGMIN) ? ARGMIN_SENTINEL : ARGMAX_SENTINEL; + + auto const valid_count = + has_nulls ? thrust::count_if(thrust::seq, + thrust::make_counting_iterator(start_index), + thrust::make_counting_iterator(end_index), + [&input](size_type idx) { return input.is_valid_nocheck(idx); }) + : end_index - start_index; + + // Use the sentinel value (i.e., -1) for the output will help identify null elements while + // gathering for Min and Max. + output.element(current_index) = + (valid_count >= min_periods) ? thrust::reduce(thrust::seq, + thrust::make_counting_iterator(start_index), + thrust::make_counting_iterator(end_index), + size_type{start_index}, + comp) + : default_output; + + // The gather mask shouldn't contain null values, so always return true. + return true; + } +}; + +/** + * @brief Operator for applying a COUNT_VALID rolling aggregation on a single window. 
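The ARGMIN/ARGMAX operators above reduce each window to a row index and write -1 (`ARGMIN_SENTINEL`/`ARGMAX_SENTINEL`) when the window holds fewer than `min_periods` valid rows; the sentinel marks rows to null out during the later gather. A plain-Python rendering of the string ARGMIN case (a sketch only; on ties the device operator keeps the most recent index, since it re-records the index whenever the running minimum equals the current element, while this sketch keeps the first):

    ARGMIN_SENTINEL = -1

    def window_argmin(column, start, end, min_periods):
        best = None                  # running minimum value
        best_idx = ARGMIN_SENTINEL   # index of the running minimum
        valid = 0                    # non-null rows seen in the window
        for j in range(start, end):
            v = column[j]
            if v is None:            # nulls are skipped, as in the device loop
                continue
            valid += 1
            if best is None or v < best:
                best, best_idx = v, j
        return best_idx if valid >= min_periods else ARGMIN_SENTINEL

    # window_argmin(["b", None, "a", "c"], 0, 4, min_periods=2) -> 2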
+ */ +template +struct DeviceRollingCountValid { + size_type min_periods; + + // what operations do we support + template + static constexpr bool is_supported() + { + return true; + } + + DeviceRollingCountValid(size_type _min_periods) : min_periods(_min_periods) {} + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const&, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + bool output_is_valid = ((end_index - start_index) >= min_periods); + + if (output_is_valid) { + cudf::size_type count = 0; + + if (!has_nulls) { + count = end_index - start_index; + } else { + count = thrust::count_if(thrust::seq, + thrust::make_counting_iterator(start_index), + thrust::make_counting_iterator(end_index), + [&input](auto i) { return input.is_valid_nocheck(i); }); + } + output.element(current_index) = count; + } + + return output_is_valid; + } +}; + +/** + * @brief Operator for applying a COUNT_ALL rolling aggregation on a single window. + */ +template +struct DeviceRollingCountAll { + size_type min_periods; + + // what operations do we support + template + static constexpr bool is_supported() + { + return true; + } + + DeviceRollingCountAll(size_type _min_periods) : min_periods(_min_periods) {} + + template + bool __device__ operator()(column_device_view const&, + column_device_view const&, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + cudf::size_type count = end_index - start_index; + + bool output_is_valid = count >= min_periods; + output.element(current_index) = count; + + return output_is_valid; + } +}; + +/** + * @brief Operator for applying a VAR rolling aggregation on a single window. + */ +template +struct DeviceRollingVariance { + size_type const min_periods; + size_type const ddof; + + // what operations do we support + template + static constexpr bool is_supported() + { + return is_fixed_width() and not is_chrono(); + } + + DeviceRollingVariance(size_type _min_periods, size_type _ddof) + : min_periods(_min_periods), ddof{_ddof} + { + } + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const&, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) const + { + using DeviceInputType = device_storage_type_t; + + // valid counts in the window + cudf::size_type const count = + has_nulls ? 
thrust::count_if(thrust::seq, + thrust::make_counting_iterator(start_index), + thrust::make_counting_iterator(end_index), + [&input](auto i) { return input.is_valid_nocheck(i); }) + : end_index - start_index; + + // Result will be null if any of the following conditions are met: + // - All inputs are null + // - Number of valid inputs is less than `min_periods` + bool output_is_valid = count > 0 and (count >= min_periods); + + if (output_is_valid) { + if (count >= ddof) { + // Welford algorithm + // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + OutputType m{0}, m2{0}; + size_type running_count{0}; + + for (size_type i = start_index; i < end_index; i++) { + if (has_nulls and input.is_null_nocheck(i)) { continue; } + + OutputType const x = static_cast(input.element(i)); + + running_count++; + OutputType const tmp1 = x - m; + m += tmp1 / running_count; + OutputType const tmp2 = x - m; + m2 += tmp1 * tmp2; + } + if constexpr (is_fixed_point()) { + // For fixed_point types, the previous computed value used unscaled rep-value, + // the final result should be multiplied by the square of decimal `scale`. + OutputType scaleby = exp10(static_cast(input.type().scale())); + scaleby *= scaleby; + output.element(current_index) = m2 / (count - ddof) * scaleby; + } else { + output.element(current_index) = m2 / (count - ddof); + } + } else { + output.element(current_index) = + cuda::std::numeric_limits::signaling_NaN(); + } + } + + return output_is_valid; + } +}; + +/** + * @brief Operator for applying a ROW_NUMBER rolling aggregation on a single window. + */ +template +struct DeviceRollingRowNumber { + size_type min_periods; + + // what operations do we support + template + static constexpr bool is_supported() + { + return true; + } + + DeviceRollingRowNumber(size_type _min_periods) : min_periods(_min_periods) {} + + template + bool __device__ operator()(column_device_view const&, + column_device_view const&, + mutable_column_device_view& output, + size_type start_index, + size_type end_index, + size_type current_index) + { + bool output_is_valid = end_index - start_index >= min_periods; + output.element(current_index) = current_index - start_index + 1; + + return output_is_valid; + } +}; + +struct agg_specific_empty_output { + template + std::unique_ptr operator()(column_view const& input, rolling_aggregation const&) const + { + using target_type = cudf::detail::target_type_t; + + if constexpr (std::is_same_v, void>) { + CUDF_FAIL("Unsupported combination of column-type and aggregation."); + } + + if constexpr (cudf::is_fixed_width()) { + return cudf::make_empty_column(type_to_id()); + } + + if constexpr (op == aggregation::COLLECT_LIST) { + return cudf::make_lists_column( + 0, make_empty_column(type_to_id()), empty_like(input), 0, {}); + } + + return empty_like(input); + } +}; + +/** + * @brief Operator for applying a LEAD rolling aggregation on a single window. 
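`DeviceRollingVariance` above computes each window's variance in one pass with Welford's algorithm, carrying a running mean `m` and a running sum of squared deviations `m2`. A minimal Python sketch of the same update, skipping nulls the way the device loop does (it differs from the device code in two details: it returns NaN explicitly whenever `count <= ddof`, and it ignores the fixed-point rescaling branch):

    def window_variance(window, ddof=1, min_periods=1):
        count, m, m2 = 0, 0.0, 0.0
        for x in window:
            if x is None:           # null rows do not contribute
                continue
            count += 1
            d1 = x - m
            m += d1 / count         # update the running mean
            m2 += d1 * (x - m)      # update the sum of squared deviations
        if count < min_periods or count <= ddof:
            return float("nan")     # the device code nulls or NaNs the row here
        return m2 / (count - ddof)

    # window_variance([1.0, 2.0, None, 4.0]) -> 2.333...  (ddof=1 variance of [1, 2, 4])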
+ */ +template +struct DeviceRollingLead { + size_type row_offset; + + // what operations do we support + template + static constexpr bool is_supported() + { + return cudf::is_fixed_width(); + } + + template + DeviceRollingLead(size_type _row_offset) + requires(is_supported()) + : row_offset(_row_offset) + { + } + + template + DeviceRollingLead(size_type _row_offset) + requires(!is_supported()) + : row_offset(_row_offset) + { + CUDF_FAIL("Invalid aggregation/type pair"); + } + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const& default_outputs, + mutable_column_device_view& output, + size_type, + size_type end_index, + size_type current_index) + { + // Offsets have already been normalized. + + // Check if row is invalid. + if (row_offset > (end_index - current_index - 1)) { + // Invalid row marked. Use default value, if available. + if (default_outputs.size() == 0 || default_outputs.is_null(current_index)) { return false; } + + output.element(current_index) = + default_outputs.element(current_index); + return true; + } + + // Not an invalid row. + auto index = current_index + row_offset; + auto is_null = input.is_null(index); + if (!is_null) { + output.element(current_index) = + input.element>(index); + } + return !is_null; + } +}; + +/** + * @brief Operator for applying a LAG rolling aggregation on a single window. + */ +template +struct DeviceRollingLag { + size_type row_offset; + + // what operations do we support + template + static constexpr bool is_supported() + { + return cudf::is_fixed_width(); + } + + template + DeviceRollingLag(size_type _row_offset) + requires(is_supported()) + : row_offset(_row_offset) + { + } + + template + DeviceRollingLag(size_type _row_offset) + requires(!is_supported()) + : row_offset(_row_offset) + { + CUDF_FAIL("Invalid aggregation/type pair"); + } + + template + bool __device__ operator()(column_device_view const& input, + column_device_view const& default_outputs, + mutable_column_device_view& output, + size_type start_index, + size_type, + size_type current_index) + { + // Offsets have already been normalized. + + // Check if row is invalid. + if (row_offset > (current_index - start_index)) { + // Invalid row marked. Use default value, if available. + if (default_outputs.size() == 0 || default_outputs.is_null(current_index)) { return false; } + + output.element(current_index) = + default_outputs.element(current_index); + return true; + } + + // Not an invalid row. + auto index = current_index - row_offset; + auto is_null = input.is_null(index); + if (!is_null) { + output.element(current_index) = + input.element>(index); + } + return !is_null; + } +}; + +/** + * @brief Maps an `InputType and `aggregation::Kind` value to its corresponding + * rolling window operator. 
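`DeviceRollingLead` and `DeviceRollingLag` above shift each row's value forward or backward by `row_offset`, falling back to the `default_outputs` column (or null) where the shifted index leaves the window. Stripped of grouping, window bounds, and validity plumbing, the row-wise effect is simply (illustrative sketch):

    def lead(values, offset, default=None):
        n = len(values)
        return [values[i + offset] if i + offset < n else default for i in range(n)]

    def lag(values, offset, default=None):
        return [values[i - offset] if i >= offset else default for i in range(len(values))]

    # lead([10, 20, 30], 1) -> [20, 30, None]
    # lag([10, 20, 30], 1)  -> [None, 10, 20]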
+ *
+ * @tparam InputType The input type to map to its corresponding operator
+ * @tparam k The `aggregation::Kind` value to map to its corresponding operator
+ */
+template <typename InputType, aggregation::Kind k>
+struct corresponding_rolling_operator {
+  using type = DeviceRolling<InputType, k>;
+};
+
+template <typename InputType>
+struct corresponding_rolling_operator<InputType, aggregation::Kind::ARGMIN> {
+  using type = DeviceRollingArgMinMaxBase<InputType, aggregation::Kind::ARGMIN>;
+};
+
+template <typename InputType>
+struct corresponding_rolling_operator<InputType, aggregation::Kind::ARGMAX> {
+  using type = DeviceRollingArgMinMaxBase<InputType, aggregation::Kind::ARGMAX>;
+};
+
+template <typename InputType>
+struct corresponding_rolling_operator<InputType, aggregation::Kind::COUNT_VALID> {
+  using type = DeviceRollingCountValid<InputType>;
+};
+
+template <typename InputType>
+struct corresponding_rolling_operator<InputType, aggregation::Kind::COUNT_ALL> {
+  using type = DeviceRollingCountAll<InputType>;
+};
+
+template <typename InputType>
+struct corresponding_rolling_operator<InputType, aggregation::Kind::ROW_NUMBER> {
+  using type = DeviceRollingRowNumber<InputType>;
+};
+
+template <typename InputType>
+struct corresponding_rolling_operator<InputType, aggregation::Kind::VARIANCE> {
+  using type = DeviceRollingVariance<InputType>;
+};
+
+template <typename InputType>
+struct corresponding_rolling_operator<InputType, aggregation::Kind::LEAD> {
+  using type = DeviceRollingLead<InputType>;
+};
+
+template <typename InputType>
+struct corresponding_rolling_operator<InputType, aggregation::Kind::LAG> {
+  using type = DeviceRollingLag<InputType>;
+};
+
+/**
+ * @brief Functor for creating a device rolling operator based on input type and aggregation type.
+ */
+template <typename InputType, aggregation::Kind k, typename Enable = void>
+struct create_rolling_operator {
+  auto operator()(size_type min_periods, rolling_aggregation const&)
+  {
+    return typename corresponding_rolling_operator<InputType, k>::type(min_periods);
+  }
+};
+
+template <typename InputType>
+struct create_rolling_operator<InputType, aggregation::Kind::VARIANCE> {
+  auto operator()(size_type min_periods, rolling_aggregation const& agg)
+  {
+    return DeviceRollingVariance<InputType>{
+      min_periods, dynamic_cast<cudf::detail::var_aggregation const&>(agg)._ddof};
+  }
+};
+
+template <typename InputType>
+struct create_rolling_operator<InputType, aggregation::Kind::LEAD> {
+  auto operator()(size_type, rolling_aggregation const& agg)
+  {
+    return DeviceRollingLead<InputType>{
+      dynamic_cast<cudf::detail::lead_lag_aggregation const&>(agg).row_offset};
+  }
+};
+
+template <typename InputType>
+struct create_rolling_operator<InputType, aggregation::Kind::LAG> {
+  auto operator()(size_type, rolling_aggregation const& agg)
+  {
+    return DeviceRollingLag<InputType>{
+      dynamic_cast<cudf::detail::lead_lag_aggregation const&>(agg).row_offset};
+  }
+};
+
+template <typename InputType, aggregation::Kind k>
+struct create_rolling_operator<
+  InputType,
+  k,
+  typename std::enable_if_t<std::is_same_v<InputType, cudf::string_view> &&
+                            (k == aggregation::Kind::ARGMIN || k == aggregation::Kind::ARGMAX)>> {
+  auto operator()(size_type min_periods, rolling_aggregation const&)
+  {
+    return DeviceRollingArgMinMaxString<k>{min_periods};
+  }
+};
+
+template <typename InputType, aggregation::Kind k>
+struct create_rolling_operator<
+  InputType,
+  k,
+  typename std::enable_if_t<std::is_same_v<InputType, cudf::struct_view> &&
+                            (k == aggregation::Kind::ARGMIN || k == aggregation::Kind::ARGMAX)>> {
+  template <typename Comparator>
+  auto operator()(size_type min_periods, Comparator const& comp)
+  {
+    return DeviceRollingArgMinMaxStruct<k, Comparator>{min_periods, comp};
+  }
+};
+
+}  // namespace detail
+
+}  // namespace cudf
diff --git a/cpp/src/rolling/detail/rolling_udf.cuh b/cpp/src/rolling/detail/rolling_udf.cuh
new file mode 100644
index 00000000000..f5c3f3a4b70
--- /dev/null
+++ b/cpp/src/rolling/detail/rolling_udf.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "jit/cache.hpp"
+#include "jit/parser.hpp"
+#include "jit/util.hpp"
+#include "rolling.hpp"
+#include "rolling_jit.hpp"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/device_scalar.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/utilities/bit.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <rmm/resource_ref.hpp>
+
+#include <jit_preprocessed_files/rolling/jit/kernel.cu.jit.hpp>
+
+namespace cudf {
+namespace detail {
+
+// Applies a user-defined rolling window function to the values in a column.
+template <typename PrecedingWindowIterator, typename FollowingWindowIterator>
+std::unique_ptr<column> rolling_window_udf(column_view const& input,
+                                           PrecedingWindowIterator preceding_window,
+                                           std::string const& preceding_window_str,
+                                           FollowingWindowIterator following_window,
+                                           std::string const& following_window_str,
+                                           size_type min_periods,
+                                           rolling_aggregation const& agg,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+{
+  static_assert(warp_size == cudf::detail::size_in_bits<cudf::bitmask_type>(),
+                "bitmask_type size does not match CUDA warp size");
+
+  if (input.has_nulls()) {
+    CUDF_FAIL("Currently the UDF version of rolling window does NOT support inputs with nulls.");
+  }
+
+  min_periods = std::max(min_periods, 0);
+
+  auto& udf_agg = dynamic_cast<udf_aggregation const&>(agg);
+
+  std::string hash = "prog_rolling." + std::to_string(std::hash<std::string>{}(udf_agg._source));
+
+  std::string cuda_source;
+  switch (udf_agg.kind) {
+    case aggregation::Kind::PTX:
+      cuda_source +=
+        cudf::jit::parse_single_function_ptx(udf_agg._source,
+                                             udf_agg._function_name,
+                                             {{0, cudf::type_to_name(udf_agg._output_type) + " *"},
+                                              {5, "void const *"}});  // args 0 and 5 are pointers
+      break;
+    case aggregation::Kind::CUDA:
+      cuda_source += cudf::jit::parse_single_function_cuda(udf_agg._source, udf_agg._function_name);
+      break;
+    default: CUDF_FAIL("Unsupported UDF type.");
+  }
+
+  std::unique_ptr<column> output = make_numeric_column(
+    udf_agg._output_type, input.size(), cudf::mask_state::UNINITIALIZED, stream, mr);
+
+  auto output_view = output->mutable_view();
+  cudf::detail::device_scalar<size_type> device_valid_count{0, stream};
+
+  std::string kernel_name =
+    jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new")  //
+      .instantiate(cudf::type_to_name(input.type()),  // list of template arguments
+                   cudf::type_to_name(output->type()),
+                   udf_agg._operator_name,
+                   preceding_window_str.c_str(),
+                   following_window_str.c_str());
+
+  cudf::jit::get_program_cache(*rolling_jit_kernel_cu_jit)
+    .get_kernel(
+      kernel_name, {}, {{"rolling/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
+    ->configure_1d_max_occupancy(0, 0, nullptr, stream.value())                           //
+    ->launch(input.size(),
+             cudf::jit::get_data_ptr(input),
+             input.null_mask(),
+             cudf::jit::get_data_ptr(output_view),
+             output_view.null_mask(),
+             device_valid_count.data(),
+             preceding_window,
+             following_window,
+             min_periods);
+
+  output->set_null_count(output->size() - device_valid_count.value(stream));
+
+  // check the stream for debugging
+  CUDF_CHECK_CUDA(stream.value());
+
+  return output;
+}
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/rolling/detail/rolling_utils.cu b/cpp/src/rolling/detail/rolling_utils.cu
index 00d7b8648d4..114f174f160 100644
--- a/cpp/src/rolling/detail/rolling_utils.cu
+++ b/cpp/src/rolling/detail/rolling_utils.cu
@@ -14,8 +14,8 @@
 * limitations under the License.
*/ -#include "rolling.cuh" #include "rolling.hpp" +#include "rolling_operators.cuh" #include #include diff --git a/cpp/src/rolling/detail/rolling_variable_window.cu b/cpp/src/rolling/detail/rolling_variable_window.cu index a345c0d0c28..f7729168fcf 100644 --- a/cpp/src/rolling/detail/rolling_variable_window.cu +++ b/cpp/src/rolling/detail/rolling_variable_window.cu @@ -15,16 +15,13 @@ */ #include "rolling.cuh" +#include "rolling_udf.cuh" #include #include #include #include -#include -#include -#include - namespace cudf::detail { // Applies a variable-size rolling window function to the values in a column. diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index f3e85d4d3be..1dd118d85a5 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -17,19 +17,15 @@ #include "detail/optimized_unbounded_window.hpp" #include "detail/range_window_bounds.hpp" #include "detail/rolling.cuh" -#include "detail/rolling_jit.hpp" +#include "detail/rolling_udf.cuh" #include "detail/rolling_utils.cuh" -#include "rolling/detail/rolling.hpp" +#include #include #include -#include -#include -#include #include #include #include -#include #include #include #include @@ -37,11 +33,6 @@ #include -#include - -#include -#include - namespace cudf { namespace detail { From 9f7e84de927d150da582238499457cbcb19b6b70 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 20 Aug 2025 12:05:24 -0700 Subject: [PATCH 178/366] Move test_udf_masked_ops/test_dropna to new cudf classic testing directory (#19730) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/19730 --- python/cudf/cudf/core/indexed_frame.py | 2 +- .../cudf/pandas/scripts/conftest-patch.py | 2 +- python/cudf/cudf/testing/_utils.py | 7 - python/cudf/cudf/tests/conftest.py | 55 + .../tests/dataframe/methods/test_apply.py | 772 ++++++++++++ .../tests/dataframe/methods/test_dropna.py | 157 ++- .../indexes/index/methods/test_dropna.py | 27 + .../tests/indexes/index/test_attributes.py | 39 +- .../indexes/multiindex/methods/test_dropna.py | 41 + .../indexes/rangeindex/test_attributes.py | 10 +- .../cudf/tests/series/methods/test_apply.py | 281 +++++ .../cudf/tests/series/methods/test_dropna.py | 81 ++ python/cudf/cudf/tests/test_contains.py | 41 - python/cudf/cudf/tests/test_dropna.py | 299 ----- .../cudf/tests/test_extension_compilation.py | 10 +- python/cudf/cudf/tests/test_udf_masked_ops.py | 1069 ----------------- 16 files changed, 1465 insertions(+), 1428 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_apply.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_dropna.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_dropna.py create mode 100644 python/cudf/cudf/tests/series/methods/test_apply.py create mode 100644 python/cudf/cudf/tests/series/methods/test_dropna.py delete mode 100644 python/cudf/cudf/tests/test_dropna.py delete mode 100644 python/cudf/cudf/tests/test_udf_masked_ops.py diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index f7ef0f0db12..e9c8b71fbe3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4225,7 +4225,7 @@ def dropna( name toy born 0 Alfred Batmobile 
1940-04-25 """ - if axis == 0: + if axis in [0, "index"]: result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) if ignore_index: result.index = RangeIndex(len(result)) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 9a1051ec158..b0419f844fd 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -5157,6 +5157,7 @@ def pytest_unconfigure(config): "tests/frame/methods/test_dropna.py::TestDataFrameMissingData::test_dropna", "tests/frame/methods/test_dropna.py::TestDataFrameMissingData::test_dropna_multiple_axes", "tests/frame/methods/test_dropna.py::TestDataFrameMissingData::test_no_nans_in_frame[axis=0]", + "tests/frame/methods/test_dropna.py::TestDataFrameMissingData::test_no_nans_in_frame[axis='index']", "tests/frame/methods/test_dtypes.py::TestDataFrameDataTypes::test_dtypes_timedeltas", "tests/frame/methods/test_equals.py::TestEquals::test_equals_different_blocks", "tests/frame/methods/test_explode.py::test_duplicate_index[input_dict0-input_index0-expected_dict0-expected_index0]", @@ -13144,7 +13145,6 @@ def pytest_unconfigure(config): "tests/series/methods/test_update.py::TestUpdate::test_update_dtypes[other9-int64-expected9-FutureWarning]", "tests/series/methods/test_value_counts.py::TestSeriesValueCounts::test_value_counts_categorical_with_nan", "tests/series/test_api.py::TestSeriesMisc::test_attrs", - "tests/series/test_api.py::TestSeriesMisc::test_axis_alias", "tests/series/test_api.py::TestSeriesMisc::test_index_tab_completion[index0]", "tests/series/test_api.py::TestSeriesMisc::test_index_tab_completion[index10]", "tests/series/test_api.py::TestSeriesMisc::test_index_tab_completion[index11]", diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 4c662808b9c..2987f94d1c9 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -1,7 +1,6 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations -import itertools import string from collections import abc from contextlib import contextmanager @@ -358,12 +357,6 @@ def assert_asserters_equal( cudf_asserter(cudf_left, cudf_right, *args, **kwargs) -parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( - "left_dtype,right_dtype", - list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), -) - - @contextmanager def expect_warning_if(condition, warning=FutureWarning, *args, **kwargs): """Catch a warning using pytest.warns if the expect_warning is True. diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 772abdfba32..b9c21a67c43 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,6 +1,7 @@ # Copyright (c) 2019-2025, NVIDIA CORPORATION. 
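+# The operator lists below feed parametrized fixtures, so a single test body
+# fans out into one test case per operator. A minimal sketch of the pattern
+# (names here are illustrative, not part of this module):
+#
+#     @pytest.fixture(params=[operator.add, operator.sub])
+#     def arithmetic_op(request):
+#         return request.param
+#
+#     def test_op(arithmetic_op):
+#         assert arithmetic_op(2, 1) in (3, 1)  # runs once per fixture param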
import itertools +import math import operator import os import pathlib @@ -210,6 +211,44 @@ def set_decomp_env_vars(monkeypatch, request): operator.gt, operator.ge, ] +bitwise_ops = [ + operator.and_, + operator.or_, + operator.xor, +] +unary_ops = [ + math.acos, + math.acosh, + math.asin, + math.asinh, + math.atan, + math.atanh, + math.ceil, + math.cos, + math.degrees, + math.erf, + math.erfc, + math.exp, + math.expm1, + math.fabs, + math.floor, + math.gamma, + math.lgamma, + math.log, + math.log10, + math.log1p, + math.log2, + math.radians, + math.sin, + math.sinh, + math.sqrt, + math.tan, + math.tanh, + operator.pos, + operator.neg, + operator.not_, + operator.invert, +] @pytest.fixture(params=arithmetic_ops) @@ -238,6 +277,16 @@ def comparison_op_method(comparison_op): return comparison_op.__name__ +@pytest.fixture(params=bitwise_ops) +def bitwise_op(request): + return request.param + + +@pytest.fixture(params=unary_ops) +def unary_op(request): + return request.param + + @pytest.fixture(params=arithmetic_ops + comparison_ops) def binary_op(request): return request.param @@ -576,3 +625,9 @@ def categorical_ordered(request): def interval_closed(request): """Param for `closed` argument for interval types""" return request.param + + +@pytest.fixture(params=["all", "any"]) +def dropna_how(request): + """Param for `how` argument""" + return request.param diff --git a/python/cudf/cudf/tests/dataframe/methods/test_apply.py b/python/cudf/cudf/tests/dataframe/methods/test_apply.py new file mode 100644 index 00000000000..b49f8d9dc25 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_apply.py @@ -0,0 +1,772 @@ +# Copyright (c) 2021-2025, NVIDIA CORPORATION. +import decimal +import math +import operator + +import numpy as np +import pytest +from numba import cuda +from numba.core.typing import signature as nb_signature +from numba.core.typing.templates import AbstractTemplate +from numba.cuda.cudadecl import registry as cuda_decl_registry +from numba.cuda.cudaimpl import lower as cuda_lower + +import cudf +from cudf.core.missing import NA +from cudf.core.udf._ops import ( + comparison_ops, +) +from cudf.core.udf.strings_lowering import ( + cast_string_view_to_managed_udf_string, +) +from cudf.core.udf.strings_typing import ( + StringView, + managed_udf_string, + string_view, +) +from cudf.testing import assert_eq + + +def sv_to_managed_udf_str(sv): + """ + Cast a string_view object to a managed_udf_string object + + This placeholder function never runs in python + It exists only for numba to have something to replace + with the typing and lowering code below + + This is similar conceptually to needing a translation + engine to emit an expression in target language "B" when + there is no equivalent in the source language "A" to + translate from. 
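+    Concretely, language "A" here is the Python source that numba types,
+    and language "B" is the lowered device code; the typing template and
+    lowering registered below perform that translation.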
This function effectively defines the + expression in language "A" and the associated typing + and lowering describe the translation process, despite + the expression having no meaning in language "A" + """ + pass + + +@cuda_decl_registry.register_global(sv_to_managed_udf_str) +class StringViewToUDFStringDecl(AbstractTemplate): + def generic(args, kws): + if isinstance(args[0], StringView) and len(args) == 1: + return nb_signature(managed_udf_string, string_view) + + +@cuda_lower(sv_to_managed_udf_str, string_view) +def sv_to_udf_str_testing_lowering(context, builder, sig, args): + return cast_string_view_to_managed_udf_string( + context, builder, sig.args[0], sig.return_type, args[0] + ) + + +def run_masked_udf_test(func, data, args=(), nullable=True, **kwargs): + gdf = data + pdf = data.to_pandas(nullable=nullable) + + expect = pdf.apply(func, args=args, axis=1) + obtain = gdf.apply(func, args=args, axis=1) + assert_eq(expect, obtain, **kwargs) + + +@pytest.fixture +def str_udf_data(): + return cudf.DataFrame( + { + "str_col": [ + "abc", + "ABC", + "AbC", + "123", + "123aBc", + "123@.!", + "", + "rapids ai", + "gpu", + "True", + "False", + "1.234", + ".123a", + "0.013", + "1.0", + "01", + "20010101", + "cudf", + "cuda", + "gpu", + "This Is A Title", + "This is Not a Title", + "Neither is This a Title", + "NoT a TiTlE", + "123 Title Works", + ] + } + ) + + +@pytest.fixture(params=["a", "2", "gpu", "", " "]) +def substr(request): + return request.param + + +def test_string_udf_len(str_udf_data): + def func(row): + return len(row["str_col"]) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_startswith(str_udf_data, substr): + def func(row): + return row["str_col"].startswith(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_endswith(str_udf_data, substr): + def func(row): + return row["str_col"].endswith(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_find(str_udf_data, substr): + def func(row): + return row["str_col"].find(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_rfind(str_udf_data, substr): + def func(row): + return row["str_col"].rfind(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_contains(str_udf_data, substr): + def func(row): + return substr in row["str_col"] + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("other", ["cudf", "123", "", " "]) +@pytest.mark.parametrize("cmpop", comparison_ops) +def test_string_udf_cmpops(str_udf_data, other, cmpop): + def func(row): + return cmpop(row["str_col"], other) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isalnum(str_udf_data): + def func(row): + return row["str_col"].isalnum() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isalpha(str_udf_data): + def func(row): + return row["str_col"].isalpha() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isdigit(str_udf_data): + def func(row): + return row["str_col"].isdigit() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isdecimal(str_udf_data): + def func(row): + return row["str_col"].isdecimal() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isupper(str_udf_data): + def func(row): + return row["str_col"].isupper() + + 
run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_islower(str_udf_data): + def func(row): + return row["str_col"].islower() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isspace(str_udf_data): + def func(row): + return row["str_col"].isspace() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_istitle(str_udf_data): + def func(row): + return row["str_col"].istitle() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_count(str_udf_data, substr): + def func(row): + return row["str_col"].count(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.xfail(reason="Identity function not supported.") +def test_string_udf_return_string(str_udf_data): + def func(row): + return row["str_col"] + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_strip(str_udf_data, strip_char): + def func(row): + return row["str_col"].strip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_lstrip(str_udf_data, strip_char): + def func(row): + return row["str_col"].lstrip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_rstrip(str_udf_data, strip_char): + def func(row): + return row["str_col"].rstrip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_upper(str_udf_data): + def func(row): + return row["str_col"].upper() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_lower(str_udf_data): + def func(row): + return row["str_col"].lower() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_concat(str_udf_data, concat_char): + def func(row): + return row["str_col"] + concat_char + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("to_replace", ["a", "1", "", "@"]) +@pytest.mark.parametrize("replacement", ["a", "1", "", "@"]) +def test_string_udf_replace(str_udf_data, to_replace, replacement): + def func(row): + return row["str_col"].replace(to_replace, replacement) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_arith_masked_vs_masked(arithmetic_op): + # This test should test all the typing + # and lowering for arithmetic ops between + # two columns + def func(row): + x = row["a"] + y = row["b"] + return arithmetic_op(x, y) + + gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_bitwise_masked_vs_masked(bitwise_op): + # This test should test all the typing + # and lowering for bitwise ops between + # two columns + def func(row): + x = row["a"] + y = row["b"] + return bitwise_op(x, y) + + gdf = cudf.DataFrame( + { + "a": [1, 0, 1, 0, 0b1011, 42, None], + "b": [1, 1, 0, 0, 0b1100, -42, 5], + } + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("op", [operator.add, operator.sub]) +def test_arith_masked_vs_masked_datelike( + op, datetime_types_as_str, temporal_types_as_str +): + # Datetime version of 
the above + # does not test all dtype combinations for now + if temporal_types_as_str.startswith("datetime") and op is operator.add: + # don't try adding datetimes to datetimes. + pytest.skip("Adding datetime to datetime is not valid") + + def func(row): + x = row["a"] + y = row["b"] + return op(x, y) + + gdf = cudf.DataFrame( + { + "a": ["2011-01-01", cudf.NA, "2011-03-01", cudf.NA], + "b": [4, 5, cudf.NA, cudf.NA], + } + ) + gdf["a"] = gdf["a"].astype(datetime_types_as_str) + gdf["b"] = gdf["b"].astype(temporal_types_as_str) + + pdf = gdf.to_pandas() + expect = op(pdf["a"], pdf["b"]) + obtain = gdf.apply(func, axis=1) + assert_eq(expect, obtain, check_dtype=False) + # TODO: After the following pandas issue is + # fixed, uncomment the following line and delete + # through `to_pandas()` statement. + # https://github.com/pandas-dev/pandas/issues/52411 + + # run_masked_udf_test(func, gdf, nullable=False, check_dtype=False) + + +def test_compare_masked_vs_masked(comparison_op): + # this test should test all the + # typing and lowering for comparisons + # between columns + + def func(row): + x = row["a"] + y = row["b"] + return comparison_op(x, y) + + # we should get: + # [?, ?, , , ] + gdf = cudf.DataFrame( + {"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]} + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("constant", [1, 1.5, True, False]) +def test_arith_masked_vs_constant(arithmetic_op, constant): + if constant is False and arithmetic_op in { + operator.mod, + operator.pow, + operator.truediv, + operator.floordiv, + operator.imod, + operator.ipow, + operator.itruediv, + operator.ifloordiv, + }: + # The following tests cases yield undefined behavior: + # - truediv(x, False) because its dividing by zero + # - floordiv(x, False) because its dividing by zero + # - mod(x, False) because its mod by zero, + # - pow(x, False) because we have an NA in the series and pandas + # insists that (NA**0 == 1) where we do not + pytest.skip( + f"{constant=} yields undefined behavior for {arithmetic_op=}" + ) + + def func(row): + x = row["data"] + return arithmetic_op(x, constant) + + gdf = cudf.DataFrame({"data": [1, 2, cudf.NA]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("constant", [1, 1.5, True, False]) +@pytest.mark.parametrize("data", [[2, 3, cudf.NA], [1, cudf.NA, 1]]) +def test_arith_masked_vs_constant_reflected( + request, arithmetic_op, constant, data +): + def func(row): + x = row["data"] + return arithmetic_op(constant, x) + + # Just a single column -> result will be all NA + gdf = cudf.DataFrame({"data": data}) + + # cudf differs from pandas for 1**NA + request.applymarker( + pytest.mark.xfail( + condition=( + constant == 1 + and arithmetic_op in {operator.pow, operator.ipow} + ), + reason="https://github.com/rapidsai/cudf/issues/7478", + ) + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("data", [[1, cudf.NA, 3], [2, 3, cudf.NA]]) +def test_arith_masked_vs_null(request, arithmetic_op, data): + def func(row): + x = row["data"] + return arithmetic_op(x, NA) + + gdf = cudf.DataFrame({"data": data}) + + # In pandas, 1**NA == 1. 
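+    # For reference (an illustration of the divergence noted in the xfail
+    # below, not executed here): pandas evaluates 1 ** pd.NA to 1, while
+    # cudf evaluates 1 ** cudf.NA to <NA>, so pow/ipow rows containing 1
+    # diverge between the two libraries.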
+ request.applymarker( + pytest.mark.xfail( + condition=( + (gdf["data"] == 1).any() + and arithmetic_op in {operator.pow, operator.ipow} + ), + reason="https://github.com/rapidsai/cudf/issues/7478", + ) + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_arith_masked_vs_null_reflected(arithmetic_op): + def func(row): + x = row["data"] + return arithmetic_op(NA, x) + + gdf = cudf.DataFrame({"data": [1, None, 3]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_unary_masked(unary_op): + # This test should test all the typing + # and lowering for unary ops + + def func(row): + x = row["a"] + return unary_op(x) if x is not NA else NA + + if "log" in unary_op.__name__: + gdf = cudf.DataFrame({"a": [0.1, 1.0, None, 3.5, 1e8]}) + elif unary_op.__name__ in {"asin", "acos"}: + gdf = cudf.DataFrame({"a": [0.0, 0.5, None, 1.0]}) + elif unary_op.__name__ in {"atanh"}: + gdf = cudf.DataFrame({"a": [0.0, -0.5, None, 0.8]}) + elif unary_op.__name__ in {"acosh", "sqrt", "lgamma"}: + gdf = cudf.DataFrame({"a": [1.0, 2.0, None, 11.0]}) + elif unary_op.__name__ in {"gamma"}: + gdf = cudf.DataFrame({"a": [0.1, 2, None, 4]}) + elif unary_op.__name__ in {"invert"}: + gdf = cudf.DataFrame({"a": [-100, 128, None, 0]}, dtype="int64") + else: + gdf = cudf.DataFrame({"a": [-125.60, 395.2, 0.0, None]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_masked_is_null_conditional(): + def func(row): + x = row["a"] + y = row["b"] + if x is NA: + return y + else: + return x + y + + gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_contains(): + def func(row): + x = row["a"] + return x in [1, 2] + + gdf = cudf.DataFrame({"a": [1, 3]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) +def test_apply_mixed_dtypes(numeric_types_as_str, numeric_types_as_str2, op): + """ + Test that operations can be performed between columns + of different dtypes and return a column with the correct + values and nulls + """ + + # First perform the op on two dummy data on host, if numpy can + # safely type cast, we should expect it to work in udf too. 
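+    # For example, op(np.int8(0), np.float64(42)) promotes cleanly under
+    # numpy's casting rules, while operator.and_(np.float64(0),
+    # np.float64(42)) raises TypeError, so that dtype/op combination is
+    # skipped below.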
+ try: + op( + np.dtype(numeric_types_as_str).type(0), + np.dtype(numeric_types_as_str2).type(42), + ) + except TypeError: + pytest.skip("Operation is unsupported for corresponding dtype.") + + def func(row): + x = row["a"] + y = row["b"] + return op(x, y) + + gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) + gdf["a"] = gdf["a"].astype(numeric_types_as_str) + gdf["b"] = gdf["b"].astype(numeric_types_as_str2) + + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("val", [5, 5.5]) +def test_apply_return_literal(val): + """ + Test unification codepath for scalars and MaskedType + makes sure that numba knows how to cast a scalar value + to a MaskedType + """ + + def func(row): + x = row["a"] + y = row["b"] + if x is not NA and x < 2: + return val + else: + return x + y + + gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) + + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_return_null(): + """ + Tests casting / unification of Masked and NA + """ + + def func(row): + x = row["a"] + if x is NA: + return NA + else: + return x + + gdf = cudf.DataFrame({"a": [1, None, 3]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_return_either_null_or_literal(): + def func(row): + x = row["a"] + if x > 5: + return 2 + else: + return NA + + gdf = cudf.DataFrame({"a": [1, 3, 6]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_return_literal_only(): + def func(x): + return 5 + + gdf = cudf.DataFrame({"a": [1, None, 3]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_everything(): + def func(row): + w = row["a"] + x = row["b"] + y = row["c"] + z = row["d"] + if x is NA: + return w + y - z + elif ((z > y) is not NA) and z > y: + return x + elif ((x + y) is not NA) and x + y == 0: + return z / x + elif x + y is NA: + return 2.5 + elif w > 100: + return ( + math.sin(x) + + math.sqrt(y) + - (-z) + + math.lgamma(x) * math.fabs(-0.8) / math.radians(3.14) + ) + else: + return y > 2 + + gdf = cudf.DataFrame( + { + "a": [1, 3, 6, 0, None, 5, None, 101], + "b": [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0, 1.0], + "c": [2, 3, 6, 0, None, 5, None, 6], + "d": [4, None, 6, 0, None, 5, None, 7.5], + } + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +### + + +### + + +def test_masked_udf_lambda_support(binary_op): + func = lambda row: binary_op(row["a"], row["b"]) # noqa: E731 + + data = cudf.DataFrame( + {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} + ) + + run_masked_udf_test(func, data, check_dtype=False) + + +def test_masked_udf_nested_function_support(binary_op): + """ + Nested functions need to be explicitly jitted by the user + for numba to recognize them. Unfortunately the object + representing the jitted function can not itself be used in + pandas udfs. 
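+
+    The workaround exercised below is to compile the helper as a device
+    function first, e.g. ``inner_gpu = cuda.jit(device=True)(inner)``, and
+    call that from the row UDF passed to ``apply``.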
+ """ + + def inner(x, y): + return binary_op(x, y) + + def outer(row): + x = row["a"] + y = row["b"] + return inner(x, y) + + gdf = cudf.DataFrame( + {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} + ) + + with pytest.raises(ValueError): + gdf.apply(outer, axis=1) + + pdf = gdf.to_pandas(nullable=True) + inner_gpu = cuda.jit(device=True)(inner) + + def outer_gpu(row): + x = row["a"] + y = row["b"] + return inner_gpu(x, y) + + got = gdf.apply(outer_gpu, axis=1) + expect = pdf.apply(outer, axis=1) + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + {"a": [1, 2, 3], "c": [4, 5, 6], "b": [7, 8, 9]}, + {"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]}, + ], +) +def test_masked_udf_subset_selection(data): + def func(row): + return row["a"] + row["b"] + + data = cudf.DataFrame(data) + run_masked_udf_test(func, data) + + +@pytest.mark.parametrize( + "unsupported_col", + [ + lambda: cudf.Series( + [ + decimal.Decimal("1.0"), + decimal.Decimal("2.0"), + decimal.Decimal("3.0"), + ], + dtype=cudf.Decimal64Dtype(2, 1), + ), + lambda: cudf.Series([1, 2, 3], dtype="category"), + lambda: cudf.interval_range(start=0, end=3), + lambda: [[1, 2], [3, 4], [5, 6]], + lambda: [{"a": 1}, {"a": 2}, {"a": 3}], + ], +) +def test_masked_udf_unsupported_dtype(unsupported_col): + data = cudf.DataFrame({"unsupported_col": unsupported_col()}) + + def func(row): + return row["unsupported_col"] + + # check that we fail when an unsupported type is used within a function + with pytest.raises(ValueError): + data.apply(func, axis=1) + + # also check that a DF containing unsupported dtypes can still run a + # function that does NOT involve any of the unsupported dtype columns + data["supported_col"] = 1 + + def other_func(row): + return row["supported_col"] + + expect = cudf.Series(np.ones(len(data))) + got = data.apply(other_func, axis=1) + + assert_eq(expect, got, check_dtype=False) + + +# tests for `DataFrame.apply(f, args=(x,y,z))` +# testing the whole space of possibilities is intractable +# these test the most rudimentary guaranteed functionality +@pytest.mark.parametrize( + "data", + [ + {"a": [1, cudf.NA, 3]}, + {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, + {"a": [True, False, cudf.NA]}, + ], +) +def test_masked_udf_scalar_args_binops(data, binary_op): + data = cudf.DataFrame(data) + + def func(row, c): + return binary_op(row["a"], c) + + run_masked_udf_test(func, data, args=(1,), check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, cudf.NA, 3]}, + {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, + {"a": [True, False, cudf.NA]}, + ], +) +def test_masked_udf_scalar_args_binops_multiple(data, binary_op): + data = cudf.DataFrame(data) + + def func(row, c, k): + x = binary_op(row["a"], c) + y = binary_op(x, k) + return y + + run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_dropna.py b/python/cudf/cudf/tests/dataframe/methods/test_dropna.py index ec27503a0ef..f3c2f5a236a 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_dropna.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_dropna.py @@ -1,8 +1,9 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
- import numpy as np import pandas as pd +import pyarrow as pa +import pytest import cudf from cudf.testing import assert_eq @@ -32,3 +33,157 @@ def test_datetime_dataframe(): assert_eq(ps.dropna(), gs.dropna()) assert_eq(ps.isnull(), gs.isnull()) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, None]}, + {"a": [1, 2, None], "b": [3, 4, 5]}, + {"a": [1, 2, None], "b": [3, 4, None]}, + {"a": [None, 1, 2], "b": [1, 2, None]}, + {"a": [None, 1, None], "b": [None, 2, None]}, + {"a": [None, None, 1], "b": [1, 2, None]}, + {"a": ["d", "e", "f"], "b": ["a", None, "c"]}, + ], +) +def test_dropna_dataframe(data, dropna_how, axis, inplace): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + expected = pdf.dropna(axis=axis, how=dropna_how, inplace=inplace) + actual = gdf.dropna(axis=axis, how=dropna_how, inplace=inplace) + + if inplace: + expected = pdf + actual = gdf + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + { + "a": pa.array([None, None, None], type=pa.float64()), + "b": [1, 2, None], + }, + { + "a": pa.array([np.nan, np.nan, np.nan]), + "b": [1, 2, None], + }, + {"a": pa.array([None, None, None], type=pa.string())}, + ], +) +def test_dropna_with_all_nulls(dropna_how, data, axis): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + assert_eq( + pdf.dropna(axis=axis, how=dropna_how), + gdf.dropna(axis=axis, how=dropna_how), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "data,subset", + [ + ({"a": [1, None], "b": [1, 2]}, ["a"]), + ({"a": [1, None], "b": [1, 2]}, ["b"]), + ({"a": [1, None], "b": [1, 2]}, []), + ({"a": [1, 2], "b": [1, 2]}, ["b"]), + ({"a": [1, 2, None], "b": [1, None, 2]}, ["a"]), + ({"a": [1, 2, None], "b": [1, None, 2]}, ["b"]), + ({"a": [1, 2, None], "b": [1, None, 2]}, ["a", "b"]), + ], +) +def test_dropna_subset_rows(data, subset): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + assert_eq(pdf.dropna(subset=subset), gdf.dropna(subset=subset)) + + +@pytest.mark.parametrize( + "data, subset", + [ + ({"a": [1, None], "b": [1, 2]}, [0]), + ({"a": [1, None], "b": [1, 2]}, [1]), + ({"a": [1, None], "b": [1, 2]}, []), + ({"a": [1, 2], "b": [1, 2]}, [0]), + ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0]), + ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [1]), + ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0, 1]), + ], +) +def test_dropna_subset_cols(data, subset): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.dropna(axis=1, subset=subset), gdf.dropna(axis=1, subset=subset) + ) + + +# TODO: can't test with subset=[] below since Pandas +# returns empty DF when both subset=[] and thresh are specified. 
+@pytest.mark.parametrize("thresh", [0, 1, 2]) +@pytest.mark.parametrize("subset", [None, ["a"], ["b"], ["a", "b"]]) +def test_dropna_thresh(thresh, subset): + pdf = pd.DataFrame({"a": [1, 2, None, None], "b": [1, 2, 3, None]}) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.dropna(axis=0, thresh=thresh, subset=subset), + gdf.dropna(axis=0, thresh=thresh, subset=subset), + ) + + +@pytest.mark.parametrize("thresh", [0, 1, 2]) +@pytest.mark.parametrize("subset", [None, [0], [1], [0, 1]]) +def test_dropna_thresh_cols(thresh, subset, inplace): + pdf = pd.DataFrame( + {"a": [1, 2], "b": [3, 4], "c": [5, None], "d": [np.nan, np.nan]} + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.dropna( + axis=1, thresh=thresh, subset=subset, inplace=inplace + ) + actual = gdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) + + if inplace: + expected = pdf + actual = gdf + + assert_eq( + expected, + actual, + ) + + +@pytest.mark.parametrize( + "data", + [ + { + "key": [1, 2, 10], + "val": pa.array([np.nan, 3.0, 1.0]), + "abc": [np.nan, None, 1], + }, + { + "key": [None, 2, 1], + "val": pa.array([3.0, None, 0.1]), + "abc": [None, 1, None], + }, + ], +) +def test_dropna_dataframe_np_nan(data, axis): + gdf = cudf.DataFrame(data) + pd_data = { + key: value.to_pandas() if isinstance(value, cudf.Series) else value + for key, value in data.items() + } + pdf = pd.DataFrame(pd_data) + + assert_eq(pdf.dropna(axis=axis), gdf.dropna(axis=axis), check_dtype=False) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py b/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py new file mode 100644 index 00000000000..b572e9e156d --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data, dtype", + [ + ([1, float("nan"), 2], "float64"), + (["x", None, "y"], "str"), + (["x", None, "y"], "category"), + (["2020-01-20", pd.NaT, "2020-03-15"], "datetime64[ns]"), + (["1s", pd.NaT, "3d"], "timedelta64[ns]"), + ], +) +def test_dropna_index(data, dtype): + pi = pd.Index(data, dtype=dtype) + gi = cudf.from_pandas(pi) + + expect = pi.dropna() + got = gi.dropna() + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/indexes/index/test_attributes.py b/python/cudf/cudf/tests/indexes/index/test_attributes.py index 2e80dfb272e..ee4a1654a10 100644 --- a/python/cudf/cudf/tests/indexes/index/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/index/test_attributes.py @@ -1,10 +1,45 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
+import datetime import numpy as np import pandas as pd import pytest -from cudf import Index +import cudf + + +@pytest.mark.parametrize( + "values, item, expected", + [ + [[1, 2, 3], 2, True], + [[1, 2, 3], 4, False], + [[1, 2, 3], "a", False], + [["a", "b", "c"], "a", True], + [["a", "b", "c"], "ab", False], + [["a", "b", "c"], 6, False], + [pd.Categorical(["a", "b", "c"]), "a", True], + [pd.Categorical(["a", "b", "c"]), "ab", False], + [pd.Categorical(["a", "b", "c"]), 6, False], + [pd.date_range("20010101", periods=5, freq="D"), 20000101, False], + [ + pd.date_range("20010101", periods=5, freq="D"), + datetime.datetime(2000, 1, 1), + False, + ], + [ + pd.date_range("20010101", periods=5, freq="D"), + datetime.datetime(2001, 1, 1), + True, + ], + ], +) +@pytest.mark.parametrize( + "box", + [cudf.Index, lambda x: cudf.Series(index=x)], + ids=["index", "series"], +) +def test_contains(values, item, expected, box): + assert (item in box(values)) is expected @pytest.mark.parametrize( @@ -26,7 +61,7 @@ ], ) def test_index_is_unique_monotonic(testlist): - index = Index(testlist) + index = cudf.Index(testlist) index_pd = pd.Index(testlist) assert index.is_unique == index_pd.is_unique diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_dropna.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_dropna.py new file mode 100644 index 00000000000..049dc847e18 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_dropna.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_dropna_multiindex(dropna_how): + pi = pd.MultiIndex.from_arrays([[1, None, 2], [None, None, 2]]) + gi = cudf.from_pandas(pi) + + expect = pi.dropna(dropna_how) + got = gi.dropna(dropna_how) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], + [pd.NaT, pd.NaT, pd.Timestamp("2020-03-01")], + ], + [ + [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], + [np.nan, np.nan, 1.0], + ], + [[1.0, np.nan, 2.0], [np.nan, np.nan, 1.0]], + ], +) +def test_dropna_multiindex_2(data, dropna_how): + pi = pd.MultiIndex.from_arrays(data) + gi = cudf.from_pandas(pi) + + expect = pi.dropna(dropna_how) + got = gi.dropna(dropna_how) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py b/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py index a9de5d39622..60ee8b432e6 100644 --- a/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py @@ -3,14 +3,20 @@ import pandas as pd import pytest -from cudf.core.index import RangeIndex +import cudf + + +def test_rangeindex_contains(): + ridx = cudf.RangeIndex(start=0, stop=10, name="Index") + assert 9 in ridx + assert 10 not in ridx @pytest.mark.parametrize( "start, stop, step", [(10, 20, 1), (0, -10, -1), (5, 5, 1)] ) def test_range_index_is_unique_monotonic(start, stop, step): - index = RangeIndex(start=start, stop=stop, step=step) + index = cudf.RangeIndex(start=start, stop=stop, step=step) index_pd = pd.RangeIndex(start=start, stop=stop, step=step) assert index.is_unique == index_pd.is_unique diff --git a/python/cudf/cudf/tests/series/methods/test_apply.py b/python/cudf/cudf/tests/series/methods/test_apply.py new file mode 100644 index 00000000000..8bdb46e02a2 --- /dev/null +++ 
b/python/cudf/cudf/tests/series/methods/test_apply.py @@ -0,0 +1,281 @@ +# Copyright (c) 2021-2025, NVIDIA CORPORATION. +import operator + +import numpy as np +import pytest + +import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core.udf.utils import precompiled +from cudf.testing import assert_eq + + +def run_masked_udf_series(func, data, args=(), **kwargs): + gsr = data + psr = data.to_pandas(nullable=True) + + expect = psr.apply(func, args=args) + obtain = gsr.apply(func, args=args) + assert_eq(expect, obtain, **kwargs) + + +@pytest.mark.parametrize( + "data", + [ + np.array( + [0, 1, -1, 0, np.iinfo("int64").min, np.iinfo("int64").max], + dtype="int64", + ), + np.array([0, 0, 1, np.iinfo("uint64").max], dtype="uint64"), + np.array( + [ + 0, + 0.0, + -1.0, + 1.5, + -1.5, + np.finfo("float64").min, + np.finfo("float64").max, + np.nan, + np.inf, + -np.inf, + ], + dtype="float64", + ), + [False, True, False, cudf.NA], + ], +) +def test_masked_udf_abs(data): + data = cudf.Series(data) + data[0] = cudf.NA + + def func(x): + return abs(x) + + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.parametrize( + "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] +) +@pytest.mark.parametrize("operator", [float, int, bool]) +def test_masked_udf_casting(operator, data): + data = cudf.Series(data) + + def func(x): + return operator(x) + + run_masked_udf_series(func, data, check_dtype=False) + + +def test_masked_udf_caching(): + # Make sure similar functions that differ + # by simple things like constants actually + # recompile + + data = cudf.Series([1, 2, 3]) + + expect = data**2 + got = data.apply(lambda x: x**2) + assert_eq(expect, got, check_dtype=False) + + # update the constant value being used and make sure + # it does not result in a cache hit + + expect = data**3 + got = data.apply(lambda x: x**3) + assert_eq(expect, got, check_dtype=False) + + # make sure we get a hit when reapplying + def f(x): + return x + 1 + + precompiled.clear() + assert precompiled.currsize == 0 + data.apply(f) + + assert precompiled.currsize == 1 + data.apply(f) + + assert precompiled.currsize == 1 + + # validate that changing the type of a scalar arg + # results in a miss + precompiled.clear() + + def f(x, c): + return x + c + + data.apply(f, args=(1,)) + assert precompiled.currsize == 1 + + data.apply(f, args=(1.5,)) + assert precompiled.currsize == 2 + + +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +def test_masked_udf_scalar_args_binops_multiple_series( + request, data, binary_op +): + data = cudf.Series(data) + request.applymarker( + pytest.mark.xfail( + binary_op + in [ + operator.eq, + operator.ne, + operator.lt, + operator.le, + operator.gt, + operator.ge, + ] + and PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION + and data.dtype.kind != "b", + reason="https://github.com/pandas-dev/pandas/issues/57390", + ) + ) + + def func(data, c, k): + x = binary_op(data, c) + y = binary_op(x, k) + return y + + run_masked_udf_series(func, data, args=(1, 2), check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +def test_mask_udf_scalar_args_binops_series(data): + data = cudf.Series(data) + + def func(x, c): + return x + c + + run_masked_udf_series(func, data, args=(1,), check_dtype=False) + + +@pytest.mark.parametrize( + "data,name", + [([1, 2, 3], None), ([1, 
cudf.NA, 3], None), ([1, 2, 3], "test_name")], +) +def test_series_apply_basic(data, name): + data = cudf.Series(data, name=name) + + def func(x): + return x + 1 + + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.xfail( + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/57390", +) +def test_series_apply_null_conditional(): + def func(x): + if x is cudf.NA: + return 42 + else: + return x - 1 + + data = cudf.Series([1, cudf.NA, 3]) + + run_masked_udf_series(func, data) + + +def test_series_arith_masked_vs_masked(arithmetic_op): + def func(x): + return arithmetic_op(x, x) + + data = cudf.Series([1, cudf.NA, 3]) + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.xfail( + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/57390", +) +def test_series_compare_masked_vs_masked(comparison_op): + """ + In the series case, only one other MaskedType to compare with + - itself + """ + + def func(x): + return comparison_op(x, x) + + data = cudf.Series([1, cudf.NA, 3]) + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) +def test_series_arith_masked_vs_constant(request, arithmetic_op, constant): + def func(x): + return arithmetic_op(x, constant) + + # Just a single column -> result will be all NA + data = cudf.Series([1, 2, cudf.NA]) + # in pandas, 1**NA == 1. In cudf, 1**NA == NA. + request.applymarker( + pytest.mark.xfail( + condition=( + constant is cudf.NA + and arithmetic_op in {operator.pow, operator.ipow} + ), + reason="https://github.com/rapidsai/cudf/issues/7478", + ) + ) + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) +def test_series_arith_masked_vs_constant_reflected( + request, arithmetic_op, constant +): + def func(x): + return arithmetic_op(constant, x) + + # Just a single column -> result will be all NA + data = cudf.Series([1, 2, cudf.NA]) + # Using in {1} since bool(NA == 1) raises a TypeError since NA is + # neither truthy nor falsy + # in pandas, 1**NA == 1. In cudf, 1**NA == NA. + request.applymarker( + pytest.mark.xfail( + condition=( + constant in {1} + and arithmetic_op in {operator.pow, operator.ipow} + ), + reason="https://github.com/rapidsai/cudf/issues/7478", + ) + ) + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.xfail( + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/57390", +) +def test_series_masked_is_null_conditional(): + def func(x): + if x is cudf.NA: + return 42 + else: + return x + + data = cudf.Series([1, cudf.NA, 3, cudf.NA]) + + run_masked_udf_series(func, data, check_dtype=False) diff --git a/python/cudf/cudf/tests/series/methods/test_dropna.py b/python/cudf/cudf/tests/series/methods/test_dropna.py new file mode 100644 index 00000000000..dafcbf5bbfe --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_dropna.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [], + [1.0, 2, None, 4], + ["one", "two", "three", "four"], + pd.Series(["a", "b", "c", "d"], dtype="category"), + pd.Series(pd.date_range("2010-01-01", "2010-01-04")), + ], +) +@pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) +def test_dropna_series(data, nulls, inplace): + psr = pd.Series(data) + rng = np.random.default_rng(seed=0) + if len(data) > 0: + if nulls == "one": + p = rng.integers(0, 4) + psr[p] = None + elif nulls == "some": + p1, p2 = rng.integers(0, 4, (2,)) + psr[p1] = None + psr[p2] = None + elif nulls == "all": + psr[:] = None + + gsr = cudf.from_pandas(psr) + + check_dtype = True + if gsr.null_count == len(gsr): + check_dtype = False + + expected = psr.dropna() + actual = gsr.dropna() + + if inplace: + expected = psr + actual = gsr + + assert_eq(expected, actual, check_dtype=check_dtype) + + +def test_dropna_nan_as_null(): + sr = cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False) + assert_eq(sr.dropna(), sr[:2]) + sr = sr.nans_to_nulls() + assert_eq(sr.dropna(), sr[:2]) + + df = cudf.DataFrame( + { + "a": cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False), + "b": cudf.Series([1, 2, 3, 4]), + } + ) + + got = df.dropna() + expected = df[:2] + assert_eq(expected, got) + + df = df.nans_to_nulls() + got = df.dropna() + expected = df[:2] + assert_eq(expected, got) + + +def test_ignore_index(): + pser = pd.Series([1, 2, np.nan], index=[2, 4, 1]) + gser = cudf.from_pandas(pser) + + result = pser.dropna(ignore_index=True) + expected = gser.dropna(ignore_index=True) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 2c6bc0e8a00..7db558335fe 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,55 +1,14 @@ # Copyright (c) 2019-2025, NVIDIA CORPORATION. 
-import datetime import numpy as np -import pandas as pd import pytest import cudf from cudf import Series -from cudf.core.index import Index, RangeIndex from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES -@pytest.mark.parametrize( - "values, item, expected", - [ - [[1, 2, 3], 2, True], - [[1, 2, 3], 4, False], - [[1, 2, 3], "a", False], - [["a", "b", "c"], "a", True], - [["a", "b", "c"], "ab", False], - [["a", "b", "c"], 6, False], - [pd.Categorical(["a", "b", "c"]), "a", True], - [pd.Categorical(["a", "b", "c"]), "ab", False], - [pd.Categorical(["a", "b", "c"]), 6, False], - [pd.date_range("20010101", periods=5, freq="D"), 20000101, False], - [ - pd.date_range("20010101", periods=5, freq="D"), - datetime.datetime(2000, 1, 1), - False, - ], - [ - pd.date_range("20010101", periods=5, freq="D"), - datetime.datetime(2001, 1, 1), - True, - ], - ], -) -@pytest.mark.parametrize( - "box", [Index, lambda x: Series(index=x)], ids=["index", "series"] -) -def test_contains(values, item, expected, box): - assert (item in box(values)) is expected - - -def test_rangeindex_contains(): - ridx = RangeIndex(start=0, stop=10, name="Index") - assert 9 in ridx - assert 10 not in ridx - - @pytest.mark.parametrize("dtype", NUMERIC_TYPES) def test_lists_contains(dtype): dtype = cudf.dtype(dtype) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py deleted file mode 100644 index 1f927d03e95..00000000000 --- a/python/cudf/cudf/tests/test_dropna.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.mark.parametrize( - "data", - [ - [], - [1.0, 2, None, 4], - ["one", "two", "three", "four"], - pd.Series(["a", "b", "c", "d"], dtype="category"), - pd.Series(pd.date_range("2010-01-01", "2010-01-04")), - ], -) -@pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dropna_series(data, nulls, inplace): - psr = pd.Series(data) - rng = np.random.default_rng(seed=0) - if len(data) > 0: - if nulls == "one": - p = rng.integers(0, 4) - psr[p] = None - elif nulls == "some": - p1, p2 = rng.integers(0, 4, (2,)) - psr[p1] = None - psr[p2] = None - elif nulls == "all": - psr[:] = None - - gsr = cudf.from_pandas(psr) - - check_dtype = True - if gsr.null_count == len(gsr): - check_dtype = False - - expected = psr.dropna() - actual = gsr.dropna() - - if inplace: - expected = psr - actual = gsr - - assert_eq(expected, actual, check_dtype=check_dtype) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, None]}, - {"a": [1, 2, None], "b": [3, 4, 5]}, - {"a": [1, 2, None], "b": [3, 4, None]}, - {"a": [None, 1, 2], "b": [1, 2, None]}, - {"a": [None, 1, None], "b": [None, 2, None]}, - {"a": [None, None, 1], "b": [1, 2, None]}, - {"a": ["d", "e", "f"], "b": ["a", None, "c"]}, - ], -) -@pytest.mark.parametrize("how", ["all", "any"]) -@pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dropna_dataframe(data, how, axis, inplace): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - expected = pdf.dropna(axis=axis, how=how, inplace=inplace) - actual = gdf.dropna(axis=axis, how=how, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("how", ["all", "any"]) -@pytest.mark.parametrize( - "data", - [ - { - 
"a": pa.array([None, None, None], type=pa.float64()), - "b": [1, 2, None], - }, - { - "a": pa.array([np.nan, np.nan, np.nan]), - "b": [1, 2, None], - }, - {"a": pa.array([None, None, None], type=pa.string())}, - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -def test_dropna_with_all_nulls(how, data, axis): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - assert_eq( - pdf.dropna(axis=axis, how=how), - gdf.dropna(axis=axis, how=how), - check_dtype=False, - ) - - -def test_dropna_nan_as_null(): - sr = cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False) - assert_eq(sr.dropna(), sr[:2]) - sr = sr.nans_to_nulls() - assert_eq(sr.dropna(), sr[:2]) - - df = cudf.DataFrame( - { - "a": cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False), - "b": cudf.Series([1, 2, 3, 4]), - } - ) - - got = df.dropna() - expected = df[:2] - assert_eq(expected, got) - - df = df.nans_to_nulls() - got = df.dropna() - expected = df[:2] - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data,subset", - [ - ({"a": [1, None], "b": [1, 2]}, ["a"]), - ({"a": [1, None], "b": [1, 2]}, ["b"]), - ({"a": [1, None], "b": [1, 2]}, []), - ({"a": [1, 2], "b": [1, 2]}, ["b"]), - ({"a": [1, 2, None], "b": [1, None, 2]}, ["a"]), - ({"a": [1, 2, None], "b": [1, None, 2]}, ["b"]), - ({"a": [1, 2, None], "b": [1, None, 2]}, ["a", "b"]), - ], -) -def test_dropna_subset_rows(data, subset): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf.dropna(subset=subset), gdf.dropna(subset=subset)) - - -@pytest.mark.parametrize( - "data, subset", - [ - ({"a": [1, None], "b": [1, 2]}, [0]), - ({"a": [1, None], "b": [1, 2]}, [1]), - ({"a": [1, None], "b": [1, 2]}, []), - ({"a": [1, 2], "b": [1, 2]}, [0]), - ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0]), - ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [1]), - ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0, 1]), - ], -) -def test_dropna_subset_cols(data, subset): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.dropna(axis=1, subset=subset), gdf.dropna(axis=1, subset=subset) - ) - - -# TODO: can't test with subset=[] below since Pandas -# returns empty DF when both subset=[] and thresh are specified. 
-@pytest.mark.parametrize("thresh", [0, 1, 2]) -@pytest.mark.parametrize("subset", [None, ["a"], ["b"], ["a", "b"]]) -def test_dropna_thresh(thresh, subset): - pdf = pd.DataFrame({"a": [1, 2, None, None], "b": [1, 2, 3, None]}) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.dropna(axis=0, thresh=thresh, subset=subset), - gdf.dropna(axis=0, thresh=thresh, subset=subset), - ) - - -@pytest.mark.parametrize("thresh", [0, 1, 2]) -@pytest.mark.parametrize("subset", [None, [0], [1], [0, 1]]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dropna_thresh_cols(thresh, subset, inplace): - pdf = pd.DataFrame( - {"a": [1, 2], "b": [3, 4], "c": [5, None], "d": [np.nan, np.nan]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.dropna( - axis=1, thresh=thresh, subset=subset, inplace=inplace - ) - actual = gdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq( - expected, - actual, - ) - - -@pytest.mark.parametrize( - "data", - [ - { - "key": [1, 2, 10], - "val": pa.array([np.nan, 3.0, 1.0]), - "abc": [np.nan, None, 1], - }, - { - "key": [None, 2, 1], - "val": pa.array([3.0, None, 0.1]), - "abc": [None, 1, None], - }, - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -def test_dropna_dataframe_np_nan(data, axis): - gdf = cudf.DataFrame(data) - pd_data = { - key: value.to_pandas() if isinstance(value, cudf.Series) else value - for key, value in data.items() - } - pdf = pd.DataFrame(pd_data) - - assert_eq(pdf.dropna(axis=axis), gdf.dropna(axis=axis), check_dtype=False) - - -@pytest.mark.parametrize( - "data, dtype", - [ - ([1, float("nan"), 2], "float64"), - (["x", None, "y"], "str"), - (["x", None, "y"], "category"), - (["2020-01-20", pd.NaT, "2020-03-15"], "datetime64[ns]"), - (["1s", pd.NaT, "3d"], "timedelta64[ns]"), - ], -) -def test_dropna_index(data, dtype): - pi = pd.Index(data, dtype=dtype) - gi = cudf.from_pandas(pi) - - expect = pi.dropna() - got = gi.dropna() - - assert_eq(expect, got) - - -@pytest.mark.parametrize("how", ["all", "any"]) -def test_dropna_multiindex(how): - pi = pd.MultiIndex.from_arrays([[1, None, 2], [None, None, 2]]) - gi = cudf.from_pandas(pi) - - expect = pi.dropna(how) - got = gi.dropna(how) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], - [pd.NaT, pd.NaT, pd.Timestamp("2020-03-01")], - ], - [ - [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], - [np.nan, np.nan, 1.0], - ], - [[1.0, np.nan, 2.0], [np.nan, np.nan, 1.0]], - ], -) -@pytest.mark.parametrize("how", ["all", "any"]) -def test_dropna_multiindex_2(data, how): - pi = pd.MultiIndex.from_arrays(data) - gi = cudf.from_pandas(pi) - - expect = pi.dropna(how) - got = gi.dropna(how) - - assert_eq(expect, got) - - -def test_ignore_index(): - pser = pd.Series([1, 2, np.nan], index=[2, 4, 1]) - gser = cudf.from_pandas(pser) - - result = pser.dropna(ignore_index=True) - expected = gser.dropna(ignore_index=True) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 8f08272ce66..78ffc662e11 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -12,7 +12,6 @@ from cudf import NA from cudf.core.udf.api import Masked from cudf.core.udf.masked_typing import MaskedType -from cudf.testing._utils import parametrize_numeric_dtypes_pairwise from cudf.utils._numba 
import _CUDFNumbaConfig arith_ops = ( @@ -166,20 +165,21 @@ def func(x): @pytest.mark.parametrize("op", ops) -@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize( "masked", ((False, True), (True, False), (True, True)), ids=("um", "mu", "mm"), ) -def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked): +def test_compile_arith_masked_ops( + op, numeric_types_as_str, numeric_types_as_str2, masked +): def func(x, y): return op(x, y) cc = (7, 5) - ty1 = from_dtype(np.dtype(left_dtype)) - ty2 = from_dtype(np.dtype(right_dtype)) + ty1 = from_dtype(np.dtype(numeric_types_as_str)) + ty2 = from_dtype(np.dtype(numeric_types_as_str2)) if masked[0]: ty1 = MaskedType(ty1) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py deleted file mode 100644 index 958a3657abb..00000000000 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ /dev/null @@ -1,1069 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. -import math -import operator - -import numpy as np -import pytest -from numba import cuda -from numba.core.typing import signature as nb_signature -from numba.core.typing.templates import AbstractTemplate -from numba.cuda.cudadecl import registry as cuda_decl_registry -from numba.cuda.cudaimpl import lower as cuda_lower - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.core.missing import NA -from cudf.core.udf._ops import ( - arith_ops, - bitwise_ops, - comparison_ops, - unary_ops, -) -from cudf.core.udf.api import Masked -from cudf.core.udf.strings_lowering import ( - cast_string_view_to_managed_udf_string, -) -from cudf.core.udf.strings_typing import ( - StringView, - managed_udf_string, - string_view, -) -from cudf.core.udf.utils import precompiled -from cudf.testing import assert_eq -from cudf.testing._utils import ( - _decimal_series, - parametrize_numeric_dtypes_pairwise, -) - - -def sv_to_managed_udf_str(sv): - """ - Cast a string_view object to a managed_udf_string object - - This placeholder function never runs in python - It exists only for numba to have something to replace - with the typing and lowering code below - - This is similar conceptually to needing a translation - engine to emit an expression in target language "B" when - there is no equivalent in the source language "A" to - translate from. 
This function effectively defines the - expression in language "A" and the associated typing - and lowering describe the translation process, despite - the expression having no meaning in language "A" - """ - pass - - -@cuda_decl_registry.register_global(sv_to_managed_udf_str) -class StringViewToUDFStringDecl(AbstractTemplate): - def generic(self, args, kws): - if isinstance(args[0], StringView) and len(args) == 1: - return nb_signature(managed_udf_string, string_view) - - -@cuda_lower(sv_to_managed_udf_str, string_view) -def sv_to_udf_str_testing_lowering(context, builder, sig, args): - return cast_string_view_to_managed_udf_string( - context, builder, sig.args[0], sig.return_type, args[0] - ) - - -@pytest.fixture(scope="module") -def str_udf_data(): - return cudf.DataFrame( - { - "str_col": [ - "abc", - "ABC", - "AbC", - "123", - "123aBc", - "123@.!", - "", - "rapids ai", - "gpu", - "True", - "False", - "1.234", - ".123a", - "0.013", - "1.0", - "01", - "20010101", - "cudf", - "cuda", - "gpu", - "This Is A Title", - "This is Not a Title", - "Neither is This a Title", - "NoT a TiTlE", - "123 Title Works", - ] - } - ) - - -@pytest.fixture(params=["a", "cu", "2", "gpu", "", " "]) -def substr(request): - return request.param - - -def run_masked_udf_test(func, data, args=(), nullable=True, **kwargs): - gdf = data - pdf = data.to_pandas(nullable=nullable) - - expect = pdf.apply(func, args=args, axis=1) - obtain = gdf.apply(func, args=args, axis=1) - assert_eq(expect, obtain, **kwargs) - - -def run_masked_string_udf_test(func, data, args=(), **kwargs): - gdf = data - pdf = data.to_pandas(nullable=True) - - def row_wrapper(row): - st = row["str_col"] - return func(st) - - expect = pdf.apply(row_wrapper, args=args, axis=1) - - func = cuda.jit(device=True)(func) - obtain = gdf.apply(row_wrapper, args=args, axis=1) - assert_eq(expect, obtain, **kwargs) - - # strings that come directly from input columns are backed by - # MaskedType(string_view) types. But new strings that are returned - # from functions or operators are backed by MaskedType(udf_string) - # types. We need to make sure all of our methods work on both kind - # of MaskedType. 
This function promotes the former to the latter - # prior to running the input function - def udf_string_wrapper(row): - masked_udf_str = Masked( - sv_to_managed_udf_str(row["str_col"].value), row["str_col"].valid - ) - return func(masked_udf_str) - - obtain = gdf.apply(udf_string_wrapper, args=args, axis=1) - assert_eq(expect, obtain, **kwargs) - - -def run_masked_udf_series(func, data, args=(), **kwargs): - gsr = data - psr = data.to_pandas(nullable=True) - - expect = psr.apply(func, args=args) - obtain = gsr.apply(func, args=args) - assert_eq(expect, obtain, **kwargs) - - -@pytest.mark.parametrize("op", arith_ops) -def test_arith_masked_vs_masked(op): - # This test should test all the typing - # and lowering for arithmetic ops between - # two columns - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", bitwise_ops) -def test_bitwise_masked_vs_masked(op): - # This test should test all the typing - # and lowering for bitwise ops between - # two columns - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame( - { - "a": [1, 0, 1, 0, 0b1011, 42, None], - "b": [1, 1, 0, 0, 0b1100, -42, 5], - } - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize( - "dtype_l", - ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"], -) -@pytest.mark.parametrize( - "dtype_r", - [ - "timedelta64[ns]", - "timedelta64[us]", - "timedelta64[ms]", - "timedelta64[s]", - "datetime64[ns]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[s]", - ], -) -@pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_arith_masked_vs_masked_datelike(op, dtype_l, dtype_r): - # Datetime version of the above - # does not test all dtype combinations for now - if "datetime" in dtype_l and "datetime" in dtype_r and op is operator.add: - # don't try adding datetimes to datetimes. - pytest.skip("Adding datetime to datetime is not valid") - - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame( - { - "a": ["2011-01-01", cudf.NA, "2011-03-01", cudf.NA], - "b": [4, 5, cudf.NA, cudf.NA], - } - ) - gdf["a"] = gdf["a"].astype(dtype_l) - gdf["b"] = gdf["b"].astype(dtype_r) - - pdf = gdf.to_pandas() - expect = op(pdf["a"], pdf["b"]) - obtain = gdf.apply(func, axis=1) - assert_eq(expect, obtain, check_dtype=False) - # TODO: After the following pandas issue is - # fixed, uncomment the following line and delete - # through `to_pandas()` statement. 
- # https://github.com/pandas-dev/pandas/issues/52411 - - # run_masked_udf_test(func, gdf, nullable=False, check_dtype=False) - - -@pytest.mark.parametrize("op", comparison_ops) -def test_compare_masked_vs_masked(op): - # this test should test all the - # typing and lowering for comparisons - # between columns - - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - # we should get: - # [?, ?, , , ] - gdf = cudf.DataFrame( - {"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]} - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, True, False]) -@pytest.mark.parametrize("data", [[1, 2, cudf.NA]]) -def test_arith_masked_vs_constant(op, constant, data): - def func(row): - x = row["data"] - return op(x, constant) - - gdf = cudf.DataFrame({"data": data}) - - if constant is False and op in { - operator.mod, - operator.pow, - operator.truediv, - operator.floordiv, - operator.imod, - operator.ipow, - operator.itruediv, - operator.ifloordiv, - }: - # The following tests cases yield undefined behavior: - # - truediv(x, False) because its dividing by zero - # - floordiv(x, False) because its dividing by zero - # - mod(x, False) because its mod by zero, - # - pow(x, False) because we have an NA in the series and pandas - # insists that (NA**0 == 1) where we do not - pytest.skip() - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, True, False]) -@pytest.mark.parametrize("data", [[2, 3, cudf.NA], [1, cudf.NA, 1]]) -def test_arith_masked_vs_constant_reflected(request, op, constant, data): - def func(row): - x = row["data"] - return op(constant, x) - - # Just a single column -> result will be all NA - gdf = cudf.DataFrame({"data": data}) - - # cudf differs from pandas for 1**NA - request.applymarker( - pytest.mark.xfail( - condition=(constant == 1 and op in {operator.pow, operator.ipow}), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("data", [[1, cudf.NA, 3], [2, 3, cudf.NA]]) -def test_arith_masked_vs_null(request, op, data): - def func(row): - x = row["data"] - return op(x, NA) - - gdf = cudf.DataFrame({"data": data}) - - # In pandas, 1**NA == 1. 
- request.applymarker( - pytest.mark.xfail( - condition=( - (gdf["data"] == 1).any() - and op in {operator.pow, operator.ipow} - ), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -def test_arith_masked_vs_null_reflected(op): - def func(row): - x = row["data"] - return op(NA, x) - - gdf = cudf.DataFrame({"data": [1, None, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", unary_ops) -def test_unary_masked(op): - # This test should test all the typing - # and lowering for unary ops - - def func(row): - x = row["a"] - return op(x) if x is not NA else NA - - if "log" in op.__name__: - gdf = cudf.DataFrame({"a": [0.1, 1.0, None, 3.5, 1e8]}) - elif op.__name__ in {"asin", "acos"}: - gdf = cudf.DataFrame({"a": [0.0, 0.5, None, 1.0]}) - elif op.__name__ in {"atanh"}: - gdf = cudf.DataFrame({"a": [0.0, -0.5, None, 0.8]}) - elif op.__name__ in {"acosh", "sqrt", "lgamma"}: - gdf = cudf.DataFrame({"a": [1.0, 2.0, None, 11.0]}) - elif op.__name__ in {"gamma"}: - gdf = cudf.DataFrame({"a": [0.1, 2, None, 4]}) - elif op.__name__ in {"invert"}: - gdf = cudf.DataFrame({"a": [-100, 128, None, 0]}, dtype="int64") - else: - gdf = cudf.DataFrame({"a": [-125.60, 395.2, 0.0, None]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_masked_is_null_conditional(): - def func(row): - x = row["a"] - y = row["b"] - if x is NA: - return y - else: - return x + y - - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_contains(): - def func(row): - x = row["a"] - return x in [1, 2] - - gdf = cudf.DataFrame({"a": [1, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@parametrize_numeric_dtypes_pairwise -@pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) -def test_apply_mixed_dtypes(left_dtype, right_dtype, op): - """ - Test that operations can be performed between columns - of different dtypes and return a column with the correct - values and nulls - """ - - # First perform the op on two dummy data on host, if numpy can - # safely type cast, we should expect it to work in udf too. 
- try: - op(np.dtype(left_dtype).type(0), np.dtype(right_dtype).type(42)) - except TypeError: - pytest.skip("Operation is unsupported for corresponding dtype.") - - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) - gdf["a"] = gdf["a"].astype(left_dtype) - gdf["b"] = gdf["b"].astype(right_dtype) - - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("val", [5, 5.5]) -def test_apply_return_literal(val): - """ - Test unification codepath for scalars and MaskedType - makes sure that numba knows how to cast a scalar value - to a MaskedType - """ - - def func(row): - x = row["a"] - y = row["b"] - if x is not NA and x < 2: - return val - else: - return x + y - - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_return_null(): - """ - Tests casting / unification of Masked and NA - """ - - def func(row): - x = row["a"] - if x is NA: - return NA - else: - return x - - gdf = cudf.DataFrame({"a": [1, None, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_return_either_null_or_literal(): - def func(row): - x = row["a"] - if x > 5: - return 2 - else: - return NA - - gdf = cudf.DataFrame({"a": [1, 3, 6]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_return_literal_only(): - def func(x): - return 5 - - gdf = cudf.DataFrame({"a": [1, None, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_everything(): - def func(row): - w = row["a"] - x = row["b"] - y = row["c"] - z = row["d"] - if x is NA: - return w + y - z - elif ((z > y) is not NA) and z > y: - return x - elif ((x + y) is not NA) and x + y == 0: - return z / x - elif x + y is NA: - return 2.5 - elif w > 100: - return ( - math.sin(x) - + math.sqrt(y) - - (-z) - + math.lgamma(x) * math.fabs(-0.8) / math.radians(3.14) - ) - else: - return y > 2 - - gdf = cudf.DataFrame( - { - "a": [1, 3, 6, 0, None, 5, None, 101], - "b": [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0, 1.0], - "c": [2, 3, 6, 0, None, 5, None, 6], - "d": [4, None, 6, 0, None, 5, None, 7.5], - } - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -### - - -@pytest.mark.parametrize( - "data,name", - [([1, 2, 3], None), ([1, cudf.NA, 3], None), ([1, 2, 3], "test_name")], -) -def test_series_apply_basic(data, name): - data = cudf.Series(data, name=name) - - def func(x): - return x + 1 - - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/57390", -) -def test_series_apply_null_conditional(): - def func(x): - if x is NA: - return 42 - else: - return x - 1 - - data = cudf.Series([1, cudf.NA, 3]) - - run_masked_udf_series(func, data) - - -### - - -@pytest.mark.parametrize("op", arith_ops) -def test_series_arith_masked_vs_masked(op): - def func(x): - return op(x, x) - - data = cudf.Series([1, cudf.NA, 3]) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/57390", -) -@pytest.mark.parametrize("op", comparison_ops) -def test_series_compare_masked_vs_masked(op): - """ - In the series case, only one other MaskedType to compare with - - itself - """ - - def func(x): - return op(x, x) - - data = cudf.Series([1, cudf.NA, 3]) - 
run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) -def test_series_arith_masked_vs_constant(request, op, constant): - def func(x): - return op(x, constant) - - # Just a single column -> result will be all NA - data = cudf.Series([1, 2, cudf.NA]) - # in pandas, 1**NA == 1. In cudf, 1**NA == NA. - request.applymarker( - pytest.mark.xfail( - condition=( - constant is cudf.NA and op in {operator.pow, operator.ipow} - ), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) -def test_series_arith_masked_vs_constant_reflected(request, op, constant): - def func(x): - return op(constant, x) - - # Just a single column -> result will be all NA - data = cudf.Series([1, 2, cudf.NA]) - # Using in {1} since bool(NA == 1) raises a TypeError since NA is - # neither truthy nor falsy - # in pandas, 1**NA == 1. In cudf, 1**NA == NA. - request.applymarker( - pytest.mark.xfail( - condition=( - constant in {1} and op in {operator.pow, operator.ipow} - ), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/57390", -) -def test_series_masked_is_null_conditional(): - def func(x): - if x is NA: - return 42 - else: - return x - - data = cudf.Series([1, cudf.NA, 3, cudf.NA]) - - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_lambda_support(op): - func = lambda row: op(row["a"], row["b"]) # noqa: E731 - - data = cudf.DataFrame( - {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} - ) - - run_masked_udf_test(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_nested_function_support(op): - """ - Nested functions need to be explicitly jitted by the user - for numba to recognize them. Unfortunately the object - representing the jitted function can not itself be used in - pandas udfs. 
- """ - - def inner(x, y): - return op(x, y) - - def outer(row): - x = row["a"] - y = row["b"] - return inner(x, y) - - gdf = cudf.DataFrame( - {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} - ) - - with pytest.raises(ValueError): - gdf.apply(outer, axis=1) - - pdf = gdf.to_pandas(nullable=True) - inner_gpu = cuda.jit(device=True)(inner) - - def outer_gpu(row): - x = row["a"] - y = row["b"] - return inner_gpu(x, y) - - got = gdf.apply(outer_gpu, axis=1) - expect = pdf.apply(outer, axis=1) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, - {"a": [1, 2, 3], "c": [4, 5, 6], "b": [7, 8, 9]}, - {"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]}, - ], -) -def test_masked_udf_subset_selection(data): - def func(row): - return row["a"] + row["b"] - - data = cudf.DataFrame(data) - run_masked_udf_test(func, data) - - -@pytest.mark.parametrize( - "unsupported_col", - [ - _decimal_series( - ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) - ), - cudf.Series([1, 2, 3], dtype="category"), - cudf.interval_range(start=0, end=3), - [[1, 2], [3, 4], [5, 6]], - [{"a": 1}, {"a": 2}, {"a": 3}], - ], -) -def test_masked_udf_unsupported_dtype(unsupported_col): - data = cudf.DataFrame() - data["unsupported_col"] = unsupported_col - - def func(row): - return row["unsupported_col"] - - # check that we fail when an unsupported type is used within a function - with pytest.raises(ValueError): - data.apply(func, axis=1) - - # also check that a DF containing unsupported dtypes can still run a - # function that does NOT involve any of the unsupported dtype columns - data["supported_col"] = 1 - - def other_func(row): - return row["supported_col"] - - expect = cudf.Series(np.ones(len(data))) - got = data.apply(other_func, axis=1) - - assert_eq(expect, got, check_dtype=False) - - -# tests for `DataFrame.apply(f, args=(x,y,z))` -# testing the whole space of possibilities is intractable -# these test the most rudimentary guaranteed functionality -@pytest.mark.parametrize( - "data", - [ - {"a": [1, cudf.NA, 3]}, - {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, - {"a": [True, False, cudf.NA]}, - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_scalar_args_binops(data, op): - data = cudf.DataFrame(data) - - def func(row, c): - return op(row["a"], c) - - run_masked_udf_test(func, data, args=(1,), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, cudf.NA, 3]}, - {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, - {"a": [True, False, cudf.NA]}, - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_scalar_args_binops_multiple(data, op): - data = cudf.DataFrame(data) - - def func(row, c, k): - x = op(row["a"], c) - y = op(x, k) - return y - - run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - [1, cudf.NA, 3], - [0.5, 2.0, cudf.NA, cudf.NA, 5.0], - [True, False, cudf.NA], - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_mask_udf_scalar_args_binops_series(data, op): - data = cudf.Series(data) - - def func(x, c): - return x + c - - run_masked_udf_series(func, data, args=(1,), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - [1, cudf.NA, 3], - [0.5, 2.0, cudf.NA, cudf.NA, 5.0], - [True, False, cudf.NA], - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_scalar_args_binops_multiple_series(request, 
data, op): - data = cudf.Series(data) - request.applymarker( - pytest.mark.xfail( - op in comparison_ops - and PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and data.dtype.kind != "b", - reason="https://github.com/pandas-dev/pandas/issues/57390", - ) - ) - - def func(data, c, k): - x = op(data, c) - y = op(x, k) - return y - - run_masked_udf_series(func, data, args=(1, 2), check_dtype=False) - - -def test_masked_udf_caching(): - # Make sure similar functions that differ - # by simple things like constants actually - # recompile - - data = cudf.Series([1, 2, 3]) - - expect = data**2 - got = data.apply(lambda x: x**2) - assert_eq(expect, got, check_dtype=False) - - # update the constant value being used and make sure - # it does not result in a cache hit - - expect = data**3 - got = data.apply(lambda x: x**3) - assert_eq(expect, got, check_dtype=False) - - # make sure we get a hit when reapplying - def f(x): - return x + 1 - - precompiled.clear() - assert precompiled.currsize == 0 - data.apply(f) - - assert precompiled.currsize == 1 - data.apply(f) - - assert precompiled.currsize == 1 - - # validate that changing the type of a scalar arg - # results in a miss - precompiled.clear() - - def f(x, c): - return x + c - - data.apply(f, args=(1,)) - assert precompiled.currsize == 1 - - data.apply(f, args=(1.5,)) - assert precompiled.currsize == 2 - - -@pytest.mark.parametrize( - "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] -) -@pytest.mark.parametrize("operator", [float, int, bool]) -def test_masked_udf_casting(operator, data): - data = cudf.Series(data) - - def func(x): - return operator(x) - - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - np.array( - [0, 1, -1, 0, np.iinfo("int64").min, np.iinfo("int64").max], - dtype="int64", - ), - np.array([0, 0, 1, np.iinfo("uint64").max], dtype="uint64"), - np.array( - [ - 0, - 0.0, - -1.0, - 1.5, - -1.5, - np.finfo("float64").min, - np.finfo("float64").max, - np.nan, - np.inf, - -np.inf, - ], - dtype="float64", - ), - [False, True, False, cudf.NA], - ], -) -def test_masked_udf_abs(data): - data = cudf.Series(data) - data[0] = cudf.NA - - def func(x): - return abs(x) - - run_masked_udf_series(func, data, check_dtype=False) - - -class TestStringUDFs: - def test_string_udf_len(self, str_udf_data): - def func(row): - return len(row["str_col"]) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_startswith(self, str_udf_data, substr): - def func(row): - return row["str_col"].startswith(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_endswith(self, str_udf_data, substr): - def func(row): - return row["str_col"].endswith(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_find(self, str_udf_data, substr): - def func(row): - return row["str_col"].find(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_rfind(self, str_udf_data, substr): - def func(row): - return row["str_col"].rfind(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_contains(self, str_udf_data, substr): - def func(row): - return substr in row["str_col"] - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("other", ["cudf", "123", "", " "]) - @pytest.mark.parametrize("cmpop", comparison_ops) - def test_string_udf_cmpops(self, str_udf_data, other, cmpop): - def func(row): - return 
cmpop(row["str_col"], other) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isalnum(self, str_udf_data): - def func(row): - return row["str_col"].isalnum() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isalpha(self, str_udf_data): - def func(row): - return row["str_col"].isalpha() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isdigit(self, str_udf_data): - def func(row): - return row["str_col"].isdigit() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isdecimal(self, str_udf_data): - def func(row): - return row["str_col"].isdecimal() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isupper(self, str_udf_data): - def func(row): - return row["str_col"].isupper() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_islower(self, str_udf_data): - def func(row): - return row["str_col"].islower() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isspace(self, str_udf_data): - def func(row): - return row["str_col"].isspace() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_istitle(self, str_udf_data): - def func(row): - return row["str_col"].istitle() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_count(self, str_udf_data, substr): - def func(row): - return row["str_col"].count(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.xfail(reason="Identity function not supported.") - def test_string_udf_return_string(self, str_udf_data): - def func(row): - return row["str_col"] - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) - def test_string_udf_strip(self, str_udf_data, strip_char): - def func(row): - return row["str_col"].strip(strip_char) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) - def test_string_udf_lstrip(self, str_udf_data, strip_char): - def func(row): - return row["str_col"].lstrip(strip_char) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) - def test_string_udf_rstrip(self, str_udf_data, strip_char): - def func(row): - return row["str_col"].rstrip(strip_char) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_upper(self, str_udf_data): - def func(row): - return row["str_col"].upper() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_lower(self, str_udf_data): - def func(row): - return row["str_col"].lower() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize( - "concat_char", ["1", "a", "12", " ", "", ".", "@"] - ) - def test_string_udf_concat(self, str_udf_data, concat_char): - def func(row): - return row["str_col"] + concat_char - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("to_replace", ["a", "1", "", "@"]) - @pytest.mark.parametrize("replacement", ["a", "1", "", "@"]) - def test_string_udf_replace(self, str_udf_data, to_replace, replacement): - def func(row): - return row["str_col"].replace(to_replace, replacement) - - run_masked_udf_test(func, str_udf_data, 
check_dtype=False) From 87f8b59fdc0d07d921c4f91637467c35aba769dc Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 20 Aug 2025 21:26:22 +0200 Subject: [PATCH 179/366] Fix cudf-polars dependency list docs (#19750) The cudf-polars documentation only requests installing `ucxx`, but `distributed-ucxx` is required (which also brings `ucxx`), and that needs to be fixed. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19750 --- docs/cudf/source/cudf_polars/streaming_execution.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/cudf/source/cudf_polars/streaming_execution.md b/docs/cudf/source/cudf_polars/streaming_execution.md index 5e74af7038d..56a80a9b11a 100644 --- a/docs/cudf/source/cudf_polars/streaming_execution.md +++ b/docs/cudf/source/cudf_polars/streaming_execution.md @@ -77,9 +77,9 @@ Unlike the single GPU executor, this does require a number of additional dependencies. We currently require [Dask](https://www.dask.org/) and [Dask-CUDA](https://docs.rapids.ai/api/dask-cuda/nightly/) to be -installed. In addition, we recommend that -[ucxx](https://github.com/rapidsai/ucxx) and -[rapidsmpf](https://github.com/rapidsai/rapidsmpf) are installed to +installed. In addition, we recommend that Dask Distributed plugin of +[UCXX](https://github.com/rapidsai/ucxx) and +[RapidsMPF](https://github.com/rapidsai/rapidsmpf) are installed to take advantage of any high-performance networking. To quickly install all of these dependencies into a conda environment, @@ -87,7 +87,7 @@ you can run: ``` conda install -c rapidsai -c conda-forge \ - cudf-polars rapidsmpf dask-cuda ucxx + cudf-polars rapidsmpf dask-cuda distributed-ucxx ``` From 528eea27f8c211bc3670eb67bd562d29410948bf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 20 Aug 2025 12:27:15 -0700 Subject: [PATCH 180/366] Skip third-party tests when possible (#19747) I noticed that we were running these in CI on PRs that didn't need to run them. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/19747 --- .github/workflows/pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 061a24e226b..8249fea30a0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -343,6 +343,7 @@ jobs: needs: conda-python-build secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: build_type: pull-request branch: ${{ inputs.branch }} From a46a97f1bca6a7bdf6ac1de927c5103911f4df19 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:46:54 -0700 Subject: [PATCH 181/366] Improve `M2`, `VARIANCE` and `STD` hash-based groupby aggregations (#19694) Currently, the aggregations `M2`, `VARIANCE` and `STD` are implemented using a two-pass groupby approach: * In the first pass, `SUM` and `COUNT_VALID` groupby aggregations are computed. * In the second pass, the results of these aggregations are used to perform another groupby aggregation. 
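The rework rests on the standard shifted-data identity for the sum of squared deviations, `SUM((x - avg)*(x - avg)) = SUM(x^2) - SUM(x)*SUM(x)/group_size`, as detailed below. A minimal NumPy sketch (illustrative only, not part of this patch) checking the identity per key group:

```
# Illustrative check of the shifted-data identity behind the rework;
# the group keys and values here are made up and not part of the patch.
import numpy as np

rng = np.random.default_rng(0)
keys = rng.integers(0, 4, size=1_000)
vals = rng.normal(size=1_000)

for k in np.unique(keys):
    x = vals[keys == k]
    # M2 computed the old way: a second groupby-style pass over the values.
    two_pass = np.sum((x - x.mean()) ** 2)
    # M2 as a linear transformation of SUM(x^2), SUM(x) and COUNT_VALID.
    one_pass = np.sum(x * x) - np.sum(x) ** 2 / x.size
    assert np.isclose(two_pass, one_pass)
```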
The second groupby operation can be avoided by changing the implementation of these aggregations to use another formula: * Instead of computing `M2` as groupby aggregation `M2 = SUM((x - avg)*(x-avg))` where `x` are values in the same key group, we can compute it using a linear transformation `M2 = SUM(x^2) - SUM(x)*SUM(x)/group_size`. This linear transformation is much faster than a groupby operation, which involves atomic operations across all values within each key group. In addition, such transformation is performed on the output column, which can be much smaller than the input column used for the second step groupby aggregation. * For `VARIANCE` and `STD`, they can also be computed from `M2` with additional linear transformations. This PR reimplements the aggregations above to use the new formula, changing the second step from a groupby operation into a linear transformation. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Yunsong Wang (https://github.com/PointKernel) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19694 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/CMakeLists.txt | 2 +- .../{group_m2.cpp => group_m2_var_std.cpp} | 51 +++-- cpp/src/groupby/common/m2_var_std.cu | 196 ++++++++++++++++++ cpp/src/groupby/common/m2_var_std.hpp | 46 ++++ .../groupby/hash/flatten_single_pass_aggs.cpp | 3 + .../hash/hash_compound_agg_finalizer.cu | 99 ++++----- .../hash/hash_compound_agg_finalizer.hpp | 5 +- cpp/src/groupby/hash/m2_var_functor.cuh | 177 ---------------- .../cudf/pandas/scripts/conftest-patch.py | 1 - 10 files changed, 319 insertions(+), 262 deletions(-) rename cpp/benchmarks/groupby/{group_m2.cpp => group_m2_var_std.cpp} (57%) create mode 100644 cpp/src/groupby/common/m2_var_std.cu create mode 100644 cpp/src/groupby/common/m2_var_std.hpp delete mode 100644 cpp/src/groupby/hash/m2_var_functor.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1598d7fb51b..f59b9d6bf7e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -424,6 +424,7 @@ add_library( src/filling/fill.cu src/filling/repeat.cu src/filling/sequence.cu + src/groupby/common/m2_var_std.cu src/groupby/groupby.cu src/groupby/hash/compute_aggregations.cu src/groupby/hash/compute_aggregations_null.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5fc041c4e58..c16f0789795 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -268,7 +268,7 @@ ConfigureNVBench( GROUPBY_NVBENCH groupby/group_complex_keys.cpp groupby/group_histogram.cpp - groupby/group_m2.cpp + groupby/group_m2_var_std.cpp groupby/group_max.cpp groupby/group_max_multithreaded.cpp groupby/group_nunique.cpp diff --git a/cpp/benchmarks/groupby/group_m2.cpp b/cpp/benchmarks/groupby/group_m2_var_std.cpp similarity index 57% rename from cpp/benchmarks/groupby/group_m2.cpp rename to cpp/benchmarks/groupby/group_m2_var_std.cpp index be907e9e343..fbe6ed3cf78 100644 --- a/cpp/benchmarks/groupby/group_m2.cpp +++ b/cpp/benchmarks/groupby/group_m2_var_std.cpp @@ -21,11 +21,13 @@ #include -template -void groupby_m2_helper(nvbench::state& state, - cudf::size_type num_rows, - cudf::size_type value_key_ratio, - double null_probability) +namespace { + +template +void run_benchmark(nvbench::state& state, + cudf::size_type num_rows, + cudf::size_type value_key_ratio, + double null_probability) { auto const keys = [&] { data_profile const profile = @@ -51,13 +53,22 @@ void 
groupby_m2_helper(nvbench::state& state, // Vector of 1 request std::vector requests(1); requests.back().values = values->view(); - requests.back().aggregations.push_back(cudf::make_m2_aggregation()); + if constexpr (Agg == cudf::aggregation::Kind::M2) { + requests.back().aggregations.push_back(cudf::make_m2_aggregation()); + } else if constexpr (Agg == cudf::aggregation::Kind::VARIANCE) { + requests.back().aggregations.push_back( + cudf::make_variance_aggregation()); + } else if constexpr (Agg == cudf::aggregation::Kind::STD) { + requests.back().aggregations.push_back(cudf::make_std_aggregation()); + } else { + throw std::runtime_error("Unsupported aggregation kind."); + } auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { - auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys->view()})); - auto const result = gb_obj.aggregate(requests); + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys->view()})); + [[maybe_unused]] auto const result = gb_obj.aggregate(requests); }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); @@ -66,18 +77,26 @@ void groupby_m2_helper(nvbench::state& state, mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } -template -void bench_groupby_m2(nvbench::state& state, nvbench::type_list) +} // namespace + +template +void bench_groupby_m2_var_std(nvbench::state& state, + nvbench::type_list>) { auto const value_key_ratio = static_cast(state.get_int64("value_key_ratio")); auto const num_rows = static_cast(state.get_int64("num_rows")); auto const null_probability = state.get_float64("null_probability"); - groupby_m2_helper(state, num_rows, value_key_ratio, null_probability); + run_benchmark(state, num_rows, value_key_ratio, null_probability); } -NVBENCH_BENCH_TYPES(bench_groupby_m2, NVBENCH_TYPE_AXES(nvbench::type_list)) - .set_name("groupby_m2") - .add_int64_axis("value_key_ratio", {10, 30, 100}) - .add_int64_axis("num_rows", {10'000, 1'000'000, 10'000'000}) - .add_float64_axis("null_probability", {0, 0.1, 0.9}); +using Types = nvbench::type_list; +using AggKinds = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(bench_groupby_m2_var_std, NVBENCH_TYPE_AXES(Types, AggKinds)) + .set_name("groupby_m2_var_std") + .add_int64_axis("value_key_ratio", {20, 100}) + .add_int64_axis("num_rows", {100'000, 10'000'000, 100'000'000}) + .add_float64_axis("null_probability", {0, 0.5}); diff --git a/cpp/src/groupby/common/m2_var_std.cu b/cpp/src/groupby/common/m2_var_std.cu new file mode 100644 index 00000000000..5ca471c15bf --- /dev/null +++ b/cpp/src/groupby/common/m2_var_std.cu @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2020-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "m2_var_std.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail { + +namespace { + +template +__device__ constexpr bool is_m2_supported() +{ + return is_numeric() && !is_fixed_point(); +} + +struct m2_functor { + template + void operator()(Args...) // + requires(!is_m2_supported()) + { + CUDF_FAIL("Invalid source type for M2 aggregation."); + } + + template + void evaluate(Target* target, + SumSqrType const* sum_sqr, + SumType const* sum, + CountType const* count, + size_type size, + rmm::cuda_stream_view stream) const noexcept + { + thrust::tabulate(rmm::exec_policy_nosync(stream), + target, + target + size, + [sum_sqr, sum, count] __device__(size_type const idx) { + auto const group_count = count[idx]; + if (group_count == 0) { return Target{}; } + auto const group_sum_sqr = static_cast(sum_sqr[idx]); + auto const group_sum = static_cast(sum[idx]); + auto const result = group_sum_sqr - group_sum * group_sum / group_count; + return result; + }); + } + + template + void operator()(mutable_column_view const& target, + column_view const& sum_sqr, + column_view const& sum, + column_view const& count, + rmm::cuda_stream_view stream) const noexcept // + requires(is_m2_supported()) + { + using Target = cudf::detail::target_type_t; + using SumSqrType = cudf::detail::target_type_t; + using SumType = cudf::detail::target_type_t; + using CountType = cudf::detail::target_type_t; + + // Separate the implementation into another function, which has fewer instantiations since + // the data types (target/sum/count etc) are mostly the same. + evaluate(target.begin(), + sum_sqr.begin(), + sum.begin(), + count.begin(), + target.size(), + stream); + } +}; + +} // namespace + +std::unique_ptr compute_m2(data_type source_type, + column_view const& sum_sqr, + column_view const& sum, + column_view const& count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto output = make_numeric_column(cudf::detail::target_type(source_type, aggregation::M2), + sum.size(), + cudf::detail::copy_bitmask(sum, stream, mr), + sum.null_count(), + stream, + mr); + type_dispatcher(source_type, m2_functor{}, output->mutable_view(), sum_sqr, sum, count, stream); + return output; +} + +namespace { + +// M2, VARIANCE, STD and COUNT_VALID aggregations always have fixed types, thus we hardcode them +// instead of using type dispatcher for faster compilation. +using M2Type = double; +using VarianceType = double; +using StdType = double; +using CountType = int32_t; + +void check_input_types(column_view const& m2, column_view const& count) +{ + CUDF_EXPECTS(m2.type().id() == type_to_id(), + "Data type of M2 aggregation must be FLOAT64.", + std::invalid_argument); + CUDF_EXPECTS(count.type().id() == type_to_id(), + "Data type of COUNT_VALID aggregation must be INT32.", + std::invalid_argument); +} + +template +std::unique_ptr compute_variance_std(TransformFunc&& transform_fn, + size_type size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto output = make_numeric_column( + data_type(type_to_id()), size, mask_state::UNALLOCATED, stream, mr); + + // Since we may have new null rows depending on the group count, we need to generate a new null + // mask from scratch. 
+ rmm::device_uvector validity(size, stream); + + auto const out_it = + thrust::make_zip_iterator(output->mutable_view().begin(), validity.begin()); + thrust::tabulate(rmm::exec_policy_nosync(stream), out_it, out_it + size, transform_fn); + + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), cuda::std::identity{}, stream, mr); + if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); } + + return output; +} + +} // namespace + +std::unique_ptr compute_variance(column_view const& m2, + column_view const& count, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + check_input_types(m2, count); + + auto const transform_func = + [m2 = m2.begin(), count = count.begin(), ddof] __device__( + size_type const idx) -> cuda::std::pair { + auto const group_count = count[idx]; + auto const df = group_count - ddof; + if (group_count == 0 || df <= 0) { return {VarianceType{}, false}; } + return {m2[idx] / df, true}; + }; + return compute_variance_std(transform_func, m2.size(), stream, mr); +} + +std::unique_ptr compute_std(column_view const& m2, + column_view const& count, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + check_input_types(m2, count); + + auto const transform_func = + [m2 = m2.begin(), count = count.begin(), ddof] __device__( + size_type const idx) -> cuda::std::pair { + auto const group_count = count[idx]; + auto const df = group_count - ddof; + if (group_count == 0 || df <= 0) { return {StdType{}, false}; } + return {cuda::std::sqrt(m2[idx] / df), true}; + }; + return compute_variance_std(transform_func, m2.size(), stream, mr); +} + +} // namespace cudf::groupby::detail diff --git a/cpp/src/groupby/common/m2_var_std.hpp b/cpp/src/groupby/common/m2_var_std.hpp new file mode 100644 index 00000000000..161bfbcb909 --- /dev/null +++ b/cpp/src/groupby/common/m2_var_std.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include + +namespace cudf::groupby::detail { + +std::unique_ptr compute_m2(data_type source_type, + column_view const& sum_sqr, + column_view const& sum, + column_view const& count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +std::unique_ptr compute_variance(column_view const& m2, + column_view const& count, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +std::unique_ptr compute_std(column_view const& m2, + column_view const& count, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace cudf::groupby::detail diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp index 8e64560d246..6f2402c5149 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -72,6 +72,7 @@ class groupby_simple_aggregations_collector final cudf::detail::m2_aggregation const&) override { std::vector> aggs; + aggs.push_back(make_sum_of_squares_aggregation()); aggs.push_back(make_sum_aggregation()); // COUNT_VALID aggs.push_back(make_count_aggregation()); @@ -83,6 +84,7 @@ class groupby_simple_aggregations_collector final cudf::detail::var_aggregation const&) override { std::vector> aggs; + aggs.push_back(make_sum_of_squares_aggregation()); aggs.push_back(make_sum_aggregation()); // COUNT_VALID aggs.push_back(make_count_aggregation()); @@ -94,6 +96,7 @@ class groupby_simple_aggregations_collector final cudf::detail::std_aggregation const&) override { std::vector> aggs; + aggs.push_back(make_sum_of_squares_aggregation()); aggs.push_back(make_sum_aggregation()); // COUNT_VALID aggs.push_back(make_count_aggregation()); diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu index 8b61254ce38..bd17780671b 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -14,28 +14,25 @@ * limitations under the License. */ +#include "groupby/common/m2_var_std.hpp" #include "hash_compound_agg_finalizer.hpp" #include "helpers.cuh" -#include "m2_var_functor.cuh" -#include #include #include #include #include #include -#include +#include #include #include #include -#include - namespace cudf::groupby::detail::hash { template hash_compound_agg_finalizer::hash_compound_agg_finalizer( - column_view col, + column_view const& col, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, @@ -52,7 +49,7 @@ hash_compound_agg_finalizer::hash_compound_agg_finalizer( stream(stream), mr(mr) { - result_type = + input_type = cudf::is_dictionary(col.type()) ? 
cudf::dictionary_column_view(col).keys().type() : col.type(); } @@ -106,7 +103,7 @@ template void hash_compound_agg_finalizer::visit(cudf::detail::min_aggregation const& agg) { if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { + if (input_type.id() == type_id::STRING) { auto transformed_agg = make_argmin_aggregation(); dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); } else { @@ -119,7 +116,7 @@ void hash_compound_agg_finalizer::visit(cudf::detail::max_aggregation c { if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { + if (input_type.id() == type_id::STRING) { auto transformed_agg = make_argmax_aggregation(); dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); } else { @@ -143,7 +140,7 @@ void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation cudf::detail::binary_operation(sum_result, count_result, binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), + cudf::detail::target_type(input_type, aggregation::MEAN), stream, mr); dense_results->add_result(col, agg, std::move(result)); @@ -154,31 +151,18 @@ void hash_compound_agg_finalizer::visit(cudf::detail::m2_aggregation co { if (dense_results->has_result(col, agg)) { return; } - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); + auto const sum_sqr_agg = make_sum_of_squares_aggregation(); + auto const sum_agg = make_sum_aggregation(); + auto const count_agg = make_count_aggregation(); + this->visit(*sum_sqr_agg); this->visit(*sum_agg); this->visit(*count_agg); - auto const sum_result = sparse_results->get_result(col, *sum_agg); - auto const count_result = sparse_results->get_result(col, *count_agg); - - auto const d_values_ptr = column_device_view::create(col, stream); - auto const d_sum_ptr = column_device_view::create(sum_result, stream).release(); - auto const d_count_ptr = column_device_view::create(count_result, stream).release(); - - auto output = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto output_view = mutable_column_device_view::create(output->mutable_view(), stream); - auto output_tview = mutable_table_view{{output->mutable_view()}}; - cudf::detail::initialize_with_identity( - output_tview, host_span(&agg.kind, 1), stream); - - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - col.size(), - m2_hash_functor{set, row_bitmask, *output_view, *d_values_ptr, *d_sum_ptr, *d_count_ptr}); - sparse_results->add_result(col, agg, std::move(output)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); + auto const sum_sqr_result = dense_results->get_result(col, *sum_sqr_agg); + auto const sum_result = dense_results->get_result(col, *sum_agg); + auto const count_result = dense_results->get_result(col, *count_agg); + + auto output = compute_m2(input_type, sum_sqr_result, sum_result, count_result, stream, mr); + dense_results->add_result(col, agg, std::move(output)); } template @@ -186,44 +170,31 @@ void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation c { if (dense_results->has_result(col, agg)) return; - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); + auto const m2_agg = make_m2_aggregation(); + auto const count_agg = make_count_aggregation(); + this->visit(*dynamic_cast(m2_agg.get())); this->visit(*count_agg); - 
column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity( - var_table_view, host_span(&agg.kind, 1), stream); - - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - col.size(), - var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); + auto const m2_result = dense_results->get_result(col, *m2_agg); + auto const count_result = dense_results->get_result(col, *count_agg); + + auto output = compute_variance(m2_result, count_result, agg._ddof, stream, mr); + dense_results->add_result(col, agg, std::move(output)); } template void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg) { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); + if (dense_results->has_result(col, agg)) { return; } - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); + auto const m2_agg = make_m2_aggregation(); + auto const count_agg = make_count_aggregation(); + this->visit(*dynamic_cast(m2_agg.get())); + this->visit(*count_agg); + auto const m2_result = dense_results->get_result(col, *m2_agg); + auto const count_result = dense_results->get_result(col, *count_agg); + + auto output = compute_std(m2_result, count_result, agg._ddof, stream, mr); + dense_results->add_result(col, agg, std::move(output)); } template class hash_compound_agg_finalizer>; diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp index 63e08a19177..a78a0729a06 100644 --- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp @@ -21,13 +21,12 @@ #include #include -#include namespace cudf::groupby::detail::hash { template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { column_view col; - data_type result_type; + data_type input_type; cudf::detail::result_cache* sparse_results; cudf::detail::result_cache* dense_results; device_span gather_map; @@ -39,7 +38,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final public: using cudf::detail::aggregation_finalizer::visit; - hash_compound_agg_finalizer(column_view col, + hash_compound_agg_finalizer(column_view const& col, cudf::detail::result_cache* sparse_results, cudf::detail::result_cache* dense_results, device_span gather_map, diff --git a/cpp/src/groupby/hash/m2_var_functor.cuh b/cpp/src/groupby/hash/m2_var_functor.cuh deleted file mode 100644 index c8f72fc4fbc..00000000000 --- 
a/cpp/src/groupby/hash/m2_var_functor.cuh +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -namespace cudf::groupby::detail::hash { - -template -__device__ constexpr static bool is_m2_var_supported() -{ - return is_numeric() && !is_fixed_point(); -} - -template -struct m2_hash_functor { - SetType set; - bitmask_type const* __restrict__ row_bitmask; - mutable_column_device_view target; - column_device_view source; - column_device_view sum; - column_device_view count; - m2_hash_functor(SetType set, - bitmask_type const* row_bitmask, - mutable_column_device_view target, - column_device_view source, - column_device_view sum, - column_device_view count) - : set{set}, row_bitmask{row_bitmask}, target{target}, source{source}, sum{sum}, count{count} - { - } - - template - __device__ void operator()(column_device_view const&, size_type, size_type) noexcept - requires(!is_m2_var_supported()) - { - CUDF_UNREACHABLE("Invalid source type for M2 aggregation."); - } - - template - __device__ void operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept - requires(is_m2_var_supported()) - { - using Target = cudf::detail::target_type_t; - using SumType = cudf::detail::target_type_t; - using CountType = cudf::detail::target_type_t; - - if (source.is_null(source_index)) { return; } - auto const group_size = count.element(target_index); - if (group_size == 0) { return; } - - auto const x = static_cast(source.element(source_index)); - auto const mean = static_cast(sum.element(target_index)) / group_size; - auto const diff = x - mean; - auto const result = diff * diff; - cuda::atomic_ref ref{target.element(target_index)}; - ref.fetch_add(result, cuda::std::memory_order_relaxed); - if (target.is_null(target_index)) { target.set_valid(target_index); } - } - - __device__ inline void operator()(size_type source_index) - { - if (row_bitmask == nullptr or bit_is_set(row_bitmask, source_index)) { - auto const target_index = *set.find(source_index); - - auto col = source; - auto source_type = source.type(); - if (source_type.id() == type_id::DICTIONARY32) { - col = source.child(cudf::dictionary_column_view::keys_column_index); - source_type = col.type(); - source_index = static_cast(source.element(source_index)); - } - - type_dispatcher(source_type, *this, col, source_index, target_index); - } - } -}; - -template -struct var_hash_functor { - SetType set; - bitmask_type const* __restrict__ row_bitmask; - mutable_column_device_view target; - column_device_view source; - column_device_view sum; - column_device_view count; - size_type ddof; - var_hash_functor(SetType set, - bitmask_type const* row_bitmask, - mutable_column_device_view target, - column_device_view source, - column_device_view sum, - column_device_view count, - size_type ddof) - : set{set}, - row_bitmask{row_bitmask}, - 
target{target}, - source{source}, - sum{sum}, - count{count}, - ddof{ddof} - { - } - - template - __device__ void operator()(column_device_view const&, size_type, size_type) noexcept - requires(!is_m2_var_supported()) - { - CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination."); - } - - template - __device__ void operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept - requires(is_m2_var_supported()) - { - using Target = cudf::detail::target_type_t; - using SumType = cudf::detail::target_type_t; - using CountType = cudf::detail::target_type_t; - - if (source.is_null(source_index)) return; - CountType group_size = count.element(target_index); - if (group_size == 0 or group_size - ddof <= 0) return; - - auto x = static_cast(source.element(source_index)); - auto mean = static_cast(sum.element(target_index)) / group_size; - Target result = (x - mean) * (x - mean) / (group_size - ddof); - cuda::atomic_ref ref{target.element(target_index)}; - ref.fetch_add(result, cuda::std::memory_order_relaxed); - // STD sqrt is applied in finalize() - - if (target.is_null(target_index)) { target.set_valid(target_index); } - } - - __device__ inline void operator()(size_type source_index) - { - if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { - auto const target_index = *set.find(source_index); - - auto col = source; - auto source_type = source.type(); - if (source_type.id() == type_id::DICTIONARY32) { - col = source.child(cudf::dictionary_column_view::keys_column_index); - source_type = col.type(); - source_index = static_cast(source.element(source_index)); - } - - type_dispatcher(source_type, *this, col, source_index, target_index); - } - } -}; - -} // namespace cudf::groupby::detail::hash diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index b0419f844fd..cf12fae4337 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -7586,7 +7586,6 @@ def pytest_unconfigure(config): "tests/groupby/test_raises.py::test_groupby_raises_category[by8-True-fillna-method]", "tests/groupby/test_raises.py::test_groupby_raises_category[by9-False-fillna-method]", "tests/groupby/test_raises.py::test_groupby_raises_category[by9-True-fillna-method]", - "tests/groupby/test_raises.py::test_groupby_raises_timedelta[var]", "tests/groupby/test_reductions.py::test_empty_categorical[True]", "tests/groupby/test_reductions.py::test_first_last_skipna[Float32-False-False-first]", "tests/groupby/test_reductions.py::test_first_last_skipna[Float32-False-False-last]", From ada4351653e0d2acfb0e50f95b05ad9dfb9f926a Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:29:17 -0400 Subject: [PATCH 182/366] Support `over` expression (window mapping) in cudf-polars (#19684) - Closes #16227 - Contributes to https://github.com/rapidsai/cudf/issues/18633 Note this PR implements grouped window aggregations (not really rolling). I think we can keep the name `GroupedRollingWindow` though because polars can support expressions like `rolling(...).over(..)` in the future. 
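For illustration, this is the shape of query that now runs on the GPU (a hypothetical example, not taken from the test suite):

```python
import polars as pl

df = pl.LazyFrame({"g": [1, 1, 2], "x": [1.0, 2.0, 3.0]})
# Aggregate once per group, then broadcast the per-group result back to
# every row of that group: both rows in group 1 get 3.0, group 2 gets 3.0.
q = df.select(pl.col("x").sum().over("g"))
print(q.collect(engine=pl.GPUEngine()))
```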
- Also contributes to #19200 and helps unblock PDS-DS queries 12, 20, 47, 51, 57, 89, 98 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19684 --- .../cudf_polars/dsl/expressions/rolling.py | 172 +++++++++++++++++- .../cudf_polars/cudf_polars/dsl/translate.py | 32 +++- .../cudf_polars/dsl/utils/aggregations.py | 16 +- .../cudf_polars/experimental/parallel.py | 25 ++- .../cudf_polars/experimental/utils.py | 8 +- .../cudf_polars/cudf_polars/testing/plugin.py | 1 - .../tests/experimental/test_rolling.py | 21 +++ .../tests/expressions/test_rolling.py | 77 +++++++- .../tests/test_window_functions.py | 4 +- 9 files changed, 325 insertions(+), 31 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py index a9e0eb0c0d8..f89d0e4133f 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -8,15 +8,19 @@ from typing import TYPE_CHECKING, Any +import polars as pl + import pylibcudf as plc -from cudf_polars.containers import Column +from cudf_polars.containers import Column, DataFrame, DataType from cudf_polars.dsl import expr from cudf_polars.dsl.expressions.base import ExecutionContext, Expr +from cudf_polars.dsl.utils.reshape import broadcast from cudf_polars.dsl.utils.windows import offsets_to_windows, range_window_bounds if TYPE_CHECKING: - from cudf_polars.containers import DataFrame, DataType + from collections.abc import Sequence + from cudf_polars.typing import ClosedInterval, Duration __all__ = ["GroupedRollingWindow", "RollingWindow", "to_request"] @@ -152,12 +156,166 @@ def do_evaluate( # noqa: D102 class GroupedRollingWindow(Expr): - __slots__ = ("options",) - _non_child = ("dtype", "options") + """ + Compute a window ``.over(...)`` aggregation and broadcast to rows. + + Notes + ----- + - This expression node currently implements **grouped window mapping** + (aggregate once per group, then broadcast back), not rolling windows. + - It can be extended later to support `rolling(...).over(...)` + when polars supports that expression. + """ + + __slots__ = ("by_count", "named_aggs", "options", "post") + _non_child = ("dtype", "options", "named_aggs", "post", "by_count") - def __init__(self, dtype: DataType, options: Any, agg: Expr, *by: Expr) -> None: + def __init__( + self, + dtype: DataType, + options: Any, + named_aggs: Sequence[expr.NamedExpr], + post: expr.NamedExpr, + *by: Expr, + ) -> None: self.dtype = dtype self.options = options - self.children = (agg, *by) + self.named_aggs = tuple(named_aggs) + self.post = post self.is_pointwise = False - raise NotImplementedError("Grouped rolling window not implemented") + + unsupported = [ + type(named_expr.value).__name__ + for named_expr in self.named_aggs + if not isinstance(named_expr.value, (expr.Len, expr.Agg)) + ] + if unsupported: + kinds = ", ".join(sorted(set(unsupported))) + raise NotImplementedError( + f"Unsupported over(...) 
expression: {kinds}"
+            )
+
+        # Ensures every partition-by is an Expr
+        # Fixes over(1) cases with the streaming
+        # executor and a small blocksize
+        by_expr = [
+            (b if isinstance(b, Expr) else expr.Literal(DataType(pl.Int64()), b))
+            for b in by
+        ]
+
+        # Expose agg dependencies as children so the streaming
+        # executor retains required source columns
+        child_deps = [
+            v.children[0]
+            for ne in self.named_aggs
+            for v in (ne.value,)
+            if isinstance(v, expr.Agg)
+        ]
+        self.by_count = len(by_expr)
+        self.children = tuple(by_expr) + tuple(child_deps)
+
+    def do_evaluate(  # noqa: D102
+        self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME
+    ) -> Column:
+        if context != ExecutionContext.FRAME:
+            raise RuntimeError(
+                "Window mapping (.over) can only be evaluated at the frame level"
+            )  # pragma: no cover; translation raises first
+
+        by_exprs = self.children[: self.by_count]
+        by_cols = list(
+            broadcast(
+                *(b.evaluate(df, context=ExecutionContext.FRAME) for b in by_exprs),
+                target_length=df.num_rows,
+            )
+        )
+        by_tbl = plc.Table([c.obj for c in by_cols])
+
+        sorted_flag = (
+            plc.types.Sorted.YES
+            if all(k.is_sorted for k in by_cols)
+            else plc.types.Sorted.NO
+        )
+        grouper = plc.groupby.GroupBy(
+            by_tbl,
+            null_handling=plc.types.NullPolicy.INCLUDE,
+            keys_are_sorted=sorted_flag,
+            column_order=[k.order for k in by_cols],
+            null_precedence=[k.null_order for k in by_cols],
+        )
+
+        gb_requests: list[plc.groupby.GroupByRequest] = []
+        out_names: list[str] = []
+        out_dtypes: list[DataType] = []
+        for ne in self.named_aggs:
+            val = ne.value
+            out_names.append(ne.name)
+            out_dtypes.append(val.dtype)
+
+            if isinstance(val, expr.Len):
+                # Count rows per group via sum(1).
+                ones = plc.Column.from_scalar(
+                    plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT8)), df.num_rows
+                )
+                gb_requests.append(
+                    plc.groupby.GroupByRequest(ones, [plc.aggregation.sum()])
+                )
+            elif isinstance(val, expr.Agg):
+                (child,) = (
+                    val.children if val.name != "quantile" else (val.children[0],)
+                )
+                col = child.evaluate(df, context=ExecutionContext.FRAME).obj
+                gb_requests.append(plc.groupby.GroupByRequest(col, [val.agg_request]))
+
+        group_keys_tbl, value_tables = grouper.aggregate(gb_requests)
+        out_cols = (t.columns()[0] for t in value_tables)
+
+        # Build gather maps to broadcast per-group results to all rows.
+        # Also left-join input keys to group-keys so every input row appears exactly once.
+        lg, rg = plc.join.left_join(
+            by_tbl, group_keys_tbl, plc.types.NullEquality.EQUAL
+        )
+
+        # Reorder the gather maps to preserve left/input order
+        left_rows, right_rows = by_tbl.num_rows(), group_keys_tbl.num_rows()
+        init = plc.Scalar.from_py(0, plc.types.SIZE_TYPE)
+        step = plc.Scalar.from_py(1, plc.types.SIZE_TYPE)
+        left_order = plc.copying.gather(
+            plc.Table([plc.filling.sequence(left_rows, init, step)]),
+            lg,
+            plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+        )
+        right_order = plc.copying.gather(
+            plc.Table([plc.filling.sequence(right_rows, init, step)]),
+            rg,
+            plc.copying.OutOfBoundsPolicy.NULLIFY,
+        )
+        # Sort both maps by (left_order, right_order), then use the reordered right map
+        # to gather group aggregates in the original row order.
+        _, rg = plc.sorting.stable_sort_by_key(
+            plc.Table([lg, rg]),
+            plc.Table([*left_order.columns(), *right_order.columns()]),
+            [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING],
+            [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER],
+        ).columns()
+
+        # Broadcast each aggregated result back to row-shape using the right map.
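+        # Every input key should appear in ``group_keys_tbl`` (the groups were
+        # built from this same key table with nulls included), so the NULLIFY
+        # policy below is defensive and is not expected to introduce nulls.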
+ broadcasted_cols = [ + Column( + plc.copying.gather( + plc.Table([col]), + rg, + plc.copying.OutOfBoundsPolicy.NULLIFY, + ).columns()[0], + name=named_expr.name, + dtype=dtype, + ) + for named_expr, dtype, col in zip( + self.named_aggs, out_dtypes, out_cols, strict=True + ) + ] + + # Create a temporary DataFrame with the broadcasted columns named by their + # placeholder names from agg decomposition, then evaluate the post-expression. + df = DataFrame(broadcasted_cols) + return self.post.value.evaluate(df, context=ExecutionContext.FRAME) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 27aa6bfbf4a..2e0401d006c 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -748,10 +748,38 @@ def _( return replace([named_post_agg.value], replacements)[0] elif isinstance(node.options, pl_expr.WindowMapping): # pl.col("a").over(...) + agg = translator.translate_expr(n=node.function, schema=schema) + name_gen = unique_names(schema) + aggs, post = decompose_single_agg( + expr.NamedExpr(next(name_gen), agg), + name_gen, + is_top=True, + # Follows GROUPBY semantics + context=ExecutionContext.GROUPBY, + ) + + mapping = node.options.kind + has_order_by = node.order_by is not None + descending = bool(getattr(node, "order_by_descending", False)) + nulls_last = bool(getattr(node, "order_by_nulls_last", False)) + + if has_order_by or descending or nulls_last: + raise NotImplementedError( + f"over(order_by) not supported yet: " + f"{node.order_by=}, {descending=}, {nulls_last=}" + ) + + if mapping != "groups_to_rows": + raise NotImplementedError( + f"over(mapping_strategy) not supported yet: {mapping=}; " + f"expected 'groups_to_rows'" + ) + return expr.GroupedRollingWindow( dtype, - node.options, - translator.translate_expr(n=node.function, schema=schema), + (mapping, has_order_by, descending, nulls_last), + [agg for agg, _ in aggs], + post, *(translator.translate_expr(n=n, schema=schema) for n in node.partition_by), ) assert_never(node.options) diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py index 00145e1d533..83674fcc61d 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py @@ -176,6 +176,14 @@ def decompose_single_agg( [(named_expr.reconstruct(agg.reconstruct([child])), True)], named_expr.reconstruct(expr.Col(agg.dtype, name)), ) + elif agg.name in ("mean", "median", "quantile", "std", "var"): + # libcudf promotes these to float64; but polars + # keeps Float32, so cast back in post-processing. 
+ named = expr.NamedExpr(name, agg) + post_col: expr.Expr = expr.Col(DataType(pl.Float64()), name) + if agg.dtype.plc.id() == plc.TypeId.FLOAT32: + post_col = expr.Cast(agg.dtype, post_col) + return [(named, True)], expr.NamedExpr(name, post_col) elif agg.name == "sum": col = ( expr.Cast(agg.dtype, expr.Col(DataType(pl.datatypes.Int64()), name)) @@ -222,14 +230,6 @@ def decompose_single_agg( return [(named_expr, True), (win_len, True)], expr.NamedExpr( name, post_ternary_expr ) - elif agg.name == "mean": - post_agg_col: expr.Expr = expr.Col( - DataType(pl.Float64), name - ) # libcudf promotes to float64 - if agg.dtype.plc.id() == plc.TypeId.FLOAT32: - # Cast back to float32 to match Polars - post_agg_col = expr.Cast(agg.dtype, post_agg_col) - return [(named_expr, True)], named_expr.reconstruct(post_agg_col) else: return [(named_expr, True)], named_expr.reconstruct( expr.Col(agg.dtype, name) diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index bb0aff53f94..f470168bf71 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -35,7 +35,7 @@ ) from cudf_polars.experimental.io import _clear_source_info_cache from cudf_polars.experimental.repartition import Repartition -from cudf_polars.experimental.utils import _concat, _lower_ir_fallback +from cudf_polars.experimental.utils import _concat, _contains_over, _lower_ir_fallback if TYPE_CHECKING: from collections.abc import MutableMapping @@ -346,11 +346,32 @@ def _lower_ir_pwise( _lower_ir_pwise_preserve = partial(_lower_ir_pwise, preserve_partitioning=True) lower_ir_node.register(Projection, _lower_ir_pwise_preserve) -lower_ir_node.register(Filter, _lower_ir_pwise_preserve) lower_ir_node.register(Cache, _lower_ir_pwise) lower_ir_node.register(HConcat, _lower_ir_pwise) +@lower_ir_node.register(Filter) +def _( + ir: Filter, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + child, partition_info = rec(ir.children[0]) + + if partition_info[child].count > 1 and _contains_over([ir.mask.value]): + # mask contains .over(...), collapse to single partition + return _lower_ir_fallback( + ir.reconstruct([child]), + rec, + msg=( + "over(...) inside filter is not supported for multiple partitions; " + "falling back to in-memory evaluation." + ), + ) + + new_node = ir.reconstruct([child]) + partition_info[new_node] = partition_info[child] + return new_node, partition_info + + @lower_ir_node.register(Slice) def _( ir: Slice, rec: LowerIRTransformer diff --git a/python/cudf_polars/cudf_polars/experimental/utils.py b/python/cudf_polars/cudf_polars/experimental/utils.py index 2ea74eac82e..bdebc02fac7 100644 --- a/python/cudf_polars/cudf_polars/experimental/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/utils.py @@ -10,8 +10,9 @@ from itertools import chain from typing import TYPE_CHECKING -from cudf_polars.dsl.expr import Col +from cudf_polars.dsl.expr import Col, Expr, GroupedRollingWindow from cudf_polars.dsl.ir import Union +from cudf_polars.dsl.traversal import traversal from cudf_polars.experimental.base import PartitionInfo if TYPE_CHECKING: @@ -110,3 +111,8 @@ def _get_unique_fractions( for c, f in user_unique_fractions.items() if c in column_names } + + +def _contains_over(exprs: Sequence[Expr]) -> bool: + """Return True if any expression in 'exprs' contains an over(...) (ie. 
GroupedRollingWindow).""" + return any(isinstance(e, GroupedRollingWindow) for e in traversal(exprs)) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 14956d2cfbc..cd3bbe01020 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -130,7 +130,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", - "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", diff --git a/python/cudf_polars/tests/experimental/test_rolling.py b/python/cudf_polars/tests/experimental/test_rolling.py index 3f7046376c1..0748a3ce52e 100644 --- a/python/cudf_polars/tests/experimental/test_rolling.py +++ b/python/cudf_polars/tests/experimental/test_rolling.py @@ -34,3 +34,24 @@ def test_rolling_datetime(): ) q = df.with_columns(pl.sum("a").rolling(index_column="dt", period="2d")) assert_gpu_result_equal(q, engine=engine) + + +def test_over_in_filter_unsupported() -> None: + q = pl.concat( + [ + pl.LazyFrame({"k": ["x", "y"], "v": [3, 2]}), + pl.LazyFrame({"k": ["x", "y"], "v": [5, 7]}), + ] + ).filter(pl.len().over("k") == 2) + + engine = pl.GPUEngine( + raise_on_fail=True, + executor="streaming", + executor_options={ + "max_rows_per_partition": 1, + "scheduler": DEFAULT_SCHEDULER, + "fallback_mode": StreamingFallbackMode.SILENT, + }, + ) + + assert_gpu_result_equal(q, engine=engine) diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py index 373e4903cef..6cf3c956358 100644 --- a/python/cudf_polars/tests/expressions/test_rolling.py +++ b/python/cudf_polars/tests/expressions/test_rolling.py @@ -14,6 +14,19 @@ from cudf_polars.utils.versions import POLARS_VERSION_LT_130 +@pytest.fixture +def df(): + return pl.LazyFrame( + { + "g": [1, 1, 2, 2, 2], + "x": [1, 2, 3, 4, 5], + "x2": [1, 100, 3, 4, 50], + "g2": ["a", "a", "b", "a", "a"], + "g_null": [1, None, 1, None, 2], + } + ) + + @pytest.mark.parametrize("time_unit", ["ns", "us", "ms"]) def test_rolling_datetime(time_unit): dates = [ @@ -124,14 +137,6 @@ def test_invalid_duration_spec_raises_in_translation(): assert_ir_translation_raises(q, pl.exceptions.InvalidOperationError) -def test_grouped_rolling(): - df = pl.LazyFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 1, 3, 1, 2]}) - - q = df.select(pl.col("a").min().over("b")) - - assert_ir_translation_raises(q, NotImplementedError) - - def 
test_rolling_inside_groupby_raises(): df = pl.LazyFrame( {"keys": [1, 1, 1, 2], "orderby": [1, 2, 4, 2], "values": [1, 2, 3, 4]} @@ -156,3 +161,59 @@ def test_rolling_sum_all_null_window_returns_null(): ) # Expected: [null, null, 5, 5, 5, 1] assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [ + pl.col("x").sum().over("g"), + pl.len().over("g"), + pl.col("x").cast(pl.Float64).mean().round(1).over("g"), + pl.col("x2").quantile(0.5, interpolation="lower").over("g"), + pl.col("x").sum().over("g", "g2"), + pl.col("x").sum().over(pl.col("g") % 2), + pl.col("x").sum().over("g_null"), + pl.col("x").cast(pl.Float32).mean().over("g"), + pl.col("x").sum().over(pl.lit(1)), + ], + ids=[ + "sum_broadcast", + "len_broadcast", + "mean_round", + "quantile_lower", + "multi_key_partition", + "expr_partition", + "null_keys", + "mean_float32_promotion", + "literal_partition", + ], +) +def test_over_group_various(df, expr): + q = df.select(expr) + assert_gpu_result_equal(q) + + +def test_window_over_group_sum_all_null_group_is_zero(df): + q = df.with_columns( + pl.when(pl.col("g") == 1) + .then(pl.lit(None, dtype=pl.Int64)) + .otherwise(pl.col("x")) + .alias("null") + ).select(s=pl.col("null").sum().over("g")) + assert_gpu_result_equal(q) + + +def test_over_with_order_by_unsupported(df): + q = df.select(pl.col("x").sum().over("g", order_by="x")) + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("strategy", ["explode", "join"], ids=["explode", "join"]) +def test_over_with_mapping_strategy_unsupported(df, strategy): + q = df.select(pl.col("x").sum().over("g", mapping_strategy=strategy)) + assert_ir_translation_raises(q, NotImplementedError) + + +def test_over_boolean_function_unsupported(df): + q = df.select(pl.col("x").not_().over("g")) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_window_functions.py b/python/cudf_polars/tests/test_window_functions.py index 4bf1e482301..8ddaae4bd75 100644 --- a/python/cudf_polars/tests/test_window_functions.py +++ b/python/cudf_polars/tests/test_window_functions.py @@ -84,9 +84,9 @@ def test_over(df: pl.LazyFrame, partition_by, agg_expr): result_name = f"{agg_expr!s}_over_{partition_by!s}" window_expr = window_expr.alias(result_name) - query = df.with_columns(window_expr) + q = df.with_columns(window_expr) - assert_ir_translation_raises(query, NotImplementedError) + assert_gpu_result_equal(q) def test_over_with_sort(df: pl.LazyFrame): From c09beacd83227e390b53b45b14abf2db55aee8a4 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:45:12 -0400 Subject: [PATCH 183/366] Support ternary expression inside groupby/rolling context (#19242) Closes #18841 and Contributes to #19200. Helps unblock TPC-DS queries 12, 49, 76, 78. 
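As a sketch of what this enables (hypothetical data, mirroring the new test cases):

```python
import polars as pl

df = pl.LazyFrame({"k": [1, 1, 2], "v": [-1.0, 2.0, 3.0]})
# A ternary expression inside a groupby aggregation: sum only the
# positive values within each group.
q = df.group_by("k").agg(
    pl.when(pl.col("v") > 0).then(pl.col("v")).otherwise(None).sum()
)
print(q.collect(engine=pl.GPUEngine()))
```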
- Depends on #19680
- Depends on #19689

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Tom Augspurger (https://github.com/TomAugspurger)

URL: https://github.com/rapidsai/cudf/pull/19242
---
 .../cudf_polars/dsl/utils/aggregations.py | 44 +++++++++++++++++-
 python/cudf_polars/tests/test_groupby.py  | 46 +++++++++++++++++--
 python/cudf_polars/tests/test_rolling.py  | 46 +++++++++++++++++++
 3 files changed, 131 insertions(+), 5 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py
index 83674fcc61d..8e242c55a47 100644
--- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py
+++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py
@@ -235,7 +235,49 @@ def decompose_single_agg(
             expr.Col(agg.dtype, name)
         )
     if isinstance(agg, expr.Ternary):
-        raise NotImplementedError("Ternary inside groupby")
+        when, then, otherwise = agg.children
+
+        when_aggs, when_post = decompose_single_agg(
+            expr.NamedExpr(next(name_generator), when),
+            name_generator,
+            is_top=False,
+            context=context,
+        )
+        then_aggs, then_post = decompose_single_agg(
+            expr.NamedExpr(next(name_generator), then),
+            name_generator,
+            is_top=False,
+            context=context,
+        )
+        otherwise_aggs, otherwise_post = decompose_single_agg(
+            expr.NamedExpr(next(name_generator), otherwise),
+            name_generator,
+            is_top=False,
+            context=context,
+        )
+
+        when_has = any(h for _, h in when_aggs)
+        then_has = any(h for _, h in then_aggs)
+        otherwise_has = any(h for _, h in otherwise_aggs)
+
+        if is_top and not (when_has or then_has or otherwise_has):
+            raise NotImplementedError(
+                "Broadcasted ternary with list output in groupby is not supported"
+            )
+
+        for post, has in (
+            (when_post, when_has),
+            (then_post, then_has),
+            (otherwise_post, otherwise_has),
+        ):
+            if is_top and not has and not isinstance(post.value, expr.Literal):
+                raise NotImplementedError(
+                    "Broadcasting aggregated expressions in groupby/rolling"
+                )
+
+        return [*when_aggs, *then_aggs, *otherwise_aggs], named_expr.reconstruct(
+            agg.reconstruct([when_post.value, then_post.value, otherwise_post.value])
+        )
     if not agg.is_pointwise and isinstance(agg, expr.BooleanFunction):
         raise NotImplementedError(
             f"Non pointwise boolean function {agg.name!r} not supported in groupby or rolling context"
         )
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index bf97ec41cde..dcbf2d3eec5 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -152,12 +152,17 @@ def test_groupby_len(df, keys):
     "expr",
     [
         (pl.col("int").max() + pl.col("float").min()).max(),
-        pl.when(pl.col("int") < pl.lit(2))
-        .then(pl.col("float").sum())
-        .otherwise(pl.lit(-2)),
+        (
+            pl.when((pl.col("float") - pl.col("float").mean()) > 0)
+            .then(pl.col("float"))
+            .otherwise(None)
+            .sum()
+        ),
+        (pl.when(pl.col("int") > 5).then(pl.col("float")).otherwise(pl.lit(0.0))),
+        (pl.when(pl.col("int").min() >= 3).then(pl.col("float"))),
     ],
 )
-def test_groupby_unsupported(df, expr):
+def test_groupby_unsupported(df: pl.LazyFrame, expr: pl.Expr) -> None:
     q = df.group_by("key1").agg(expr)

     assert_ir_translation_raises(q, NotImplementedError)

@@ -346,3 +351,36 @@ def test_groupby_aggs_keep_unsupported_as_null(df: pl.LazyFrame, agg_expr) -> No
     lf = df.filter(pl.col("datetime") == date(2004, 12, 1))
     q = lf.group_by("datetime").agg(agg_expr)
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.when(pl.col("int") > 
5).then(pl.col("float")).otherwise(None).sum(), + pl.when(pl.col("float").count() > 0) + .then(pl.col("float").sum()) + .otherwise(None), + ( + pl.when(pl.col("float").min() < pl.col("float").max()) + .then(pl.col("float").max() - pl.col("float").min()) + .otherwise(pl.lit(0.0)) + ), + ( + pl.when(pl.col("int").count() > 0) + .then( + pl.col("int").cast(pl.Float64).sum() + / pl.col("int").count().cast(pl.Float64) + ) + .otherwise(None) + ), + ], + ids=[ + "pre_pointwise_then_sum", + "post_over_aggs", + "post_multiple_aggs_range", + "post_manually_compute_mean", + ], +) +def test_groupby_ternary_supported(df: pl.LazyFrame, expr: pl.Expr) -> None: + q = df.group_by("key1").agg(expr) + assert_gpu_result_equal(q, check_row_order=False) diff --git a/python/cudf_polars/tests/test_rolling.py b/python/cudf_polars/tests/test_rolling.py index 89a4dc3b083..9dbe75e14ba 100644 --- a/python/cudf_polars/tests/test_rolling.py +++ b/python/cudf_polars/tests/test_rolling.py @@ -226,3 +226,49 @@ def test_rolling_null_count(df): nc=pl.col("null").null_count() ) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [ + pl.when(pl.col("values") > 5).then(pl.col("floats")).otherwise(None).sum(), + pl.when(pl.col("values").count() > 0) + .then(pl.col("values").sum()) + .otherwise(None), + pl.when(pl.col("values").min() < pl.col("values").max()) + .then(pl.col("values").max() - pl.col("values").min()) + .otherwise(pl.lit(0)), + pl.when(pl.col("values").count() > 0) + .then( + pl.col("values").cast(pl.Float64).sum() + / pl.col("values").count().cast(pl.Float64) + ) + .otherwise(None), + ], + ids=[ + "pre_pointwise_then_sum", + "post_over_aggs", + "post_multiple_aggs_range", + "post_manually_compute_mean", + ], +) +def test_rolling_ternary_supported(df, expr): + q = df.rolling("dt", period="48h", closed="both").agg(expr.alias("out")) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [ + pl.when(pl.col("values") > 3) + .then(pl.col("values")) + .otherwise(pl.lit(None, dtype=pl.Int64)), + pl.when((pl.col("floats") - pl.col("floats").mean()) > 0) + .then(pl.col("floats")) + .otherwise(None) + .sum(), + ], +) +def test_rolling_ternary_unsupported(df, expr): + q = df.rolling("dt", period="48h", closed="both").agg(expr.alias("out")) + assert_ir_translation_raises(q, NotImplementedError) From 0ddd460fe1ffc4a8578c1842e53901691628713e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:33:27 -0400 Subject: [PATCH 184/366] Fix window var() test failures from float rounding (#19761) Adds relative and absolute tolerances to the window var tests. The failure was discovered after I merged #19684. I'm not sure exactly why CI didn't fail before. 
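The mismatch is ordinary floating-point non-associativity: a parallel GPU reduction sums in a different order than the CPU. A minimal illustration (not from the test suite):

```python
a, b, c = 0.1, 0.2, 0.3
print((a + b) + c)  # 0.6000000000000001
print(a + (b + c))  # 0.6
# Summation order changes the last bits of the result, which is exactly
# the kind of drift the new rtol/atol of 1e-15 is meant to absorb.
print((a + b) + c == a + (b + c))  # False
```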
Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19761
---
 python/cudf_polars/tests/test_window_functions.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/cudf_polars/tests/test_window_functions.py b/python/cudf_polars/tests/test_window_functions.py
index 8ddaae4bd75..e1ad4719402 100644
--- a/python/cudf_polars/tests/test_window_functions.py
+++ b/python/cudf_polars/tests/test_window_functions.py
@@ -86,7 +86,12 @@ def test_over(df: pl.LazyFrame, partition_by, agg_expr):

     q = df.with_columns(window_expr)

-    assert_gpu_result_equal(q)
+    if "var" in str(agg_expr):
+        # CPU: 1.333333333333333
+        # GPU: 1.333333333333334
+        # The results differ only in the last digit, so compare with a tolerance.
+        assert_gpu_result_equal(q, check_exact=False, rtol=1e-15, atol=1e-15)
+    else:
+        assert_gpu_result_equal(q)


 def test_over_with_sort(df: pl.LazyFrame):

From 8c09ceb4866624369d5d84c8c6d6c55fced83a3d Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 21 Aug 2025 02:54:50 -0700
Subject: [PATCH 185/366] Use `is_compressed` field from Parquet V2 data page
 headers to determine if they are compressed (#19755)

Closes #19756

This PR decodes the `is_compressed` field from each Parquet data page's V2
header to determine whether it is actually compressed, instead of relying on
the column chunk codec information and assuming that all pages in a column
chunk are compressed.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/19755
---
 cpp/src/io/parquet/page_hdr.cu                 | 23 ++++++++++++++++++-
 cpp/src/io/parquet/parquet_gpu.hpp             |  1 +
 .../io/parquet/reader_impl_chunking_utils.cu   | 14 +++++++----
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index abcdc31aa8b..2279f1cf08f 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -234,6 +234,25 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page,
   return decode_kernel_mask::GENERAL;
 }

+/**
+ * @brief Functor to set value to bool read from byte stream
+ *
+ * @return True if field type is not bool
+ */
+struct ParquetFieldBool {
+  int field;
+  bool& val;
+
+  __device__ ParquetFieldBool(int f, bool& v) : field(f), val(v) {}
+
+  inline __device__ bool operator()(byte_stream_s* bs, int field_type)
+  {
+    val = static_cast<FieldType>(field_type) == FieldType::BOOLEAN_TRUE;
+    return not(static_cast<FieldType>(field_type) == FieldType::BOOLEAN_TRUE or
+               static_cast<FieldType>(field_type) == FieldType::BOOLEAN_FALSE);
+  }
+};
+
 /**
  * @brief Functor to set value to 32 bit integer read from byte stream
  *
@@ -391,7 +410,8 @@ struct gpuParseDataPageHeaderV2 {
       ParquetFieldInt32(3, bs->page.num_rows),
       ParquetFieldEnum<Encoding>(4, bs->page.encoding),
       ParquetFieldInt32(5, bs->page.lvl_bytes[level_type::DEFINITION]),
-      ParquetFieldInt32(6, bs->page.lvl_bytes[level_type::REPETITION]));
+      ParquetFieldInt32(6, bs->page.lvl_bytes[level_type::REPETITION]),
+      ParquetFieldBool(7, bs->page.is_compressed));
     return parse_header(op, bs);
   }
 };
@@ -470,6 +490,7 @@ void __launch_bounds__(decode_page_headers_block_size)
       bs->page.temp_string_size = 0;
       bs->page.temp_string_buf  = nullptr;
      bs->page.kernel_mask      = decode_kernel_mask::NONE;
+      bs->page.is_compressed    = true;
     }
     num_values = bs->ck.num_values;
     page_info  = chunk_pages ? chunk_pages[chunk].pages : nullptr;
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 81303df097b..6ee49ff1c65 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -340,6 +340,7 @@ struct PageInfo {
   Encoding encoding;  // Encoding for data or dictionary page
   Encoding definition_level_encoding;  // Encoding used for definition levels (data page)
   Encoding repetition_level_encoding;  // Encoding used for repetition levels (data page)
+  bool is_compressed;  // Whether the page is compressed (V2 header)

   // for nested types, we run a preprocess step in order to determine output
   // column sizes. Because of this, we can jump directly to the position in the
diff --git a/cpp/src/io/parquet/reader_impl_chunking_utils.cu b/cpp/src/io/parquet/reader_impl_chunking_utils.cu
index 38fd132acf7..978ccd9dde5 100644
--- a/cpp/src/io/parquet/reader_impl_chunking_utils.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking_utils.cu
@@ -129,17 +129,20 @@ void codec_stats::add_pages(host_span chunks, host_span page_mask)
 {
   // Create a page mask iterator that defaults to true if the page_mask is empty
   auto page_mask_iter =
     page_mask.empty() ? thrust::make_constant_iterator(true) : page_mask.begin();

   // Zip iterator for iterating over pages and the page mask
   auto zip_iter = thrust::make_zip_iterator(pages.begin(), page_mask_iter);

   std::for_each(zip_iter, zip_iter + pages.size(), [&](auto const& item) {
     auto& [page, is_page_needed] = item;
+    // If this is a V2 page, use the `is_compressed` field to determine if it's compressed.
+    // For V1 pages, it's always compressed if the chunk.codec is specified.
+    auto const is_page_compressed = (page.flags & PAGEINFO_FLAGS_V2) ? page.is_compressed : true;
     if (is_page_needed && chunks[page.chunk_idx].codec == compression_type &&
         (page.flags & cudf::io::parquet::detail::PAGEINFO_FLAGS_DICTIONARY) ==
-          (selection == page_selection::DICT_PAGES)) {
+          (selection == page_selection::DICT_PAGES) and
+        is_page_compressed) {
       ++num_pages;
       total_decomp_size += page.uncompressed_page_size;
       max_decompressed_size = std::max(max_decompressed_size, page.uncompressed_page_size);
@@ -540,8 +543,11 @@ std::vector compute_page_splits_by_row(device_span(decomp_data) + decomp_offset;
   // offset will only be non-zero for V2 pages
   auto const offset =

From 36cd03697e8f71db3508d5e9dda6803b7a034f30 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 21 Aug 2025 07:31:42 -0400
Subject: [PATCH 186/366] Add count aggregation support to cudf::reduce
 (#19734)

A simple aggregation type for `cudf::reduce` that just returns `input.size()`
or `input.size() - input.null_count()`, depending on whether nulls are
included.

Closes #13756
Closes #19700

Also found that `nunique.cu` could be renamed to `nunique.cpp` since it
contains no device code.
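A sketch of the new capability from Python via `pylibcudf` (illustrative; assumes a build that includes this change):

```python
import pyarrow as pa
import pylibcudf as plc

col = plc.Column.from_arrow(pa.array([1, None, 3], type=pa.int32()))

# COUNT_VALID excludes nulls -> 2
valid = plc.reduce.reduce(
    col,
    plc.aggregation.count(plc.types.NullPolicy.EXCLUDE),
    plc.DataType(plc.TypeId.INT32),
)
# COUNT_ALL includes nulls -> 3
total = plc.reduce.reduce(
    col,
    plc.aggregation.count(plc.types.NullPolicy.INCLUDE),
    plc.DataType(plc.TypeId.INT32),
)
```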
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19734 --- cpp/CMakeLists.txt | 3 +- .../cudf/detail/aggregation/aggregation.hpp | 3 +- .../reduction/detail/reduction_functions.hpp | 18 ++++++ cpp/src/aggregation/aggregation.cpp | 2 + cpp/src/reductions/count.cpp | 63 +++++++++++++++++++ .../reductions/{nunique.cu => nunique.cpp} | 0 cpp/src/reductions/reductions.cpp | 12 ++++ cpp/tests/reductions/reduction_tests.cpp | 41 ++++++++++++ 8 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 cpp/src/reductions/count.cpp rename cpp/src/reductions/{nunique.cu => nunique.cpp} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f59b9d6bf7e..5b4c34ccd09 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -631,13 +631,14 @@ add_library( src/reductions/any.cu src/reductions/bitwise.cu src/reductions/collect_ops.cu + src/reductions/count.cpp src/reductions/histogram.cu src/reductions/max.cu src/reductions/mean.cu src/reductions/min.cu src/reductions/minmax.cu src/reductions/nth_element.cu - src/reductions/nunique.cu + src/reductions/nunique.cpp src/reductions/product.cu src/reductions/quantile.cu src/reductions/reductions.cpp diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 32f4d8c572e..0d3efa425d9 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -279,7 +279,8 @@ class max_aggregation final : public rolling_aggregation, */ class count_aggregation final : public rolling_aggregation, public groupby_aggregation, - public groupby_scan_aggregation { + public groupby_scan_aggregation, + public reduce_aggregation { public: count_aggregation(aggregation::Kind kind) : aggregation(kind) {} diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp index d7c49acca32..af43e0697fa 100644 --- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp @@ -427,5 +427,23 @@ std::unique_ptr nunique(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @brief Returns the number of elements in the input column + * + * Returns `col.size()` or `col.size() - col.null_count()` depending on `null_handling` + * + * @param col Input column to compute the number of elements + * @param null_handling Indicates if null values will be included in the count + * @param output_type Data type of return type + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @return Number of elements as scalar of type `output_type` + */ +std::unique_ptr count(column_view const& col, + null_policy null_handling, + data_type const output_type, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace reduction::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 39355983ed4..d206989b7a1 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -554,6 +554,8 @@ template CUDF_EXPORT std::unique_ptr 
make_count_aggregation(null_policy null_handling); template CUDF_EXPORT std::unique_ptr make_count_aggregation(null_policy null_handling); +template CUDF_EXPORT std::unique_ptr make_count_aggregation( + null_policy null_handling); /// Factory to create a HISTOGRAM aggregation template diff --git a/cpp/src/reductions/count.cpp b/cpp/src/reductions/count.cpp new file mode 100644 index 00000000000..e02d4bb9f14 --- /dev/null +++ b/cpp/src/reductions/count.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace reduction { +namespace detail { +namespace { + +struct count_scalar_fn { + template + requires(cudf::is_numeric_not_bool()) + std::unique_ptr operator()(size_type count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + auto const value = static_cast(count); + return cudf::make_fixed_width_scalar(value, stream, mr); + } + + template + requires(not cudf::is_numeric_not_bool()) + std::unique_ptr operator()(size_type, + rmm::cuda_stream_view, + rmm::device_async_resource_ref) const + { + CUDF_FAIL("COUNT is not supported for boolean or non-numeric types", std::invalid_argument); + } +}; +} // namespace + +std::unique_ptr count(column_view const& col, + cudf::null_policy null_handling, + cudf::data_type const output_dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const count = col.size() - (null_handling == null_policy::EXCLUDE ? col.null_count() : 0); + return cudf::type_dispatcher(output_dtype, count_scalar_fn{}, count, stream, mr); +} +} // namespace detail +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/nunique.cu b/cpp/src/reductions/nunique.cpp similarity index 100% rename from cpp/src/reductions/nunique.cu rename to cpp/src/reductions/nunique.cpp diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 2c1c582091b..97aa0b1eff1 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -85,6 +85,12 @@ std::unique_ptr reduce_aggregate_impl( std::invalid_argument); return quantile(col, qagg._quantiles.front(), qagg._interpolation, output_dtype, stream, mr); } + case aggregation::COUNT_ALL: + case aggregation::COUNT_VALID: { + auto null_handling = + agg.kind == aggregation::COUNT_VALID ? null_policy::EXCLUDE : null_policy::INCLUDE; + return count(col, null_handling, output_dtype, stream, mr); + } case aggregation::NUNIQUE: { auto nunique_agg = static_cast(agg); return nunique(col, nunique_agg._null_handling, output_dtype, stream, mr); @@ -177,6 +183,12 @@ std::unique_ptr reduce_no_data_impl(reduce_aggregation const& agg, case aggregation::ALL: { return std::make_unique>(agg.kind == aggregation::ALL, true, stream, mr); } + case aggregation::COUNT_ALL: + case aggregation::COUNT_VALID: { + auto null_handling = + agg.kind == aggregation::COUNT_VALID ? 
null_policy::EXCLUDE : null_policy::INCLUDE; + return count(col, null_handling, output_dtype, stream, mr); + } case aggregation::NUNIQUE: { auto nunique_agg = static_cast(agg); auto valid = !col.is_empty() && (nunique_agg._null_handling == cudf::null_policy::INCLUDE); diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index ec22328e84a..7ccb1cc8f2c 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -1131,6 +1131,15 @@ TEST_F(ReductionEmptyTest, empty_column) EXPECT_EQ(result->is_valid(), false); result = cudf::reduce(col_nulls, *quantile_agg, double_type); EXPECT_EQ(result->is_valid(), false); + + auto count_agg = + cudf::make_count_aggregation(cudf::null_policy::INCLUDE); + result = cudf::reduce(col0, *count_agg, size_data_type); + EXPECT_EQ(result->is_valid(), true); + EXPECT_EQ(dynamic_cast*>(result.get())->value(), 0); + result = cudf::reduce(col_nulls, *count_agg, size_data_type); + EXPECT_EQ(result->is_valid(), true); + EXPECT_EQ(dynamic_cast*>(result.get())->value(), col_size); } TEST_F(ReductionEmptyTest, Errors) @@ -1590,6 +1599,38 @@ TYPED_TEST(ReductionTest, UniqueCount) expected_null_value1); } +TYPED_TEST(ReductionTest, Count) +{ + using T = TypeParam; + std::vector int_values({1, -3, 1, 2, 0, 2, -4, 45}); + std::vector v = convert_values(int_values); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + + // test without nulls + auto col = cudf::test::fixed_width_column_wrapper(v.begin(), v.end()); + auto expected_value = static_cast(v.size()); + auto count_agg = cudf::make_count_aggregation(cudf::null_policy::INCLUDE); + auto count_agg_exclude = + cudf::make_count_aggregation(cudf::null_policy::EXCLUDE); + EXPECT_EQ(this->template reduction_test(col, *count_agg, output_type).first, + expected_value); + EXPECT_EQ( + this->template reduction_test(col, *count_agg_exclude, output_type).first, + expected_value); + + // test with nulls + auto validity = cudf::test::iterators::null_at(3); + col = cudf::test::fixed_width_column_wrapper(v.begin(), v.end(), validity); + EXPECT_EQ(this->template reduction_test(col, *count_agg, output_type).first, + expected_value); + + expected_value = static_cast(v.size() - 1); + EXPECT_EQ( + this->template reduction_test(col, *count_agg_exclude, output_type).first, + expected_value); +} + template struct FixedPointTestAllReps : public cudf::test::BaseFixture {}; From 5f83c847aa6f09f9cffc73cb1b555f73e0dcd69b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 21 Aug 2025 07:58:58 -0400 Subject: [PATCH 187/366] Change nvtext::character_tokenize to return a list column (#19685) Changes the return object for `nvtext::character_tokenize()` to a list column instead of a flat strings column. This is so characters can be identified with their corresponding original input row. 
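The behavior change, seen from Python via `pylibcudf` (a sketch; the output values follow the updated tests):

```python
import pyarrow as pa
import pylibcudf as plc

col = plc.Column.from_arrow(pa.array(["ab", "", "cdé"]))
result = plc.nvtext.tokenize.character_tokenize(col)
# Before: a flat strings column  -> ["a", "b", "c", "d", "é"]
# After:  a lists column, one row per input row ->
#         [["a", "b"], [], ["c", "d", "é"]]
```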
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/19685 --- .../cudf/strings/detail/attributes.hpp | 35 +++++++++++ cpp/src/strings/attributes.cu | 1 + cpp/src/text/tokenize.cu | 63 ++++++------------- cpp/tests/text/tokenize_tests.cpp | 11 ++-- python/cudf/cudf/core/accessors/string.py | 7 ++- .../pylibcudf/tests/test_nvtext_tokenize.py | 2 +- 6 files changed, 66 insertions(+), 53 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/attributes.hpp diff --git a/cpp/include/cudf/strings/detail/attributes.hpp b/cpp/include/cudf/strings/detail/attributes.hpp new file mode 100644 index 00000000000..6d8379df167 --- /dev/null +++ b/cpp/include/cudf/strings/detail/attributes.hpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @copydoc cudf::strings::count_characters + */ +std::unique_ptr count_characters(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 46360ee8663..c06eeb09ed0 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 56c00723b77..a37f1c327e5 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -19,11 +19,9 @@ #include #include #include -#include #include #include -#include -#include +#include #include #include #include @@ -38,10 +36,7 @@ #include #include -#include -#include #include -#include #include #include #include @@ -104,31 +99,6 @@ std::unique_ptr tokenize_fn(cudf::size_type strings_count, return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr); } -constexpr int64_t block_size = 512; // number of threads per block -constexpr int64_t bytes_per_thread = 4; // bytes processed per thread - -CUDF_KERNEL void count_characters(uint8_t const* d_chars, int64_t chars_bytes, int64_t* d_output) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const byte_idx = static_cast(idx) * bytes_per_thread; - auto const lane_idx = static_cast(threadIdx.x); - - using block_reduce = cub::BlockReduce; - __shared__ typename block_reduce::TempStorage temp_storage; - - int64_t count = 0; - // each thread processes multiple bytes - for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) { - count += cudf::strings::detail::is_begin_utf8_char(d_chars[i]); - } - auto const total = block_reduce(temp_storage).Reduce(count, 
cuda::std::plus()); - - if ((lane_idx == 0) && (total > 0)) { - cuda::atomic_ref ref{*d_output}; - ref.fetch_add(total, cuda::std::memory_order_relaxed); - } -} - } // namespace // detail APIs @@ -203,7 +173,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const { auto strings_count = strings_column.size(); if (strings_count == 0) { - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + return cudf::make_empty_lists_column(cudf::data_type{cudf::type_id::STRING}); } CUDF_EXPECTS( @@ -216,21 +186,20 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const offsets, strings_column.offset() + strings_count, stream) - offset; // no bytes -- this could happen in an all-empty column - if (chars_bytes == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } + if (chars_bytes == 0) { + return cudf::make_empty_lists_column(cudf::data_type{cudf::type_id::STRING}); + } auto d_chars = strings_column.parent().data(); // unsigned is necessary for checking bits d_chars += offset; - // To minimize memory, count the number of characters so we can - // build the output offsets without an intermediate buffer. - // In the worst case each byte is a character so the output is 4x the input. - cudf::detail::device_scalar d_count(0, stream); - auto const num_blocks = cudf::util::div_rounding_up_safe( - cudf::util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), - block_size); - count_characters<<>>( - d_chars, chars_bytes, d_count.data()); - auto const num_characters = d_count.value(stream); + auto const character_counts = cudf::strings::detail::count_characters( + strings_column, stream, cudf::get_current_device_resource_ref()); + auto [list_offsets, num_characters] = + cudf::detail::make_offsets_child_column(character_counts->view().begin(), + character_counts->view().end(), + stream, + mr); // number of characters becomes the number of rows so need to check the row limit CUDF_EXPECTS( @@ -258,9 +227,13 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const rmm::device_uvector output_chars(chars_bytes, stream, mr); thrust::copy(rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, output_chars.data()); - // return new strings column - return cudf::make_strings_column( + auto output_strings = cudf::make_strings_column( num_characters, std::move(offsets_column), output_chars.release(), 0, rmm::device_buffer{}); + return cudf::make_lists_column(strings_count, + std::move(list_offsets), + std::move(output_strings), + strings_column.null_count(), + cudf::detail::copy_bitmask(strings_column.parent(), stream, mr)); } } // namespace detail diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index f9ca343eaac..b7526230d91 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -111,11 +111,12 @@ TEST_F(TextTokenizeTest, TokenizeErrorTest) TEST_F(TextTokenizeTest, CharacterTokenize) { - cudf::test::strings_column_wrapper input({"the mousé ate the cheese", ""}); + cudf::test::strings_column_wrapper input({"the mousé ate", "the cheese", ""}); - cudf::test::strings_column_wrapper expected{"t", "h", "e", " ", "m", "o", "u", "s", - "é", " ", "a", "t", "e", " ", "t", "h", - "e", " ", "c", "h", "e", "e", "s", "e"}; + using LCW = cudf::test::lists_column_wrapper; + LCW expected{LCW{"t", "h", "e", " ", "m", "o", "u", "s", "é", " ", "a", "t", "e"}, + LCW{"t", "h", "e", " ", "c", "h", "e", "e", "s", "e"}, + LCW{}}; auto results = nvtext::character_tokenize(cudf::strings_column_view(input)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); diff --git a/python/cudf/cudf/core/accessors/string.py b/python/cudf/cudf/core/accessors/string.py index 843d205c008..0b3f14d6267 100644 --- a/python/cudf/cudf/core/accessors/string.py +++ b/python/cudf/cudf/core/accessors/string.py @@ -535,8 +535,11 @@ def join( def _split_by_character(self) -> ListColumn: col = self._column.fillna("") # sanitize nulls result_col = col.character_tokenize() + if isinstance(result_col.dtype, ListDtype) and (result_col.size > 0): + return result_col # type: ignore offset_col = col.children[0] + child_col = result_col.children[1] return ListColumn( data=None, @@ -545,7 +548,7 @@ def _split_by_character(self) -> ListColumn: mask=col.mask, offset=0, null_count=0, - children=(offset_col, result_col), # type: ignore[arg-type] + children=(offset_col, child_col), # type: ignore[arg-type] ) def extract( @@ -4621,7 +4624,7 @@ def character_tokenize(self) -> Series | Index: 2 . dtype: object """ - result_col = self._column.character_tokenize() + result_col = self._column.character_tokenize().children[1] if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) diff --git a/python/pylibcudf/tests/test_nvtext_tokenize.py b/python/pylibcudf/tests/test_nvtext_tokenize.py index 2ecef38984c..9f22dc514de 100644 --- a/python/pylibcudf/tests/test_nvtext_tokenize.py +++ b/python/pylibcudf/tests/test_nvtext_tokenize.py @@ -62,7 +62,7 @@ def test_character_tokenize(input_col): got = plc.nvtext.tokenize.character_tokenize( plc.Column.from_arrow(input_col) ) - expect = pa.array(["a", "b", " ", "c", "d", ".", "e", ":", "f", ";"]) + expect = pa.array([["a"], ["b", " ", "c"], ["d", ".", "e", ":", "f", ";"]]) assert_column_eq(expect, got) From a133eb5723a0ab7cf16c642ba32928a3bc31be3c Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 21 Aug 2025 10:05:20 -0500 Subject: [PATCH 188/366] Add ``ColumnSourceInfo`` convenience layer (#19752) Prerequisite for https://github.com/rapidsai/cudf/pull/19736 Adds a new ``ColumnSourceInfo`` class for convenience. This new class is a thin wrapper around ``DataSourceInfo``. While working on [this PR](https://github.com/rapidsai/cudf/pull/19736), I found the `ColumnStats -> DataSourceInfo` relationship to be a bit confusing/clumsy. The relationship was a bit cleaner with `ColumnStats -> ColumnSourceInfo`, because the ColumnSourceInfo class is able to better "hide" the fact that the underlying source is actually a table (rather than a column). 
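A sketch of the intended relationship (class names are from this PR; the usage itself is illustrative):

```python
from cudf_polars.experimental.base import (
    ColumnSourceInfo,
    ColumnStats,
    DataSourceInfo,
)

# One DataSourceInfo per table-like source, plus one thin ColumnSourceInfo
# per column, so a ColumnStats no longer has to remember which column of
# which table it came from.
table_info = DataSourceInfo()
stats = ColumnStats("x", source_info=ColumnSourceInfo(table_info, "x"))
stats.source_info.add_unique_stats_column()  # mark "x" for unique-value sampling
```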
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19752 --- .../cudf_polars/experimental/base.py | 89 +++++++++++-- .../cudf_polars/experimental/io.py | 15 +-- .../cudf_polars/experimental/statistics.py | 3 +- python/cudf_polars/docs/overview.md | 5 +- .../tests/experimental/test_stats.py | 125 ++++++++++-------- 5 files changed, 157 insertions(+), 80 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py index 01fc4d1e1fc..e4c0a43aa64 100644 --- a/python/cudf_polars/cudf_polars/experimental/base.py +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -90,12 +90,12 @@ class UniqueStats: class DataSourceInfo: """ - Datasource information. + Table data source information. Notes ----- This class should be sub-classed for specific - datasource types (e.g. Parquet, DataFrame, etc.). + data source types (e.g. Parquet, DataFrame, etc.). The required properties/methods enable lazy sampling of the underlying datasource. """ @@ -117,6 +117,70 @@ def add_unique_stats_column(self, column: str) -> None: """Add a column needing unique-value information.""" +class ColumnSourceInfo: + """ + Source column information. + + Parameters + ---------- + table_source_info + Table data source information. + column_name + Column name in the data source. + + Notes + ----- + This is a thin wrapper around DataSourceInfo that provides + direct access to column-specific information. + """ + + __slots__ = ("_allow_unique_sampling", "column_name", "table_source_info") + table_source_info: DataSourceInfo + column_name: str + _allow_unique_sampling: bool + + def __init__(self, table_source_info: DataSourceInfo, column_name: str) -> None: + self.table_source_info = table_source_info + self.column_name = column_name + self._allow_unique_sampling = False + + @property + def row_count(self) -> ColumnStat[int]: + """Data source row-count estimate.""" + return self.table_source_info.row_count + + def unique_stats(self, *, force: bool = False) -> UniqueStats: + """ + Return unique-value statistics for a column. + + Parameters + ---------- + force + If True, return unique-value statistics even if the column + wasn't marked as needing unique-value information. + """ + return ( + self.table_source_info.unique_stats(self.column_name) + # Avoid sampling unique-stats if this column + # wasn't marked as needing unique-stats. + if force or self._allow_unique_sampling + else UniqueStats() + ) + + @property + def storage_size(self) -> ColumnStat[int]: + """Return the average column size for a single file.""" + return self.table_source_info.storage_size(self.column_name) + + def add_unique_stats_column(self, column: str | None = None) -> None: + """Add a column needing unique-value information.""" + if column in (None, self.column_name): + self._allow_unique_sampling = True + return self.table_source_info.add_unique_stats_column( + column or self.column_name + ) + + class ColumnStats: """ Column statistics. @@ -128,19 +192,16 @@ class ColumnStats: children Child ColumnStats objects. source_info - Datasource information. - source_name - Source-column name. + Column source information. unique_stats Unique-value statistics. """ - __slots__ = ("children", "name", "source_info", "source_name", "unique_stats") + __slots__ = ("children", "name", "source_info", "unique_stats") name: str children: tuple[ColumnStats, ...] 
- source_info: DataSourceInfo - source_name: str + source_info: ColumnSourceInfo unique_stats: UniqueStats def __init__( @@ -148,14 +209,12 @@ def __init__( name: str, *, children: tuple[ColumnStats, ...] = (), - source_info: DataSourceInfo | None = None, - source_name: str | None = None, + source_info: ColumnSourceInfo | None = None, unique_stats: UniqueStats | None = None, ) -> None: self.name = name self.children = children - self.source_info = source_info or DataSourceInfo() - self.source_name = source_name or name + self.source_info = source_info or ColumnSourceInfo(DataSourceInfo(), name) self.unique_stats = unique_stats or UniqueStats() def new_parent( @@ -184,7 +243,6 @@ def new_parent( children=(self,), # Want to reference the same DataSourceInfo source_info=self.source_info, - source_name=self.source_name, # Want fresh UniqueStats so we can mutate in place unique_stats=UniqueStats(), ) @@ -195,6 +253,11 @@ class StatsCollector: __slots__ = ("column_stats", "row_count") + row_count: dict[IR, ColumnStat[int]] + """Estimated row count for each IR node.""" + column_stats: dict[IR, dict[str, ColumnStats]] + """Column statistics for each IR node.""" + def __init__(self) -> None: self.row_count: dict[IR, ColumnStat[int]] = {} self.column_stats: dict[IR, dict[str, ColumnStats]] = {} diff --git a/python/cudf_polars/cudf_polars/experimental/io.py b/python/cudf_polars/cudf_polars/experimental/io.py index 57f314ed571..e36e73c5f90 100644 --- a/python/cudf_polars/cudf_polars/experimental/io.py +++ b/python/cudf_polars/cudf_polars/experimental/io.py @@ -19,6 +19,7 @@ from cudf_polars.dsl.ir import IR, DataFrameScan, Empty, Scan, Sink, Union from cudf_polars.experimental.base import ( + ColumnSourceInfo, ColumnStat, ColumnStats, DataSourceInfo, @@ -118,8 +119,8 @@ def from_scan(ir: Scan, config_options: ConfigOptions) -> ScanPartitionPlan: blocksize: int = config_options.executor.target_partition_size column_stats = _extract_scan_stats(ir, config_options) column_sizes: list[int] = [] - for name, cs in column_stats.items(): - storage_size = cs.source_info.storage_size(name) + for cs in column_stats.values(): + storage_size = cs.source_info.storage_size if storage_size.value is not None: column_sizes.append(storage_size.value) @@ -821,7 +822,7 @@ def _extract_scan_stats( ) -> dict[str, ColumnStats]: """Extract base ColumnStats for a Scan node.""" if ir.typ == "parquet": - source_info = _sample_pq_stats( + table_source_info = _sample_pq_stats( tuple(ir.paths), config_options.parquet_options.max_footer_samples, config_options.parquet_options.max_row_group_samples, @@ -829,8 +830,7 @@ def _extract_scan_stats( return { name: ColumnStats( name=name, - source_info=source_info, - source_name=name, + source_info=ColumnSourceInfo(table_source_info, name), ) for name in ir.schema } @@ -879,12 +879,11 @@ def unique_stats(self, column: str) -> UniqueStats: def _extract_dataframescan_stats(ir: DataFrameScan) -> dict[str, ColumnStats]: """Extract base ColumnStats for a DataFrameScan node.""" - source_info = DataFrameSourceInfo(ir.df) + table_source_info = DataFrameSourceInfo(ir.df) return { name: ColumnStats( name=name, - source_info=source_info, - source_name=name, + source_info=ColumnSourceInfo(table_source_info, name), ) for name in ir.schema } diff --git a/python/cudf_polars/cudf_polars/experimental/statistics.py b/python/cudf_polars/cudf_polars/experimental/statistics.py index 7c302fe59d7..18588ac7e29 100644 --- a/python/cudf_polars/cudf_polars/experimental/statistics.py +++ 
b/python/cudf_polars/cudf_polars/experimental/statistics.py @@ -66,9 +66,8 @@ def _update_unique_stats_columns( if ( name not in unique_fraction and (column_stats := child_column_stats.get(name)) is not None - and (source_stats := column_stats.source_info) is not None ): - source_stats.add_unique_stats_column(column_stats.source_name or name) + column_stats.source_info.add_unique_stats_column() @initialize_column_stats.register(IR) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 0eaf182762b..34e66e62436 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -417,8 +417,11 @@ datasource (e.g. a Parquet dataset or in-memory `DataFrame`). **aggregated** column sampling via sub-classing. For example, The `ParquetSourceInfo` sub-class uses caching to avoid redundant file-system access. +- `ColumnSourceInfo`: This class wraps a `DataSourceInfo` object. +Since `DataSourceInfo` tracks information for an entire table, we use +`ColumnSourceInfo` to provide a single-column view of the object. - `ColumnStats`: This class is used to group together the "base" -`DataSourceInfo` reference and the current `UniqueStats` estimates +`ColumnSourceInfo` reference and the local `UniqueStats` estimates for a specific IR + column combination. We bundle these references together to simplify the design and maintenance of `StatsCollector`. **NOTE:** The current `UniqueStats` estimates are not yet populated. diff --git a/python/cudf_polars/tests/experimental/test_stats.py b/python/cudf_polars/tests/experimental/test_stats.py index 147cfc98555..f139d344eaa 100644 --- a/python/cudf_polars/tests/experimental/test_stats.py +++ b/python/cudf_polars/tests/experimental/test_stats.py @@ -44,31 +44,46 @@ def test_base_stats_dataframescan(df): column_stats = stats.column_stats[ir] # Source info is the same for all columns - source_info = column_stats["x"].source_info - assert source_info is column_stats["y"].source_info - assert source_info is column_stats["z"].source_info - assert source_info.row_count.value == row_count - assert source_info.row_count.exact + source_info_x = column_stats["x"].source_info + source_info_y = column_stats["y"].source_info + source_info_z = column_stats["z"].source_info + table_source_info = source_info_x.table_source_info + assert table_source_info is source_info_y.table_source_info + assert table_source_info is source_info_z.table_source_info + assert source_info_x.row_count.value == row_count + assert source_info_x.row_count.exact # Storage stats should not be available - assert source_info.storage_size("x").value is None + assert source_info_x.storage_size.value is None - # Check unique stats + # Check unique stats. + # We need to use force=True to sample unique-value statistics, + # because nothing in the query requires unique-value statistics. 
assert math.isclose( - source_info.unique_stats("x").count.value, row_count, rel_tol=5e-2 + source_info_x.unique_stats(force=True).count.value, row_count, rel_tol=5e-2 ) - assert math.isclose(source_info.unique_stats("x").fraction.value, 1.0, abs_tol=1e-2) - assert not source_info.unique_stats("x").count.exact - assert math.isclose(source_info.unique_stats("y").count.value, 3, rel_tol=5e-2) assert math.isclose( - source_info.unique_stats("y").fraction.value, 3 / row_count, abs_tol=1e-2 + source_info_x.unique_stats(force=True).fraction.value, 1.0, abs_tol=1e-2 ) - assert not source_info.unique_stats("y").count.exact - assert math.isclose(source_info.unique_stats("z").count.value, 5, rel_tol=5e-2) + assert not source_info_x.unique_stats(force=True).count.exact assert math.isclose( - source_info.unique_stats("z").fraction.value, 5 / row_count, abs_tol=1e-2 + source_info_y.unique_stats(force=True).count.value, 3, rel_tol=5e-2 ) - assert not source_info.unique_stats("z").count.exact + assert math.isclose( + source_info_y.unique_stats(force=True).fraction.value, + 3 / row_count, + abs_tol=1e-2, + ) + assert not source_info_y.unique_stats(force=True).count.exact + assert math.isclose( + source_info_z.unique_stats(force=True).count.value, 5, rel_tol=5e-2 + ) + assert math.isclose( + source_info_z.unique_stats(force=True).fraction.value, + 5 / row_count, + abs_tol=1e-2, + ) + assert not source_info_z.unique_stats(force=True).count.exact @pytest.mark.parametrize("n_files", [1, 3]) @@ -109,60 +124,58 @@ def test_base_stats_parquet( column_stats = stats.column_stats[ir] # Source info is the same for all columns - source_info = column_stats["x"].source_info - assert source_info is column_stats["y"].source_info - assert source_info is column_stats["z"].source_info + source_info_x = column_stats["x"].source_info + source_info_y = column_stats["y"].source_info + source_info_z = column_stats["z"].source_info + table_source_info = source_info_x.table_source_info + assert table_source_info is source_info_y.table_source_info + assert table_source_info is source_info_z.table_source_info if max_footer_samples: - assert source_info.row_count.value == df.height - assert source_info.row_count.exact + assert source_info_x.row_count.value == df.height + assert source_info_x.row_count.exact else: - assert source_info.row_count.value is None + assert source_info_x.row_count.value is None # Storage stats should be available if max_footer_samples: - assert source_info.storage_size("x").value > 0 - assert source_info.storage_size("y").value > 0 + assert source_info_x.storage_size.value > 0 + assert source_info_y.storage_size.value > 0 else: - assert source_info.storage_size("x").value is None - assert source_info.storage_size("y").value is None - - # Check that we can query a missing column name - assert source_info.storage_size("foo").value is None - assert source_info.unique_stats("foo").count.value is None - assert source_info.unique_stats("foo").fraction.value is None + assert source_info_x.storage_size.value is None + assert source_info_y.storage_size.value is None # source._unique_stats should be empty - assert set(source_info._unique_stats) == set() + assert set(table_source_info._unique_stats) == set() if max_footer_samples and max_row_group_samples: - assert source_info.unique_stats("x").count.value == df.height - assert source_info.unique_stats("x").fraction.value == 1.0 + assert source_info_x.unique_stats(force=True).count.value == df.height + assert source_info_x.unique_stats(force=True).fraction.value == 1.0 
else: - assert source_info.unique_stats("x").count.value is None - assert source_info.unique_stats("x").fraction.value is None + assert source_info_x.unique_stats(force=True).count.value is None + assert source_info_x.unique_stats(force=True).fraction.value is None # source_info._unique_stats should only contain 'x' if max_footer_samples and max_row_group_samples: - assert set(source_info._unique_stats) == {"x"} + assert set(table_source_info._unique_stats) == {"x"} else: - assert set(source_info._unique_stats) == set() + assert set(table_source_info._unique_stats) == set() # Check add_unique_stats_column behavior if max_footer_samples and max_row_group_samples: # Can add a "bad"/missing key column - source_info.add_unique_stats_column("foo") - assert set(source_info._unique_stats) == {"x"} + source_info_x.add_unique_stats_column("foo") + assert set(table_source_info._unique_stats) == {"x"} # Mark 'z' as a key column, and query 'y' stats - source_info.add_unique_stats_column("z") + source_info_z.add_unique_stats_column() if n_files == 1 and row_group_size == 10_000: - assert source_info.unique_stats("y").count.value == 3 + assert source_info_y.unique_stats(force=True).count.value == 3 else: - assert source_info.unique_stats("y").count.value is None - assert source_info.unique_stats("y").fraction.value < 1.0 + assert source_info_y.unique_stats(force=True).count.value is None + assert source_info_y.unique_stats(force=True).fraction.value < 1.0 # source_info._unique_stats should contain all columns now - assert set(source_info._unique_stats) == {"x", "y", "z"} + assert set(table_source_info._unique_stats) == {"x", "y", "z"} def test_base_stats_csv(tmp_path, df): @@ -181,10 +194,10 @@ def test_base_stats_csv(tmp_path, df): column_stats = stats.column_stats[ir] # Source info should be empty for CSV - source_info = column_stats["x"].source_info - assert source_info.row_count.value is None - assert source_info.unique_stats("x").count.value is None - assert source_info.unique_stats("x").fraction.value is None + source_info_x = column_stats["x"].source_info + assert source_info_x.row_count.value is None + assert source_info_x.unique_stats().count.value is None + assert source_info_x.unique_stats().fraction.value is None @pytest.mark.parametrize("max_footer_samples", [1, 3]) @@ -217,7 +230,7 @@ def test_base_stats_parquet_groupby( qir1 = Translator(q1._ldf.visit(), engine).translate_ir() stats = collect_base_stats(qir1, ConfigOptions.from_polars_engine(engine)) source_info_y = stats.column_stats[qir1]["y"].source_info - unique_stats_y = source_info_y.unique_stats("y") + unique_stats_y = source_info_y.unique_stats(force=True) y_unique_fraction = unique_stats_y.fraction y_row_count = source_info_y.row_count assert y_unique_fraction.value < 1.0 @@ -250,7 +263,7 @@ def test_base_stats_parquet_groupby( qir2 = Translator(q2._ldf.visit(), engine).translate_ir() stats = collect_base_stats(qir2, ConfigOptions.from_polars_engine(engine)) source_info_y = stats.column_stats[qir2]["y"].source_info - assert source_info_y.unique_stats("y").fraction == y_unique_fraction + assert source_info_y.unique_stats().fraction == y_unique_fraction assert y_row_count == source_info_y.row_count assert_gpu_result_equal(q2.sort(pl.col("y")).slice(0, 2), engine=engine) @@ -333,8 +346,8 @@ def test_base_stats_union(): # We lose source info after a Union, but we # can set accurate row-count and unique-value # estimates for the current IR in #19392 - source_info = column_stats["x"].source_info - assert source_info.row_count.value is 
None + source_info_x = column_stats["x"].source_info + assert source_info_x.row_count.value is None def test_base_stats_distinct(df): @@ -352,6 +365,6 @@ def test_base_stats_distinct(df): stats = collect_base_stats(ir, ConfigOptions.from_polars_engine(engine)) column_stats = stats.column_stats[ir] - source_info = column_stats["y"].source_info - assert source_info.row_count.value == row_count - assert source_info.row_count.exact + source_info_y = column_stats["y"].source_info + assert source_info_y.row_count.value == row_count + assert source_info_y.row_count.exact From ba388bd1071c47f60b310801c2cd72d8bd883d6d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 21 Aug 2025 10:36:43 -0500 Subject: [PATCH 189/366] Fix bug in `eval` function with `nvtx-0.2.11` (#19754) When `nvtx-0.2.11` is used with `cudf-pandas`, we run into a scenario of incorrectly fetching the frame from the stack. This PR fixes it. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19754 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 95f559f4b6f..b4cb727721c 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1132,7 +1132,8 @@ def _find_user_frame(): frame = inspect.currentframe() while frame: modname = frame.f_globals.get("__name__", "") - if modname == "__main__" or not modname.startswith("cudf."): + # TODO: Remove "nvtx." entry once we cross nvtx-0.2.11 as minimum version + if modname == "__main__" or not modname.startswith(("cudf.", "nvtx.")): return frame frame = frame.f_back raise RuntimeError("Could not find the user's frame.") From 4db8a382162fc69107c8a43b2ec8582fae59512a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Aug 2025 09:23:28 -0700 Subject: [PATCH 190/366] Support decimal columns in cudf_polars (#19749) Redo of https://github.com/rapidsai/cudf/pull/19589 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19749 --- .../cudf_polars/containers/dataframe.py | 19 +++-- .../cudf_polars/containers/datatype.py | 2 + .../cudf_polars/dsl/utils/aggregations.py | 47 +++++++---- .../cudf_polars/cudf_polars/testing/plugin.py | 2 + .../tests/expressions/test_literal.py | 6 +- python/cudf_polars/tests/test_groupby.py | 81 +++++++++++-------- python/cudf_polars/tests/test_scan.py | 9 +++ python/cudf_polars/tests/test_select.py | 25 ++++++ python/pylibcudf/pylibcudf/scalar.pyx | 8 +- python/pylibcudf/tests/test_interop.py | 2 +- 10 files changed, 142 insertions(+), 59 deletions(-) diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index 43ec63738b2..3a095be3cfe 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -29,17 +29,26 @@ def _create_polars_column_metadata( name: str, dtype: PolarsDataType ) -> plc.interop.ColumnMetadata: - """Create ColumnMetadata preserving pl.Struct field names.""" + """Create ColumnMetadata preserving dtype attributes not supported by libcudf.""" + children_meta = [] + timezone = "" + precision: 
int | None = None + if isinstance(dtype, pl.Struct): children_meta = [ _create_polars_column_metadata(field.name, field.dtype) for field in dtype.fields ] - else: - children_meta = [] - timezone = dtype.time_zone if isinstance(dtype, pl.Datetime) else None + elif isinstance(dtype, pl.Datetime): + timezone = dtype.time_zone or timezone + elif isinstance(dtype, pl.Decimal): + precision = dtype.precision + return plc.interop.ColumnMetadata( - name=name, timezone=timezone or "", children_meta=children_meta + name=name, + timezone=timezone, + precision=precision, + children_meta=children_meta, ) diff --git a/python/cudf_polars/cudf_polars/containers/datatype.py b/python/cudf_polars/cudf_polars/containers/datatype.py index 5de610425ed..50a5352612a 100644 --- a/python/cudf_polars/cudf_polars/containers/datatype.py +++ b/python/cudf_polars/cudf_polars/containers/datatype.py @@ -81,6 +81,8 @@ def _from_polars(dtype: pl.DataType) -> plc.DataType: assert_never(dtype.time_unit) elif isinstance(dtype, pl.String): return plc.DataType(plc.TypeId.STRING) + elif isinstance(dtype, pl.Decimal): + return plc.DataType(plc.TypeId.DECIMAL128, scale=-dtype.scale) elif isinstance(dtype, pl.Null): # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py index 8e242c55a47..5f51d54cd44 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py @@ -158,12 +158,30 @@ def decompose_single_agg( is_median = agg.name == "median" is_quantile = agg.name == "quantile" + # quantile agg on decimal: unsupported -> keep dtype Decimal + # mean/median on decimal: Polars returns float -> pre-cast + decimal_unsupported = False + if plc.traits.is_fixed_point(child_dtype): + if is_quantile: + decimal_unsupported = True + elif agg.name in {"mean", "median"}: + tid = agg.dtype.plc.id() + if tid in {plc.TypeId.FLOAT32, plc.TypeId.FLOAT64}: + cast_to = ( + DataType(pl.Float64) + if tid == plc.TypeId.FLOAT64 + else DataType(pl.Float32) + ) + child = expr.Cast(cast_to, child) + child_dtype = child.dtype.plc + is_group_quantile_supported = plc.traits.is_integral( child_dtype ) or plc.traits.is_floating_point(child_dtype) unsupported = ( - (is_median or is_quantile) and not is_group_quantile_supported + decimal_unsupported + or ((is_median or is_quantile) and not is_group_quantile_supported) ) or (not plc.aggregation.is_valid_aggregation(child_dtype, req)) if unsupported: return [], named_expr.reconstruct(expr.Literal(child.dtype, None)) @@ -172,19 +190,12 @@ def decompose_single_agg( # The aggregation is just reconstructed with the new # (potentially masked) child. This is safe because we recursed # to ensure there are no nested aggregations. - return ( - [(named_expr.reconstruct(agg.reconstruct([child])), True)], - named_expr.reconstruct(expr.Col(agg.dtype, name)), - ) - elif agg.name in ("mean", "median", "quantile", "std", "var"): - # libcudf promotes these to float64; but polars - # keeps Float32, so cast back in post-processing. 
- named = expr.NamedExpr(name, agg) - post_col: expr.Expr = expr.Col(DataType(pl.Float64()), name) - if agg.dtype.plc.id() == plc.TypeId.FLOAT32: - post_col = expr.Cast(agg.dtype, post_col) - return [(named, True)], expr.NamedExpr(name, post_col) - elif agg.name == "sum": + + # rebuild the agg with the transformed child + new_children = [child] if not is_quantile else [child, agg.children[1]] + named_expr = named_expr.reconstruct(agg.reconstruct(new_children)) + + if agg.name == "sum": col = ( expr.Cast(agg.dtype, expr.Col(DataType(pl.datatypes.Int64()), name)) if ( @@ -230,6 +241,14 @@ def decompose_single_agg( return [(named_expr, True), (win_len, True)], expr.NamedExpr( name, post_ternary_expr ) + elif agg.name in {"mean", "median", "quantile", "std", "var"}: + post_agg_col: expr.Expr = expr.Col( + DataType(pl.Float64()), name + ) # libcudf promotes to float64 + if agg.dtype.plc.id() == plc.TypeId.FLOAT32: + # Cast back to float32 to match Polars + post_agg_col = expr.Cast(agg.dtype, post_agg_col) + return [(named_expr, True)], named_expr.reconstruct(post_agg_col) else: return [(named_expr, True)], named_expr.reconstruct( expr.Col(agg.dtype, name) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index cd3bbe01020..ccb4658b5ba 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -174,6 +174,8 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "allow_missing_columns argument in read_parquet not translated in IR", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "allow_missing_columns argument in read_parquet not translated in IR", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "allow_missing_columns argument in read_parquet not translated in IR", + "tests/unit/datatypes/test_decimal.py::test_decimal_aggregations": "https://github.com/pola-rs/polars/issues/23899", + "tests/unit/datatypes/test_decimal.py::test_decimal_arithmetic_schema": "https://github.com/pola-rs/polars/issues/23899", } diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index 69ee80da82e..1c2eb05ebfe 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import datetime + import pytest import polars as pl @@ -95,7 +97,9 @@ def test_select_literal_series(): assert_gpu_result_equal(q) -@pytest.mark.parametrize("expr", [pl.lit(None), pl.lit(10, dtype=pl.Decimal())]) +@pytest.mark.parametrize( + "expr", [pl.lit(None), pl.lit(datetime.time(12, 0), dtype=pl.Time())] +) def test_unsupported_literal_raises(expr): df = pl.LazyFrame({}) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index dcbf2d3eec5..33404e9579b 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -14,12 +14,12 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils.versions import POLARS_VERSION_LT_1321 +from cudf_polars.utils.versions import POLARS_VERSION_LT_132, POLARS_VERSION_LT_1321 @pytest.fixture def df(): - return pl.LazyFrame( + lf = pl.LazyFrame( { "key1": [1, 1, 1, 2, 3, 1, 4, 6, 7], "key2": [2, 2, 2, 2, 6, 1, 4, 6, 8], @@ 
-43,6 +43,11 @@ def df(): ], } ) + if not POLARS_VERSION_LT_132: + lf = lf.with_columns( + pl.col("float").cast(pl.Decimal(precision=9, scale=2)).alias("decimal") + ) + return lf @pytest.fixture( @@ -61,39 +66,47 @@ def keys(request): return request.param -@pytest.fixture( - params=[ - [], - ["int"], - ["float", "int"], - [pl.col("float") + pl.col("int")], - [pl.col("float").is_not_null()], - [pl.col("int32").sum()], - [pl.col("int32").mean()], - [ - pl.col("uint16_with_null").sum(), - pl.col("uint16_with_null").mean().alias("mean"), - ], - [pl.col("float").max() - pl.col("int").min() + pl.col("int").max()], - [pl.col("float").mean(), pl.col("int").std()], - [(pl.col("float") - pl.lit(2)).max()], - [pl.lit(10).alias("literal_value")], - [pl.col("float").sum().round(decimals=1)], - [pl.col("float").round(decimals=1).sum()], - [pl.col("float").sum().round()], - [pl.col("float").round().sum()], - [pl.col("int").first(), pl.col("float").last()], - [pl.col("int").sum(), pl.col("string").str.replace("h", "foo", literal=True)], - [pl.col("float").quantile(0.3, interpolation="nearest")], - [pl.col("float").quantile(0.3, interpolation="higher")], - [pl.col("float").quantile(0.3, interpolation="lower")], - [pl.col("float").quantile(0.3, interpolation="midpoint")], - [pl.col("float").quantile(0.3, interpolation="linear")], - [ - pl.col("datetime").max(), - pl.col("datetime").max().dt.is_leap_year().alias("leapyear"), - ], +_EXPRS: list[list[pl.Expr | str]] = [ + [], + ["int"], + ["float", "int"], + [pl.col("float") + pl.col("int")], + [pl.col("float").is_not_null()], + [pl.col("int32").sum()], + [pl.col("int32").mean()], + [ + pl.col("uint16_with_null").sum(), + pl.col("uint16_with_null").mean().alias("mean"), + ], + [pl.col("float").max() - pl.col("int").min() + pl.col("int").max()], + [pl.col("float").mean(), pl.col("int").std()], + [(pl.col("float") - pl.lit(2)).max()], + [pl.lit(10).alias("literal_value")], + [pl.col("float").sum().round(decimals=1)], + [pl.col("float").round(decimals=1).sum()], + [pl.col("float").sum().round()], + [pl.col("float").round().sum()], + [pl.col("int").first(), pl.col("float").last()], + [pl.col("int").sum(), pl.col("string").str.replace("h", "foo", literal=True)], + [pl.col("float").quantile(0.3, interpolation="nearest")], + [pl.col("float").quantile(0.3, interpolation="higher")], + [pl.col("float").quantile(0.3, interpolation="lower")], + [pl.col("float").quantile(0.3, interpolation="midpoint")], + [pl.col("float").quantile(0.3, interpolation="linear")], + [ + pl.col("datetime").max(), + pl.col("datetime").max().dt.is_leap_year().alias("leapyear"), ], +] + +# polars gives us precision=None, which we +# do not support +if not POLARS_VERSION_LT_132: + _EXPRS.append([pl.col("decimal").median()]) + + +@pytest.fixture( + params=_EXPRS, ids=lambda aggs: "-".join(map(str, aggs)), ) def exprs(request): diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 8481105baad..7510fe833be 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import decimal from typing import TYPE_CHECKING import pytest @@ -45,6 +46,14 @@ def df(): "a": [1, 2, 3, None, 4, 5], "b": ["ẅ", "x", "y", "z", "123", "abcd"], "c": [None, None, 4, 5, -1, 0], + "d": [ + decimal.Decimal("1.23"), + None, + decimal.Decimal("0.00"), + None, + decimal.Decimal("-5.67"), + None, + ], } ) diff --git a/python/cudf_polars/tests/test_select.py
b/python/cudf_polars/tests/test_select.py index da3f519783b..10fcf9f660d 100644 --- a/python/cudf_polars/tests/test_select.py +++ b/python/cudf_polars/tests/test_select.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import decimal + import pytest import polars as pl @@ -27,6 +29,29 @@ def test_select(): assert_gpu_result_equal(query) +def test_select_decimal(): + ldf = pl.LazyFrame( + {"a": pl.Series(values=[decimal.Decimal("1.0"), None], dtype=pl.Decimal(3, 1))} + ) + query = ldf.select(pl.col("a")) + assert_gpu_result_equal(query) + + +def test_select_decimal_precision_none_result_max_precision(): + ldf = pl.LazyFrame( + { + "a": pl.Series( + values=[decimal.Decimal("1.0"), None], dtype=pl.Decimal(None, 1) + ) + } + ) + query = ldf.select(pl.col("a")) + cpu_result = query.collect() + gpu_result = query.collect(engine="gpu") + assert cpu_result.schema["a"].precision is None + assert gpu_result.schema["a"].precision == 38 + + def test_select_reduce(): ldf = pl.DataFrame( { diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 57ea17d2921..7c183b96bbb 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -297,7 +297,7 @@ cdef class Scalar: return decimal.Decimal( (slr).value().value() ).scaleb( - -(slr).type().scale() + (slr).type().scale() ) else: raise NotImplementedError( @@ -686,12 +686,12 @@ def _(py_val: datetime.date, dtype: DataType | None, stream: Stream | None): @_from_py.register(decimal.Decimal) def _(py_val: decimal.Decimal, dtype: DataType | None, stream: Stream | None): - scale = -py_val.as_tuple().exponent - as_int = int(py_val.scaleb(scale)) + scale = py_val.as_tuple().exponent + as_int = int(py_val.scaleb(-scale)) cdef int128_t val = as_int - dtype = DataType(type_id.DECIMAL128, -scale) + dtype = DataType(type_id.DECIMAL128, scale) if dtype.id() != type_id.DECIMAL128: raise TypeError("Expected dtype to be DECIMAL128") diff --git a/python/pylibcudf/tests/test_interop.py b/python/pylibcudf/tests/test_interop.py index 171d70c2496..b1a6e9f2c66 100644 --- a/python/pylibcudf/tests/test_interop.py +++ b/python/pylibcudf/tests/test_interop.py @@ -105,7 +105,7 @@ def test_decimal_other(data_type): [plc.TypeId.DECIMAL128, plc.TypeId.DECIMAL64, plc.TypeId.DECIMAL32], ) def test_decimal_respect_metadata_precision(plc_type, request): - request.node.add_marker( + request.applymarker( pytest.mark.xfail( parse(pa.__version__) < parse("19.0.0") and plc_type in {plc.TypeId.DECIMAL64, plc.TypeId.DECIMAL32}, From 3ae9ff85e115e4bf3a805de5027c582be24eed3a Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 21 Aug 2025 12:23:36 -0400 Subject: [PATCH 191/366] Skip polars CPU perf test for with_columns (#19763) Flaky test discovered in https://github.com/rapidsai/cudf/pull/19754. Let's skip it.
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19763 --- python/cudf_polars/cudf_polars/testing/plugin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index ccb4658b5ba..84ef9d9234f 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -193,6 +193,7 @@ def pytest_configure(config: pytest.Config) -> None: # Tests performance difference of CPU engine "tests/unit/operations/test_join.py::test_join_where_eager_perf_21145": "Tests performance bug in CPU engine", "tests/unit/operations/namespaces/list/test_list.py::test_list_struct_field_perf": "Tests CPU Engine perf", + "tests/benchmark/test_with_columns.py::test_with_columns_quadratic_19503": "Tests performance bug in CPU engine", # The test may segfault with the legacy streaming engine. We should # remove this skip when all polars tests use the new streaming engine. "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine", From 75d423fa5014a132ef990288ee9903800cf0f1f2 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 21 Aug 2025 16:42:49 -0400 Subject: [PATCH 192/366] Support fill_null with fill strategy in cudf-polars (#19318) - Contributes to https://github.com/rapidsai/cudf/issues/19200. - Depends on https://github.com/pola-rs/polars/pull/23479 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19318 --- .../cudf_polars/dsl/expressions/unary.py | 59 +++++++++++++++++++ .../cudf_polars/dsl/utils/aggregations.py | 9 +++ .../cudf_polars/experimental/select.py | 16 ++++- .../cudf_polars/experimental/utils.py | 13 +++- .../tests/experimental/test_select.py | 30 +++++++++- python/cudf_polars/tests/test_drop_nulls.py | 18 ++++-- python/cudf_polars/tests/test_groupby.py | 16 +++++ 7 files changed, 151 insertions(+), 10 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 440a2efa9cc..e763775e37c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -7,6 +7,8 @@ from typing import TYPE_CHECKING, Any, ClassVar, cast +from typing_extensions import assert_never + import pylibcudf as plc from cudf_polars.containers import Column @@ -110,6 +112,7 @@ class UnaryFunction(Expr): "set_sorted", "unique", "value_counts", + "fill_null_with_strategy", "null_count", "top_k", } @@ -152,6 +155,10 @@ def __init__( raise NotImplementedError( "reverse=True is not supported for cumulative aggregations" ) + if self.name == "fill_null_with_strategy" and self.options[1] not in {0, None}: + raise NotImplementedError( + "Filling null values with limit specified is not yet supported." 
+ ) def do_evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME @@ -326,6 +333,58 @@ def do_evaluate( .obj.to_scalar() ) return Column(plc.replace.replace_nulls(column.obj, arg), dtype=self.dtype) + elif self.name == "fill_null_with_strategy": + column = self.children[0].evaluate(df, context=context) + strategy, limit = self.options + if ( + column.null_count == 0 + or limit == 0 + or ( + column.null_count == column.size and strategy not in {"zero", "one"} + ) + ): + return column + if strategy == "forward": + replacement = plc.replace.ReplacePolicy.PRECEDING + elif strategy == "backward": + replacement = plc.replace.ReplacePolicy.FOLLOWING + elif strategy == "min": + replacement = plc.reduce.reduce( + column.obj, + plc.aggregation.min(), + column.dtype.plc, + ) + elif strategy == "max": + replacement = plc.reduce.reduce( + column.obj, + plc.aggregation.max(), + column.dtype.plc, + ) + elif strategy == "mean": + replacement = plc.reduce.reduce( + column.obj, + plc.aggregation.mean(), + plc.DataType(plc.TypeId.FLOAT64), + ) + elif strategy == "zero": + replacement = plc.scalar.Scalar.from_py(0, dtype=column.dtype.plc) + elif strategy == "one": + replacement = plc.scalar.Scalar.from_py(1, dtype=column.dtype.plc) + else: + assert_never(strategy) # pragma: no cover + + if strategy == "mean": + return Column( + plc.replace.replace_nulls( + plc.unary.cast(column.obj, plc.DataType(plc.TypeId.FLOAT64)), + replacement, + ), + dtype=self.dtype, + ).astype(self.dtype) + return Column( + plc.replace.replace_nulls(column.obj, replacement), + dtype=self.dtype, + ) elif self.name == "as_struct": children = [ child.evaluate(df, context=context).obj for child in self.children diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py index 5f51d54cd44..2cd7cde44ef 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py @@ -131,6 +131,15 @@ def decompose_single_agg( return [(named_expr, True)], named_expr.reconstruct(expr.Col(agg.dtype, name)) if isinstance(agg, (expr.Literal, expr.LiteralColumn)): return [], named_expr + if ( + is_top + and isinstance(agg, expr.UnaryFunction) + and agg.name == "fill_null_with_strategy" + ): + strategy, _ = agg.options + raise NotImplementedError( + f"fill_null_with_strategy({strategy!r}) is not supported in groupby aggregations" + ) if isinstance(agg, expr.Agg): if agg.name == "quantile": # Second child the requested quantile (which is asserted diff --git a/python/cudf_polars/cudf_polars/experimental/select.py b/python/cudf_polars/cudf_polars/experimental/select.py index 1ca199a8af6..d2c29aa0f81 100644 --- a/python/cudf_polars/cudf_polars/experimental/select.py +++ b/python/cudf_polars/cudf_polars/experimental/select.py @@ -15,7 +15,10 @@ from cudf_polars.experimental.base import PartitionInfo from cudf_polars.experimental.dispatch import lower_ir_node from cudf_polars.experimental.expressions import decompose_expr_graph -from cudf_polars.experimental.utils import _lower_ir_fallback +from cudf_polars.experimental.utils import ( + _contains_unsupported_fill_strategy, + _lower_ir_fallback, +) if TYPE_CHECKING: from collections.abc import MutableMapping @@ -107,6 +110,17 @@ def _( ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: child, partition_info = rec(ir.children[0]) pi = partition_info[child] + if pi.count > 1 and _contains_unsupported_fill_strategy( + [e.value for e in ir.exprs] + ): + 
return _lower_ir_fallback( + ir.reconstruct([child]), + rec, + msg=( + "fill_null with strategy other than 'zero' or 'one' is not supported " + "for multiple partitions; falling back to in-memory evaluation." + ), + ) if ( pi.count == 1 and Select._is_len_expr(ir.exprs) diff --git a/python/cudf_polars/cudf_polars/experimental/utils.py b/python/cudf_polars/cudf_polars/experimental/utils.py index bdebc02fac7..e62990c1755 100644 --- a/python/cudf_polars/cudf_polars/experimental/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/utils.py @@ -10,7 +10,7 @@ from itertools import chain from typing import TYPE_CHECKING -from cudf_polars.dsl.expr import Col, Expr, GroupedRollingWindow +from cudf_polars.dsl.expr import Col, Expr, GroupedRollingWindow, UnaryFunction from cudf_polars.dsl.ir import Union from cudf_polars.dsl.traversal import traversal from cudf_polars.experimental.base import PartitionInfo @@ -116,3 +116,14 @@ def _get_unique_fractions( def _contains_over(exprs: Sequence[Expr]) -> bool: """Return True if any expression in 'exprs' contains an over(...) (ie. GroupedRollingWindow).""" return any(isinstance(e, GroupedRollingWindow) for e in traversal(exprs)) + + +def _contains_unsupported_fill_strategy(exprs: Sequence[Expr]) -> bool: + for e in traversal(exprs): + if ( + isinstance(e, UnaryFunction) + and e.name == "fill_null_with_strategy" + and e.options[0] not in ("zero", "one") + ): + return True + return False diff --git a/python/cudf_polars/tests/experimental/test_select.py b/python/cudf_polars/tests/experimental/test_select.py index cd19f337830..4daa23773ee 100644 --- a/python/cudf_polars/tests/experimental/test_select.py +++ b/python/cudf_polars/tests/experimental/test_select.py @@ -9,8 +9,12 @@ import polars as pl -from cudf_polars.testing.asserts import DEFAULT_SCHEDULER, assert_gpu_result_equal -from cudf_polars.utils.versions import POLARS_VERSION_LT_130 +from cudf_polars.testing.asserts import ( + DEFAULT_SCHEDULER, + assert_gpu_result_equal, + assert_ir_translation_raises, +) +from cudf_polars.utils.versions import POLARS_VERSION_LT_130, POLARS_VERSION_LT_132 @pytest.fixture(scope="module") @@ -79,6 +83,28 @@ def test_select_reduce_fallback(df, fallback_mode): assert_gpu_result_equal(query, engine=engine) +def test_select_fill_null_with_strategy(df): + engine = pl.GPUEngine( + raise_on_fail=True, + executor="streaming", + executor_options={ + "fallback_mode": "warn", + "max_rows_per_partition": 3, + "scheduler": DEFAULT_SCHEDULER, + }, + ) + q = df.select(pl.col("a").forward_fill()) + + if POLARS_VERSION_LT_132: + assert_ir_translation_raises(q, NotImplementedError) + else: + with pytest.warns( + UserWarning, + match="fill_null with strategy other than 'zero' or 'one' is not supported for multiple partitions", + ): + assert_gpu_result_equal(q, engine=engine) + + @pytest.mark.parametrize( "aggs", [ diff --git a/python/cudf_polars/tests/test_drop_nulls.py b/python/cudf_polars/tests/test_drop_nulls.py index 46a8007a805..ba60932d063 100644 --- a/python/cudf_polars/tests/test_drop_nulls.py +++ b/python/cudf_polars/tests/test_drop_nulls.py @@ -10,6 +10,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils.versions import POLARS_VERSION_LT_132 @pytest.fixture( @@ -56,15 +57,20 @@ def test_fill_null_with_string(): ) def test_fill_null_with_strategy(null_data, strategy): q = null_data.select(pl.col("a").fill_null(strategy=strategy)) - - # Not yet exposed to python from rust - assert_ir_translation_raises(q, NotImplementedError) + if 
POLARS_VERSION_LT_132: + assert_ir_translation_raises(q, NotImplementedError) + else: + assert_gpu_result_equal(q) @pytest.mark.parametrize("strategy", ["forward", "backward"]) @pytest.mark.parametrize("limit", [0, 1, 2]) def test_fill_null_with_limit(null_data, strategy, limit): q = null_data.select(pl.col("a").fill_null(strategy=strategy, limit=limit)) - - # Not yet exposed to python from rust - assert_ir_translation_raises(q, NotImplementedError) + if limit != 0: + assert_ir_translation_raises(q, NotImplementedError) + else: + if POLARS_VERSION_LT_132: + assert_ir_translation_raises(q, NotImplementedError) + else: + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 33404e9579b..d900737c62b 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -397,3 +397,19 @@ def test_groupby_aggs_keep_unsupported_as_null(df: pl.LazyFrame, agg_expr) -> No def test_groupby_ternary_supported(df: pl.LazyFrame, expr: pl.Expr) -> None: q = df.group_by("key1").agg(expr) assert_gpu_result_equal(q, check_row_order=False) + + +@pytest.mark.parametrize( + "strategy", ["forward", "backward", "min", "max", "mean", "zero", "one"] +) +def test_groupby_fill_null_with_strategy(strategy): + lf = pl.LazyFrame( + { + "key": [1, 1, 2, 2, 2], + "val": [None, 2, None, 4, None], + } + ) + + q = lf.group_by("key").agg(pl.col("val").fill_null(strategy=strategy)) + + assert_ir_translation_raises(q, NotImplementedError) From 1e5fc00bae5a8ffa64ecbf2b9c6fb7a0b738bbe9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 21 Aug 2025 13:46:09 -0700 Subject: [PATCH 193/366] Move more test_dataframe.py tests to new cudf classic testing directory (#19731) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19731 --- .../methods/test_add_prefix_suffix.py | 24 + .../tests/dataframe/methods/test_astype.py | 79 + .../methods/test_nlargest_nsmallest.py | 22 +- .../tests/dataframe/methods/test_squeeze.py | 15 + .../tests/dataframe/methods/test_to_dict.py | 28 + .../tests/dataframe/methods/test_transpose.py | 38 + .../tests/dataframe/methods/test_where.py | 456 +++- .../cudf/tests/dataframe/test_attributes.py | 126 + .../cudf/tests/dataframe/test_constructors.py | 834 +++++- .../cudf/cudf/tests/groupby/test_cov_corr.py | 257 ++ .../cudf/tests/series/methods/test_astype.py | 282 +++ .../cudf/tests/series/test_constructors.py | 12 + python/cudf/cudf/tests/test_dataframe.py | 2254 +---------------- 13 files changed, 2251 insertions(+), 2176 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_add_prefix_suffix.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_astype.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_squeeze.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_to_dict.py create mode 100644 python/cudf/cudf/tests/groupby/test_cov_corr.py diff --git a/python/cudf/cudf/tests/dataframe/methods/test_add_prefix_suffix.py b/python/cudf/cudf/tests/dataframe/methods/test_add_prefix_suffix.py new file mode 100644 index 00000000000..db56d2cc360 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_add_prefix_suffix.py @@ -0,0 +1,24 @@ +# Copyright (c) 2025, 
NVIDIA CORPORATION. + +import cudf +from cudf.testing import assert_eq + + +def test_dataframe_add_prefix(): + cdf = cudf.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) + pdf = cdf.to_pandas() + + got = cdf.add_prefix("item_") + expected = pdf.add_prefix("item_") + + assert_eq(got, expected) + + +def test_dataframe_add_suffix(): + cdf = cudf.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) + pdf = cdf.to_pandas() + + got = cdf.add_suffix("_item") + expected = pdf.add_suffix("_item") + + assert_eq(got, expected) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_astype.py b/python/cudf/cudf/tests/dataframe/methods/test_astype.py new file mode 100644 index 00000000000..9eb93d7b6a8 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_astype.py @@ -0,0 +1,79 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize("copy", [True, False]) +def test_df_series_dataframe_astype_copy(copy): + gdf = cudf.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + pdf = gdf.to_pandas() + + assert_eq( + gdf.astype(dtype="float", copy=copy), + pdf.astype(dtype="float", copy=copy), + ) + assert_eq(gdf, pdf) + + gsr = cudf.Series([1, 2]) + psr = gsr.to_pandas() + + assert_eq( + gsr.astype(dtype="float", copy=copy), + psr.astype(dtype="float", copy=copy), + ) + assert_eq(gsr, psr) + + gsr = cudf.Series([1, 2]) + psr = gsr.to_pandas() + + actual = gsr.astype(dtype="int64", copy=copy) + expected = psr.astype(dtype="int64", copy=copy) + assert_eq(expected, actual) + assert_eq(gsr, psr) + actual[0] = 3 + expected[0] = 3 + assert_eq(gsr, psr) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_df_series_dataframe_astype_dtype_dict(copy): + gdf = cudf.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + pdf = gdf.to_pandas() + + assert_eq( + gdf.astype(dtype={"col1": "float"}, copy=copy), + pdf.astype(dtype={"col1": "float"}, copy=copy), + ) + assert_eq(gdf, pdf) + + gsr = cudf.Series([1, 2]) + psr = gsr.to_pandas() + + assert_eq( + gsr.astype(dtype={None: "float"}, copy=copy), + psr.astype(dtype={None: "float"}, copy=copy), + ) + assert_eq(gsr, psr) + + assert_exceptions_equal( + lfunc=psr.astype, + rfunc=gsr.astype, + lfunc_args_and_kwargs=([], {"dtype": {"a": "float"}, "copy": copy}), + rfunc_args_and_kwargs=([], {"dtype": {"a": "float"}, "copy": copy}), + ) + + gsr = cudf.Series([1, 2]) + psr = gsr.to_pandas() + + actual = gsr.astype({None: "int64"}, copy=copy) + expected = psr.astype({None: "int64"}, copy=copy) + assert_eq(expected, actual) + assert_eq(gsr, psr) + + actual[0] = 3 + expected[0] = 3 + assert_eq(gsr, psr) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_nlargest_nsmallest.py b/python/cudf/cudf/tests/dataframe/methods/test_nlargest_nsmallest.py index 6c148cef4a6..c153d5c92fb 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_nlargest_nsmallest.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_nlargest_nsmallest.py @@ -5,8 +5,9 @@ import pandas as pd import pytest -from cudf import DataFrame +import cudf from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal @pytest.mark.parametrize("n", [10, 5]) @@ -18,7 +19,7 @@ def test_dataframe_nlargest_nsmallest(n, op, columns): aa = rng.random(nelem) bb = rng.random(nelem) - df = DataFrame({"a": aa, "b": bb}) + df = cudf.DataFrame({"a": aa, "b": bb}) pdf = df.to_pandas() assert_eq(getattr(df, op)(n, columns), getattr(pdf, op)(n, columns)) 
@@ -35,7 +36,7 @@ def test_dataframe_nlargest_sliced(sliceobj): df["b"] = rng.random(nelem) expect = df[sliceobj].nlargest(n, "a") - gdf = DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) got = gdf[sliceobj].nlargest(n, "a") assert (got.to_pandas() == expect).all().all() @@ -52,6 +53,19 @@ def test_dataframe_nsmallest_sliced(sliceobj): df["b"] = rng.random(nelem) expect = df[sliceobj].nsmallest(n, "a") - gdf = DataFrame.from_pandas(df) + gdf = cudf.DataFrame.from_pandas(df) got = gdf[sliceobj].nsmallest(n, "a") assert (got.to_pandas() == expect).all().all() + + +@pytest.mark.parametrize("attr", ["nlargest", "nsmallest"]) +def test_dataframe_nlargest_nsmallest_str_error(attr): + gdf = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + pdf = gdf.to_pandas() + + assert_exceptions_equal( + getattr(gdf, attr), + getattr(pdf, attr), + ([], {"n": 1, "columns": ["a", "b"]}), + ([], {"n": 1, "columns": ["a", "b"]}), + ) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_squeeze.py b/python/cudf/cudf/tests/dataframe/methods/test_squeeze.py new file mode 100644 index 00000000000..4c89e167752 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_squeeze.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("axis", [None, 0, "index", 1, "columns"]) +@pytest.mark.parametrize("data", [[[1, 2], [2, 3]], [1, 2], [1]]) +def test_squeeze(axis, data): + df = cudf.DataFrame(data) + result = df.squeeze(axis=axis) + expected = df.to_pandas().squeeze(axis=axis) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_dict.py b/python/cudf/cudf/tests/dataframe/methods/test_to_dict.py new file mode 100644 index 00000000000..bfaf8787f15 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_dict.py @@ -0,0 +1,28 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import collections + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "orient", ["dict", "list", "split", "tight", "records", "index", "series"] +) +@pytest.mark.parametrize( + "into", [dict, collections.OrderedDict, collections.defaultdict(list)] +) +def test_dataframe_to_dict(orient, into): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [9, 5, 3]}, index=[10, 11, 12]) + pdf = df.to_pandas() + + actual = df.to_dict(orient=orient, into=into) + expected = pdf.to_dict(orient=orient, into=into) + if orient == "series": + assert actual.keys() == expected.keys() + for key in actual.keys(): + assert_eq(expected[key], actual[key]) + else: + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_transpose.py b/python/cudf/cudf/tests/dataframe/methods/test_transpose.py index 96ef5deb49a..ece422f7822 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_transpose.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_transpose.py @@ -1,4 +1,5 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
+import decimal import string import numpy as np @@ -97,3 +98,40 @@ def test_dataframe_transpose( assert_eq(expect, got_function.to_pandas(nullable=nullable)) assert_eq(expect, got_property.to_pandas(nullable=nullable)) + + +@pytest.mark.parametrize( + "data", + [ + {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, + {"a": [[{"b": 567}], None] * 10}, + {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, + ], +) +def test_dataframe_transpose_complex_types(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = pdf.T + actual = gdf.T + + assert_eq(expected, actual) + + +def test_dataframe_transpose_category(): + pdf = pd.DataFrame( + { + "a": pd.Series(["a", "b", "c"], dtype="category"), + "b": pd.Series(["a", "b", "c"], dtype="category"), + } + ) + + gdf = cudf.DataFrame.from_pandas(pdf) + + got_function = gdf.transpose() + got_property = gdf.T + + expect = pdf.transpose() + + assert_eq(expect, got_function.to_pandas()) + assert_eq(expect, got_property.to_pandas()) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_where.py b/python/cudf/cudf/tests/dataframe/methods/test_where.py index f7af9945272..7b546d20f45 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_where.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_where.py @@ -1,12 +1,14 @@ # Copyright (c) 2025, NVIDIA CORPORATION. - +import cupy as cp import numpy as np import pandas as pd import pytest +from numba import cuda import cudf from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal @pytest.mark.parametrize("fill_value", [[888, 999]]) @@ -79,3 +81,455 @@ def test_dataframe_where_with_different_options(): got = gdf.where(boolean_mask, [8, 9]) assert_eq(expect, got) + + +def test_frame_series_where(): + gdf = cudf.DataFrame( + {"a": [1.0, 2.0, None, 3.0, None], "b": [None, 10.0, 11.0, None, 23.0]} + ) + pdf = gdf.to_pandas() + expected = gdf.where(gdf.notna(), gdf.mean()) + actual = pdf.where(pdf.notna(), pdf.mean(), axis=1) + assert_eq(expected, actual) + + +def test_frame_series_where_other(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [1, 1, 0]}) + pdf = gdf.to_pandas() + + expected = gdf.where(gdf["b"] == 1, cudf.NA) + actual = pdf.where(pdf["b"] == 1, pd.NA) + assert_eq( + actual.fillna(-1).values, + expected.fillna(-1).values, + check_dtype=False, + ) + + expected = gdf.where(gdf["b"] == 1, 0) + actual = pdf.where(pdf["b"] == 1, 0) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data,condition,other,error", + [ + (pd.Series(range(5)), pd.Series(range(5)) > 0, None, None), + (pd.Series(range(5)), pd.Series(range(5)) > 1, None, None), + (pd.Series(range(5)), pd.Series(range(5)) > 1, 10, None), + ( + pd.Series(range(5)), + pd.Series(range(5)) > 1, + pd.Series(range(5, 10)), + None, + ), + ( + pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]), + ( + pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]) + % 3 + ) + == 0, + -pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]), + None, + ), + ( + pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}), + pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}) == 4, + None, + None, + ), + ( + pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}), + pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}) != 4, + None, + None, + ), + ( + pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), + [True, True, True], + None, + ValueError, + ), + ( + pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), + [True, True, 
True, False], + None, + ValueError, + ), + ( + pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), + [[True, True, True, False], [True, True, True, False]], + None, + ValueError, + ), + ( + pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), + [[True, True], [False, True], [True, False], [False, True]], + None, + None, + ), + ( + pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), + cuda.to_device( + np.array( + [[True, True], [False, True], [True, False], [False, True]] + ) + ), + None, + None, + ), + ( + pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), + cp.array( + [[True, True], [False, True], [True, False], [False, True]] + ), + 17, + None, + ), + ( + pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), + [[True, True], [False, True], [True, False], [False, True]], + 17, + None, + ), + ( + pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), + [ + [True, True, False, True], + [True, True, False, True], + [True, True, False, True], + [True, True, False, True], + ], + None, + ValueError, + ), + ( + pd.Series([1, 2, np.nan]), + pd.Series([1, 2, np.nan]) == 4, + None, + None, + ), + ( + pd.Series([1, 2, np.nan]), + pd.Series([1, 2, np.nan]) != 4, + None, + None, + ), + ( + pd.Series([4, np.nan, 6]), + pd.Series([4, np.nan, 6]) == 4, + None, + None, + ), + ( + pd.Series([4, np.nan, 6]), + pd.Series([4, np.nan, 6]) != 4, + None, + None, + ), + ( + pd.Series([4, np.nan, 6], dtype="category"), + pd.Series([4, np.nan, 6], dtype="category") != 4, + None, + None, + ), + ( + pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category"), + pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category") == "b", + None, + None, + ), + ( + pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category"), + pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category") == "b", + "s", + None, + ), + ( + pd.Series([1, 2, 3, 2, 5]), + pd.Series([1, 2, 3, 2, 5]) == 2, + pd.DataFrame( + { + "a": pd.Series([1, 2, 3, 2, 5]), + "b": pd.Series([1, 2, 3, 2, 5]), + } + ), + NotImplementedError, + ), + ], +) +def test_df_sr_mask_where(data, condition, other, error, inplace): + ps_where = data + gs_where = cudf.from_pandas(data) + + ps_mask = ps_where.copy(deep=True) + gs_mask = gs_where.copy(deep=True) + + if hasattr(condition, "__cuda_array_interface__"): + if type(condition).__module__.split(".")[0] == "cupy": + ps_condition = cp.asnumpy(condition) + else: + ps_condition = np.array(condition).astype("bool") + else: + ps_condition = condition + + if type(condition).__module__.split(".")[0] == "pandas": + gs_condition = cudf.from_pandas(condition) + else: + gs_condition = condition + + ps_other = other + if type(other).__module__.split(".")[0] == "pandas": + gs_other = cudf.from_pandas(other) + else: + gs_other = other + + if error is None: + expect_where = ps_where.where( + ps_condition, other=ps_other, inplace=inplace + ) + got_where = gs_where.where( + gs_condition, other=gs_other, inplace=inplace + ) + + expect_mask = ps_mask.mask( + ps_condition, other=ps_other, inplace=inplace + ) + got_mask = gs_mask.mask(gs_condition, other=gs_other, inplace=inplace) + + if inplace: + expect_where = ps_where + got_where = gs_where + + expect_mask = ps_mask + got_mask = gs_mask + + if isinstance(expect_where, pd.Series) and isinstance( + expect_where.dtype, pd.CategoricalDtype + ): + np.testing.assert_array_equal( + expect_where.cat.codes, + got_where.cat.codes.astype(expect_where.cat.codes.dtype) + .fillna(-1) + .to_numpy(), + ) + assert_eq(expect_where.cat.categories, 
got_where.cat.categories) + + np.testing.assert_array_equal( + expect_mask.cat.codes, + got_mask.cat.codes.astype(expect_mask.cat.codes.dtype) + .fillna(-1) + .to_numpy(), + ) + assert_eq(expect_mask.cat.categories, got_mask.cat.categories) + else: + assert_eq( + expect_where.fillna(-1), + got_where.fillna(-1), + check_dtype=False, + ) + assert_eq( + expect_mask.fillna(-1), got_mask.fillna(-1), check_dtype=False + ) + else: + assert_exceptions_equal( + lfunc=ps_where.where, + rfunc=gs_where.where, + lfunc_args_and_kwargs=( + [ps_condition], + {"other": ps_other, "inplace": inplace}, + ), + rfunc_args_and_kwargs=( + [gs_condition], + {"other": gs_other, "inplace": inplace}, + ), + ) + + assert_exceptions_equal( + lfunc=ps_mask.mask, + rfunc=gs_mask.mask, + lfunc_args_and_kwargs=( + [ps_condition], + {"other": ps_other, "inplace": inplace}, + ), + rfunc_args_and_kwargs=( + [gs_condition], + {"other": gs_other, "inplace": inplace}, + ), + ) + + +@pytest.mark.parametrize( + "data,condition,other,has_cat", + [ + ( + pd.DataFrame( + { + "a": pd.Series(["a", "a", "b", "c", "a", "d", "d", "a"]), + "b": pd.Series(["o", "p", "q", "e", "p", "p", "a", "a"]), + } + ), + pd.DataFrame( + { + "a": pd.Series(["a", "a", "b", "c", "a", "d", "d", "a"]), + "b": pd.Series(["o", "p", "q", "e", "p", "p", "a", "a"]), + } + ) + != "a", + None, + None, + ), + ( + pd.DataFrame( + { + "a": pd.Series( + ["a", "a", "b", "c", "a", "d", "d", "a"], + dtype="category", + ), + "b": pd.Series( + ["o", "p", "q", "e", "p", "p", "a", "a"], + dtype="category", + ), + } + ), + pd.DataFrame( + { + "a": pd.Series( + ["a", "a", "b", "c", "a", "d", "d", "a"], + dtype="category", + ), + "b": pd.Series( + ["o", "p", "q", "e", "p", "p", "a", "a"], + dtype="category", + ), + } + ) + != "a", + None, + True, + ), + ( + pd.DataFrame( + { + "a": pd.Series( + ["a", "a", "b", "c", "a", "d", "d", "a"], + dtype="category", + ), + "b": pd.Series( + ["o", "p", "q", "e", "p", "p", "a", "a"], + dtype="category", + ), + } + ), + pd.DataFrame( + { + "a": pd.Series( + ["a", "a", "b", "c", "a", "d", "d", "a"], + dtype="category", + ), + "b": pd.Series( + ["o", "p", "q", "e", "p", "p", "a", "a"], + dtype="category", + ), + } + ) + == "a", + None, + True, + ), + ( + pd.DataFrame( + { + "a": pd.Series( + ["a", "a", "b", "c", "a", "d", "d", "a"], + dtype="category", + ), + "b": pd.Series( + ["o", "p", "q", "e", "p", "p", "a", "a"], + dtype="category", + ), + } + ), + pd.DataFrame( + { + "a": pd.Series( + ["a", "a", "b", "c", "a", "d", "d", "a"], + dtype="category", + ), + "b": pd.Series( + ["o", "p", "q", "e", "p", "p", "a", "a"], + dtype="category", + ), + } + ) + != "a", + "a", + True, + ), + ( + pd.DataFrame( + { + "a": pd.Series( + ["a", "a", "b", "c", "a", "d", "d", "a"], + dtype="category", + ), + "b": pd.Series( + ["o", "p", "q", "e", "p", "p", "a", "a"], + dtype="category", + ), + } + ), + pd.DataFrame( + { + "a": pd.Series( + ["a", "a", "b", "c", "a", "d", "d", "a"], + dtype="category", + ), + "b": pd.Series( + ["o", "p", "q", "e", "p", "p", "a", "a"], + dtype="category", + ), + } + ) + == "a", + "a", + True, + ), + ], +) +def test_df_string_cat_types_mask_where(data, condition, other, has_cat): + ps = data + gs = cudf.from_pandas(data) + + ps_condition = condition + if type(condition).__module__.split(".")[0] == "pandas": + gs_condition = cudf.from_pandas(condition) + else: + gs_condition = condition + + ps_other = other + if type(other).__module__.split(".")[0] == "pandas": + gs_other = cudf.from_pandas(other) + else: + gs_other = other + + 
expect_where = ps.where(ps_condition, other=ps_other) + got_where = gs.where(gs_condition, other=gs_other) + + expect_mask = ps.mask(ps_condition, other=ps_other) + got_mask = gs.mask(gs_condition, other=gs_other) + + if has_cat is None: + assert_eq( + expect_where.fillna(-1).astype("str"), + got_where.fillna(-1), + check_dtype=False, + ) + assert_eq( + expect_mask.fillna(-1).astype("str"), + got_mask.fillna(-1), + check_dtype=False, + ) + else: + assert_eq(expect_where, got_where, check_dtype=False) + assert_eq(expect_mask, got_mask, check_dtype=False) diff --git a/python/cudf/cudf/tests/dataframe/test_attributes.py b/python/cudf/cudf/tests/dataframe/test_attributes.py index afe2dbc10c4..d2cef2d8bdc 100644 --- a/python/cudf/cudf/tests/dataframe/test_attributes.py +++ b/python/cudf/cudf/tests/dataframe/test_attributes.py @@ -1,5 +1,7 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import decimal +import functools import numpy as np import pandas as pd @@ -7,6 +9,130 @@ import cudf from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize("digits", [0, 1, 4]) +def test_dataframe_round_builtin(digits): + pdf = pd.DataFrame( + { + "a": [1.2234242333234, 323432.3243423, np.nan], + "b": ["a", "b", "c"], + "c": pd.Series([34224, 324324, 324342], dtype="datetime64[ns]"), + "d": pd.Series([224.242, None, 2424.234324], dtype="category"), + "e": [ + decimal.Decimal("342.3243234234242"), + decimal.Decimal("89.32432497687622"), + None, + ], + } + ) + gdf = cudf.from_pandas(pdf, nan_as_null=False) + + expected = round(pdf, digits) + actual = round(gdf, digits) + + assert_eq(expected, actual) + + +def test_bool_raises(): + assert_exceptions_equal( + lfunc=bool, + rfunc=bool, + lfunc_args_and_kwargs=[[cudf.DataFrame()]], + rfunc_args_and_kwargs=[[pd.DataFrame()]], + ) + + +@pytest.mark.parametrize("name", [None, "foo", 1, 1.0]) +def test_dataframe_column_name(name): + df = cudf.DataFrame({"a": [1, 2, 3]}) + pdf = df.to_pandas() + + df.columns.name = name + pdf.columns.name = name + + assert_eq(df, pdf) + assert_eq(df.columns.name, pdf.columns.name) + + +def test_dataframe_columns_set_none_raises(): + df = cudf.DataFrame({"a": [0]}) + with pytest.raises(TypeError): + df.columns = None + + +@pytest.mark.parametrize( + "columns", + [cudf.RangeIndex(1, name="foo"), pd.RangeIndex(1, name="foo"), range(1)], +) +def test_dataframe_columns_set_rangeindex(columns): + df = cudf.DataFrame([1], columns=["a"]) + df.columns = columns + result = df.columns + expected = pd.RangeIndex(1, name=getattr(columns, "name", None)) + pd.testing.assert_index_equal(result, expected, exact=True) + + +@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) +def test_dataframe_columns_set_multiindex(klass): + columns = klass.from_arrays([[10]], names=["foo"]) + df = cudf.DataFrame([1], columns=["a"]) + df.columns = columns + result = df.columns + expected = pd.MultiIndex.from_arrays([[10]], names=["foo"]) + pd.testing.assert_index_equal(result, expected, exact=True) + + +@pytest.mark.parametrize( + "klass", + [ + functools.partial(cudf.Index, name="foo"), + functools.partial(cudf.Series, name="foo"), + functools.partial(pd.Index, name="foo"), + functools.partial(pd.Series, name="foo"), + np.array, + ], +) +def test_dataframe_columns_set_preserve_type(klass): + df = cudf.DataFrame([1], columns=["a"]) + columns = klass([10], dtype="int8") + df.columns = columns + result = df.columns + expected = pd.Index( + [10], dtype="int8", name=getattr(columns, "name", None) 
+ ) + pd.testing.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)]) +@pytest.mark.parametrize( + "operation", + [ + lambda df: df.where(df < 2, 2), + lambda df: df.nans_to_nulls(), + lambda df: df.isna(), + lambda df: df.notna(), + lambda df: abs(df), + lambda df: -df, + lambda df: ~df, + lambda df: df.cumsum(), + lambda df: df.replace(1, 2), + lambda df: df.replace(10, 20), + lambda df: df.clip(0, 10), + lambda df: df.rolling(1).mean(), + lambda df: df.interpolate(), + lambda df: df.shift(), + lambda df: df.sort_values(1), + lambda df: df.round(), + lambda df: df.rank(), + ], +) +def test_op_preserves_column_metadata(column, operation): + df = cudf.DataFrame([1], columns=cudf.Index(column)) + result = operation(df).columns + expected = pd.Index(column) + pd.testing.assert_index_equal(result, expected, exact=True) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/dataframe/test_constructors.py b/python/cudf/cudf/tests/dataframe/test_constructors.py index 385f4c44c01..15926f4faf0 100644 --- a/python/cudf/cudf/tests/dataframe/test_constructors.py +++ b/python/cudf/cudf/tests/dataframe/test_constructors.py @@ -1,5 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import collections from contextlib import nullcontext as does_not_raise import cupy as cp @@ -7,8 +8,10 @@ import pandas as pd import pyarrow as pa import pytest +from numba import cuda import cudf +from cudf.core.column.column import as_column from cudf.testing import assert_eq @@ -263,7 +266,7 @@ def test_from_records_index(columns, index): assert_eq(df, gdf) -def test_dataframe_construction_from_cupy_arrays(): +def test_dataframe_construction_from_cp_arrays(): h_ary = np.array([[1, 2, 3], [4, 5, 6]], np.int32) d_ary = cp.asarray(h_ary) @@ -302,7 +305,7 @@ def test_dataframe_construction_from_cupy_arrays(): assert_eq(df, gdf) -def test_dataframe_cupy_wrong_dimensions(): +def test_dataframe_cp_wrong_dimensions(): d_ary = cp.empty((2, 3, 4), dtype=np.int32) with pytest.raises( ValueError, match="records dimension expected 1 or 2 but found: 3" @@ -310,7 +313,7 @@ def test_dataframe_cupy_wrong_dimensions(): cudf.DataFrame(d_ary) -def test_dataframe_cupy_array_wrong_index(): +def test_dataframe_cp_array_wrong_index(): d_ary = cp.empty((2, 3), dtype=np.int32) with pytest.raises(ValueError): @@ -403,6 +406,831 @@ def test_from_scalar_typing(request, all_supported_types_as_str): assert len(gdf["b"]) == len(gdf["a"]) +@pytest.mark.parametrize( + "data", + [ + {"a": [np.nan, 1, 2], "b": [None, None, None]}, + {"a": [1, 2, np.nan, 2], "b": [np.nan, np.nan, np.nan, np.nan]}, + { + "a": [1, 2, np.nan, 2, None], + "b": [np.nan, np.nan, None, np.nan, np.nan], + }, + {"a": [1, 2, 2, None, 1.1], "b": [1, 2.2, 3, None, 5]}, + ], +) +def test_dataframe_constructor_nan_as_null(data, nan_as_null): + actual = cudf.DataFrame(data, nan_as_null=nan_as_null) + + if nan_as_null: + assert ( + not ( + actual.astype("float").replace( + cudf.Series([np.nan], nan_as_null=False), cudf.Series([-1]) + ) + == -1 + ) + .any() + .any() + ) + else: + actual = actual.select_dtypes(exclude=["object"]) + assert (actual.replace(np.nan, -1) == -1).any().any() + + +@pytest.mark.parametrize( + "data,columns,index", + [ + (pd.Series([1, 2, 3]), None, None), + (pd.Series(["a", "b", None, "c"], name="abc"), None, None), + ( + pd.Series(["a", "b", None, "c"], name="abc"), + ["abc", "b"], + [1, 2, 3], + ), + ], +) +def test_dataframe_init_from_series(data, columns, index): + expected = 
pd.DataFrame(data, columns=columns, index=index) + actual = cudf.DataFrame(data, columns=columns, index=index) + + assert_eq( + expected, + actual, + check_index_type=len(expected) != 0, + ) + + +@pytest.mark.parametrize( + "dtype,expected_upcast_type,error", + [ + ( + "float32", + np.dtype("float32"), + None, + ), + ( + "float16", + None, + TypeError, + ), + ( + "float64", + np.dtype("float64"), + None, + ), + ( + "float128", + None, + ValueError, + ), + ], +) +def test_from_pandas_unsupported_types(dtype, expected_upcast_type, error): + data = pd.Series([1.1, 0.55, -1.23], dtype=dtype) + pdf = pd.DataFrame({"one_col": data}) + if error is not None: + with pytest.raises(error): + cudf.from_pandas(data) + + with pytest.raises(error): + cudf.Series(data) + + with pytest.raises(error): + cudf.from_pandas(pdf) + + with pytest.raises(error): + cudf.DataFrame(pdf) + else: + df = cudf.from_pandas(data) + + assert_eq(data, df, check_dtype=False) + assert df.dtype == expected_upcast_type + + df = cudf.Series(data) + assert_eq(data, df, check_dtype=False) + assert df.dtype == expected_upcast_type + + df = cudf.from_pandas(pdf) + assert_eq(pdf, df, check_dtype=False) + assert df["one_col"].dtype == expected_upcast_type + + df = cudf.DataFrame(pdf) + assert_eq(pdf, df, check_dtype=False) + assert df["one_col"].dtype == expected_upcast_type + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 4}, + {"c": 4, "a": [1, 2, 3], "b": ["x", "y", "z"]}, + {"a": [1, 2, 3], "c": 4}, + ], +) +def test_dataframe_init_from_scalar_and_lists(data): + actual = cudf.DataFrame(data) + expected = pd.DataFrame(data) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "columns", + ( + [], + ["c", "a"], + ["a", "d", "b", "e", "c"], + ["a", "b", "c"], + pd.Index(["b", "a", "c"], name="custom_name"), + ), +) +@pytest.mark.parametrize("index", (None, [4, 5, 6])) +def test_dataframe_dict_like_with_columns(columns, index): + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + expect = pd.DataFrame(data, columns=columns, index=index) + actual = cudf.DataFrame(data, columns=columns, index=index) + if index is None and len(columns) == 0: + # We make an empty range index, pandas makes an empty index + expect = expect.reset_index(drop=True) + assert_eq(expect, actual) + + +def test_dataframe_init_columns_named_multiindex(): + rng = np.random.default_rng(seed=0) + data = rng.standard_normal(size=(2, 2)) + columns = cudf.MultiIndex.from_tuples( + [("A", "one"), ("A", "two")], names=["y", "z"] + ) + gdf = cudf.DataFrame(data, columns=columns) + pdf = pd.DataFrame(data, columns=columns.to_pandas()) + + assert_eq(gdf, pdf) + + +def test_dataframe_init_columns_named_index(): + rng = np.random.default_rng(seed=0) + data = rng.standard_normal(size=(2, 2)) + columns = pd.Index(["a", "b"], name="custom_name") + gdf = cudf.DataFrame(data, columns=columns) + pdf = pd.DataFrame(data, columns=columns) + + assert_eq(gdf, pdf) + + +def test_dataframe_from_pandas_sparse(): + pdf = pd.DataFrame(range(2), dtype=pd.SparseDtype(np.int64, 0)) + with pytest.raises(NotImplementedError): + cudf.DataFrame(pdf) + + +def test_dataframe_constructor_unbounded_sequence(): + class A: + def __getitem__(self, key): + return 1 + + with pytest.raises(TypeError): + cudf.DataFrame([A()]) + + with pytest.raises(TypeError): + cudf.DataFrame({"a": A()}) + + +def test_dataframe_constructor_dataframe_list(): + df = cudf.DataFrame(range(2)) + with pytest.raises(TypeError): + cudf.DataFrame([df]) + + +def 
test_dataframe_constructor_from_namedtuple():
+    Point1 = collections.namedtuple("Point1", ["a", "b", "c"])
+    Point2 = collections.namedtuple("Point2", ["x", "y"])
+
+    data = [Point1(1, 2, 3), Point2(4, 5)]
+    idx = ["a", "b"]
+    gdf = cudf.DataFrame(data, index=idx)
+    pdf = pd.DataFrame(data, index=idx)
+
+    assert_eq(gdf, pdf)
+
+    data = [Point2(4, 5), Point1(1, 2, 3)]
+    with pytest.raises(ValueError):
+        cudf.DataFrame(data, index=idx)
+    with pytest.raises(ValueError):
+        pd.DataFrame(data, index=idx)
+
+
+def test_series_data_no_name_with_columns():
+    gdf = cudf.DataFrame(cudf.Series([1]), columns=[1])
+    pdf = pd.DataFrame(pd.Series([1]), columns=[1])
+    assert_eq(gdf, pdf)
+
+
+def test_series_data_no_name_with_columns_more_than_one_raises():
+    with pytest.raises(ValueError):
+        cudf.DataFrame(cudf.Series([1]), columns=[1, 2])
+    with pytest.raises(ValueError):
+        pd.DataFrame(pd.Series([1]), columns=[1, 2])
+
+
+def test_series_data_with_name_with_columns_matching():
+    gdf = cudf.DataFrame(cudf.Series([1], name=1), columns=[1])
+    pdf = pd.DataFrame(pd.Series([1], name=1), columns=[1])
+    assert_eq(gdf, pdf)
+
+
+def test_series_data_with_name_with_columns_not_matching():
+    gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1])
+    pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1])
+    assert_eq(gdf, pdf)
+
+
+def test_series_data_with_name_with_columns_matching_align():
+    gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1, 2])
+    pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1, 2])
+    assert_eq(gdf, pdf)
+
+
+def test_generated_column():
+    gdf = cudf.DataFrame({"a": (i for i in range(5))})
+    assert len(gdf) == 5
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        (
+            pd.Series([3, 3.0]),
+            pd.Series([2.3, 3.9]),
+            pd.Series([1.5, 3.9]),
+            pd.Series([1.0, 2]),
+        ),
+        [
+            pd.Series([3, 3.0]),
+            pd.Series([2.3, 3.9]),
+            pd.Series([1.5, 3.9]),
+            pd.Series([1.0, 2]),
+        ],
+    ],
+)
+def test_create_dataframe_from_list_like(data):
+    pdf = pd.DataFrame(data, index=["count", "mean", "std", "min"])
+    gdf = cudf.DataFrame(data, index=["count", "mean", "std", "min"])
+
+    assert_eq(pdf, gdf)
+
+    pdf = pd.DataFrame(data)
+    gdf = cudf.DataFrame(data)
+
+    assert_eq(pdf, gdf)
+
+
+def test_create_dataframe_column():
+    pdf = pd.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"])
+    gdf = cudf.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"])
+
+    assert_eq(pdf, gdf)
+
+    pdf = pd.DataFrame(
+        {"a": [1, 2, 3], "b": [2, 3, 5]},
+        columns=["a", "b", "c"],
+        index=["A", "Z", "X"],
+    )
+    gdf = cudf.DataFrame(
+        {"a": [1, 2, 3], "b": [2, 3, 5]},
+        columns=["a", "b", "c"],
+        index=["A", "Z", "X"],
+    )
+
+    assert_eq(pdf, gdf)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        pd.DataFrame(np.eye(2)),
+        cudf.DataFrame(np.eye(2)),
+        np.eye(2),
+        cp.eye(2),
+        None,
+        [[1, 0], [0, 1]],
+        [cudf.Series([0, 1]), cudf.Series([1, 0])],
+    ],
+)
+@pytest.mark.parametrize(
+    "columns",
+    [None, range(2), pd.RangeIndex(2), cudf.RangeIndex(2)],
+)
+def test_dataframe_columns_returns_rangeindex(data, columns):
+    if data is None and columns is None:
+        pytest.skip(f"{data=} and {columns=} not relevant.")
+    result = cudf.DataFrame(data=data, columns=columns).columns
+    expected = pd.RangeIndex(range(2))
+    assert_eq(result, expected)
+
+
+def test_dataframe_columns_returns_rangeindex_single_col():
+    result = cudf.DataFrame([1, 2, 3]).columns
+    expected = pd.RangeIndex(range(1))
+    assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"])
+@pytest.mark.parametrize("idx_data", 
[[], [1, 2]])
+@pytest.mark.parametrize("data", [None, [], {}])
+def test_dataframe_columns_empty_data_preserves_dtype(dtype, idx_data, data):
+    result = cudf.DataFrame(
+        data, columns=cudf.Index(idx_data, dtype=dtype)
+    ).columns
+    expected = pd.Index(idx_data, dtype=dtype)
+    assert_eq(result, expected)
+
+
+def test_dataframe_init_from_nested_dict():
+    ordered_dict = collections.OrderedDict(
+        [
+            (
+                "one",
+                collections.OrderedDict(
+                    [("col_a", "foo1"), ("col_b", "bar1")]
+                ),
+            ),
+            (
+                "two",
+                collections.OrderedDict(
+                    [("col_a", "foo2"), ("col_b", "bar2")]
+                ),
+            ),
+            (
+                "three",
+                collections.OrderedDict(
+                    [("col_a", "foo3"), ("col_b", "bar3")]
+                ),
+            ),
+        ]
+    )
+    pdf = pd.DataFrame(ordered_dict)
+    gdf = cudf.DataFrame(ordered_dict)
+
+    assert_eq(pdf, gdf)
+    regular_dict = {key: dict(value) for key, value in ordered_dict.items()}
+
+    pdf = pd.DataFrame(regular_dict)
+    gdf = cudf.DataFrame(regular_dict)
+    assert_eq(pdf, gdf)
+
+
+def test_init_from_2_categoricalindex_series_diff_categories():
+    s1 = cudf.Series(
+        [39, 6, 4], index=cudf.CategoricalIndex(["female", "male", "unknown"])
+    )
+    s2 = cudf.Series(
+        [2, 152, 2, 242, 150],
+        index=cudf.CategoricalIndex(["f", "female", "m", "male", "unknown"]),
+    )
+    result = cudf.DataFrame([s1, s2])
+    expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()])
+    # TODO: Remove once https://github.com/pandas-dev/pandas/issues/57592
+    # is addressed
+    expected.columns = result.columns
+    assert_eq(result, expected, check_dtype=False)
+
+
+def test_data_frame_values_no_cols_but_index():
+    result = cudf.DataFrame(index=range(5)).values
+    expected = pd.DataFrame(index=range(5)).values
+    assert_eq(result, expected)
+
+
+def test_dataframe_from_ndarray_dup_columns():
+    with pytest.raises(ValueError):
+        cudf.DataFrame(np.eye(2), columns=["A", "A"])
+
+
+def test_dataframe_init_with_nans():
+    with cudf.option_context("mode.pandas_compatible", True):
+        gdf = cudf.DataFrame({"a": [1, 2, 3, np.nan]})
+    assert gdf["a"].dtype == np.dtype("float64")
+    pdf = pd.DataFrame({"a": [1, 2, 3, np.nan]})
+    assert_eq(pdf, gdf)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cudf.DataFrame(range(2)),
+        None,
+        [cudf.Series(range(2))],
+        [[0], [1]],
+        {1: range(2)},
+        cp.arange(2),
+    ],
+)
+def test_init_with_index_no_shallow_copy(data):
+    idx = cudf.RangeIndex(2)
+    df = cudf.DataFrame(data, index=idx)
+    assert df.index is idx
+
+
+def test_from_records_with_index_no_shallow_copy():
+    idx = cudf.RangeIndex(2)
+    data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", "<f8"), ("y", "<i8")])
+    df = cudf.DataFrame(data.view(np.recarray), index=idx)
+    assert df.index is idx
+
+        with expect_warning_if(
+            (pdf.groupby(gkey).count() < min_periods).all().all(),
+            RuntimeWarning,
+        ):
+            expected = pdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof)
+
+    assert_eq(expected, actual)
+
+
+def test_groupby_covariance_multiindex_dataframe():
+    gdf = cudf.DataFrame(
+        {
+            "a": [1, 1, 2, 2],
+            "b": [1, 1, 2, 2],
+            "c": [2, 3, 4, 5],
+            "d": [6, 8, 9, 1],
+        }
+    ).set_index(["a", "b"])
+
+    actual = gdf.groupby(level=["a", "b"]).cov()
+    expected = gdf.to_pandas().groupby(level=["a", "b"]).cov()
+
+    assert_eq(expected, actual)
+
+
+def test_groupby_covariance_empty_columns():
+    gdf = cudf.DataFrame(columns=["id", "val1", "val2"])
+    pdf = gdf.to_pandas()
+
+    actual = gdf.groupby("id").cov()
+    expected = pdf.groupby("id").cov()
+
+    assert_eq(
+        expected,
+        actual,
+        check_dtype=False,
+        check_index_type=False,
+    )
+
+
+def test_groupby_cov_invalid_column_types():
+    gdf = cudf.DataFrame(
+        {
+            "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+            "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"],
+            "val2": ["d", "d", "d", "e", "e", "e", "f", 
"f", "f"], + }, + ) + with pytest.raises( + TypeError, + match="Covariance accepts only numerical column-pairs", + ): + gdf.groupby("id").cov() + + +def test_groupby_cov_positive_semidefinite_matrix(): + # Refer to discussions in PR #9889 re "pair-wise deletion" strategy + # being used in pandas to compute the covariance of a dataframe with + # rows containing missing values. + # Note: cuDF currently matches pandas behavior in that the covariance + # matrices are not guaranteed PSD (positive semi definite). + # https://github.com/rapidsai/cudf/pull/9889#discussion_r794158358 + gdf = cudf.DataFrame( + [[1, 2], [None, 4], [5, None], [7, 8]], columns=["v0", "v1"] + ) + actual = gdf.groupby(by=cudf.Series([1, 1, 1, 1])).cov() + actual.reset_index(drop=True, inplace=True) + + pdf = gdf.to_pandas() + expected = pdf.groupby(by=pd.Series([1, 1, 1, 1])).cov() + expected.reset_index(drop=True, inplace=True) + + assert_eq( + expected, + actual, + check_dtype=False, + ) + + +@pytest.mark.xfail +def test_groupby_cov_for_pandas_bug_case(): + # Handles case: pandas bug using ddof with missing data. + # Filed an issue in Pandas on GH, link below: + # https://github.com/pandas-dev/pandas/issues/45814 + pdf = pd.DataFrame( + {"id": ["a", "a"], "val1": [1.0, 2.0], "val2": [np.nan, np.nan]} + ) + expected = pdf.groupby("id").cov(ddof=2) + + gdf = cudf.from_pandas(pdf) + actual = gdf.groupby("id").cov(ddof=2) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data, gkey", + [ + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id", "val1", "val2"], + ), + ( + { + "id": [0] * 4 + [1] * 3, + "a": [10, 3, 4, 2, -3, 9, 10], + "b": [10, 23, -4, 2, -3, 9, 19], + }, + ["id", "a"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val": pa.array( + [None, None, None, None, None, None], type=pa.float64() + ), + }, + ["id"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val1": [None, 4, 6, 8, None, 2], + "val2": [4, 5, None, 2, 9, None], + }, + ["id"], + ), + ({"id": [1.0], "val1": [2.0], "val2": [3.0]}, ["id"]), + ], +) +@pytest.mark.parametrize("min_per", [0, 1, 2]) +def test_pearson_corr_passing(data, gkey, min_per): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + expected = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("method", ["kendall", "spearman"]) +def test_pearson_corr_unsupported_methods(method): + gdf = cudf.DataFrame( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + } + ) + + with pytest.raises( + NotImplementedError, + match="Only pearson correlation is currently supported", + ): + gdf.groupby("id").corr(method) + + +def test_pearson_corr_empty_columns(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + + actual = gdf.groupby("id").corr("pearson") + expected = pdf.groupby("id").corr("pearson") + + assert_eq( + expected, + actual, + check_dtype=False, + check_index_type=False, + ) diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py b/python/cudf/cudf/tests/series/methods/test_astype.py index fb1942fc64a..18b13f7c3d1 100644 --- a/python/cudf/cudf/tests/series/methods/test_astype.py +++ 
b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -529,3 +529,285 @@ def test_datetime_infer_format(data, timezone, datetime_types_as_str): with pytest.raises(NotImplementedError): # pandas doesn't allow parsing "Z" to naive type sr.astype(datetime_types_as_str) + + +@pytest.mark.parametrize( + "np_dtype,pd_dtype", + [ + tuple(item) + for item in cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.items() + ], +) +def test_series_astype_pandas_nullable( + all_supported_types_as_str, np_dtype, pd_dtype +): + source = cudf.Series([0, 1, None], dtype=all_supported_types_as_str) + + expect = source.astype(np_dtype) + got = source.astype(pd_dtype) + + assert_eq(expect, got) + + +def test_series_astype_numeric_to_numeric( + numeric_types_as_str, numeric_types_as_str2 +): + psr = pd.Series([1, 2, 4, 3], dtype=numeric_types_as_str) + gsr = cudf.from_pandas(psr) + assert_eq( + psr.astype(numeric_types_as_str2), gsr.astype(numeric_types_as_str2) + ) + + +def test_series_astype_numeric_to_numeric_nulls( + numeric_types_as_str, numeric_types_as_str2 +): + data = [1, 2, None, 3] + sr = cudf.Series(data, dtype=numeric_types_as_str) + got = sr.astype(numeric_types_as_str2) + expect = cudf.Series([1, 2, None, 3], dtype=numeric_types_as_str2) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "as_dtype", + [ + "str", + "category", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ], +) +def test_series_astype_numeric_to_other( + request, all_supported_types_as_str, as_dtype +): + if all_supported_types_as_str.startswith( + "timedelta64" + ) and as_dtype.startswith("datetime64"): + pytest.skip( + f"Casting {all_supported_types_as_str} to {as_dtype} is invalid" + ) + if all_supported_types_as_str == "str" and as_dtype.startswith( + "datetime64" + ): + pytest.skip( + f"Casting {all_supported_types_as_str} to {as_dtype} for test data is invalid." 
+ ) + request.applymarker( + pytest.mark.xfail( + all_supported_types_as_str + in {"timedelta64[us]", "timedelta64[ms]", "timedelta64[s]"} + and as_dtype == "str", + reason=f"Casting {all_supported_types_as_str} to {as_dtype} is incorrect.", + ) + ) + psr = pd.Series([1, 2, 3], dtype=all_supported_types_as_str) + gsr = cudf.from_pandas(psr) + assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) + + +@pytest.mark.parametrize( + "as_dtype", + [ + "str", + "int32", + "uint32", + "float32", + "category", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ], +) +def test_series_astype_string_to_other(as_dtype): + if "datetime64" in as_dtype: + data = ["2001-01-01", "2002-02-02", "2000-01-05"] + else: + data = ["1", "2", "3"] + psr = pd.Series(data) + gsr = cudf.from_pandas(psr) + assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) + + +@pytest.mark.parametrize( + "as_dtype", + [ + "category", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + ], +) +def test_series_astype_datetime_to_other(as_dtype): + data = ["2001-01-01", "2002-02-02", "2001-01-05"] + psr = pd.Series(data) + gsr = cudf.from_pandas(psr) + assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) + + +@pytest.mark.parametrize( + "inp", + [ + ("datetime64[ns]", "2011-01-01 00:00:00.000000000"), + ("datetime64[us]", "2011-01-01 00:00:00.000000"), + ("datetime64[ms]", "2011-01-01 00:00:00.000"), + ("datetime64[s]", "2011-01-01 00:00:00"), + ], +) +def test_series_astype_datetime_to_string(inp): + dtype, expect = inp + base_date = "2011-01-01" + sr = cudf.Series([base_date], dtype=dtype) + got = sr.astype(str)[0] + assert expect == got + + +@pytest.mark.parametrize( + "as_dtype", + [ + "int32", + "uint32", + "float32", + "category", + "datetime64[s]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[ns]", + "str", + ], +) +def test_series_astype_categorical_to_other(as_dtype): + if "datetime64" in as_dtype: + data = ["2001-01-01", "2002-02-02", "2000-01-05", "2001-01-01"] + else: + data = [1, 2, 3, 1] + psr = pd.Series(data, dtype="category") + gsr = cudf.from_pandas(psr) + assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) + + +def test_series_astype_to_categorical_ordered(categorical_ordered): + psr = pd.Series([1, 2, 3, 1], dtype="category") + gsr = cudf.from_pandas(psr) + + ordered_dtype_pd = pd.CategoricalDtype( + categories=[1, 2, 3], ordered=categorical_ordered + ) + ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) + assert_eq( + psr.astype("int32").astype(ordered_dtype_pd).astype("int32"), + gsr.astype("int32").astype(ordered_dtype_gd).astype("int32"), + ) + + +def test_series_astype_cat_ordered_to_unordered(categorical_ordered): + pd_dtype = pd.CategoricalDtype( + categories=[1, 2, 3], ordered=categorical_ordered + ) + pd_to_dtype = pd.CategoricalDtype( + categories=[1, 2, 3], ordered=not categorical_ordered + ) + gd_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) + gd_to_dtype = cudf.CategoricalDtype.from_pandas(pd_to_dtype) + + psr = pd.Series([1, 2, 3], dtype=pd_dtype) + gsr = cudf.Series([1, 2, 3], dtype=gd_dtype) + + expect = psr.astype(pd_to_dtype) + got = gsr.astype(gd_to_dtype) + + assert_eq(expect, got) + + +def test_series_astype_null_cases(): + data = [1, 2, None, 3] + + # numerical to other + assert_eq(cudf.Series(data, dtype="str"), cudf.Series(data).astype("str")) + + assert_eq( + cudf.Series(data, dtype="category"), + cudf.Series(data).astype("category"), + ) + + assert_eq( + cudf.Series(data, dtype="float32"), + 
cudf.Series(data, dtype="int32").astype("float32"), + ) + + assert_eq( + cudf.Series(data, dtype="float32"), + cudf.Series(data, dtype="uint32").astype("float32"), + ) + + assert_eq( + cudf.Series(data, dtype="datetime64[ms]"), + cudf.Series(data).astype("datetime64[ms]"), + ) + + # categorical to other + assert_eq( + cudf.Series(data, dtype="str"), + cudf.Series(data, dtype="category").astype("str"), + ) + + assert_eq( + cudf.Series(data, dtype="float32"), + cudf.Series(data, dtype="category").astype("float32"), + ) + + assert_eq( + cudf.Series(data, dtype="datetime64[ms]"), + cudf.Series(data, dtype="category").astype("datetime64[ms]"), + ) + + # string to other + assert_eq( + cudf.Series([1, 2, None, 3], dtype="int32"), + cudf.Series(["1", "2", None, "3"]).astype("int32"), + ) + + assert_eq( + cudf.Series( + ["2001-01-01", "2001-02-01", None, "2001-03-01"], + dtype="datetime64[ms]", + ), + cudf.Series(["2001-01-01", "2001-02-01", None, "2001-03-01"]).astype( + "datetime64[ms]" + ), + ) + + assert_eq( + cudf.Series(["a", "b", "c", None], dtype="category").to_pandas(), + cudf.Series(["a", "b", "c", None]).astype("category").to_pandas(), + ) + + # datetime to other + data = [ + "2001-01-01 00:00:00.000000", + "2001-02-01 00:00:00.000000", + None, + "2001-03-01 00:00:00.000000", + ] + assert_eq( + cudf.Series(data), + cudf.Series(data, dtype="datetime64[us]").astype("str"), + ) + + assert_eq( + pd.Series(data, dtype="datetime64[ns]").astype("category"), + cudf.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype( + "category" + ), + ) + + +def test_series_astype_null_categorical(): + sr = cudf.Series([None, None, None], dtype="category") + expect = cudf.Series([None, None, None], dtype="int32") + got = sr.astype("int32") + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index a562a66f312..c0aaf5a1b22 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -72,6 +72,18 @@ def test_create_interval_series(data1, data2, data3, data4, interval_closed): assert_eq(expect_three, got_three) +def test_from_pandas_for_series_nan_as_null(nan_as_null): + data = [np.nan, 2.0, 3.0] + psr = pd.Series(data) + + expected = cudf.Series._from_column( + as_column(data, nan_as_null=nan_as_null) + ) + got = cudf.from_pandas(psr, nan_as_null=nan_as_null) + + assert_eq(expected, got) + + @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 72601dd6a1a..8d2e7f9a707 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3,15 +3,11 @@ import array as arr import datetime import decimal -import functools import io import operator -import random import re -import string import textwrap import warnings -from collections import OrderedDict, defaultdict, namedtuple from contextlib import contextmanager from copy import copy @@ -20,7 +16,6 @@ import pandas as pd import pyarrow as pa import pytest -from numba import cuda from packaging import version import cudf @@ -866,33 +861,6 @@ def test_dataframe_shape_empty(): assert pdf.shape == gdf.shape -@pytest.mark.parametrize("num_cols", [1, 3]) -@pytest.mark.parametrize("num_rows", [1, 5]) -def test_dataframe_transpose_category(num_cols, num_rows): - pdf = pd.DataFrame() - - for i in range(num_cols): - colname = string.ascii_lowercase[i] - data = 
pd.Series(list(string.ascii_lowercase), dtype="category") - data = data.sample(num_rows, replace=True).reset_index(drop=True) - pdf[colname] = data - - gdf = cudf.DataFrame.from_pandas(pdf) - - got_function = gdf.transpose() - got_property = gdf.T - - expect = pdf.transpose() - - assert_eq(expect, got_function.to_pandas()) - assert_eq(expect, got_property.to_pandas()) - - -def test_generated_column(): - gdf = cudf.DataFrame({"a": (i for i in range(5))}) - assert len(gdf) == 5 - - @pytest.fixture def pdf(): return pd.DataFrame({"x": range(10), "y": range(10)}) @@ -1636,354 +1604,6 @@ def test_as_column_types(): assert_eq(pds, gds) -@pytest.mark.parametrize("dtype", ALL_TYPES) -@pytest.mark.parametrize( - "np_dtype,pd_dtype", - [ - tuple(item) - for item in cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.items() - ], -) -def test_series_astype_pandas_nullable(dtype, np_dtype, pd_dtype): - source = cudf.Series([0, 1, None], dtype=dtype) - - expect = source.astype(np_dtype) - got = source.astype(pd_dtype) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("as_dtype", NUMERIC_TYPES) -def test_series_astype_numeric_to_numeric(dtype, as_dtype): - psr = pd.Series([1, 2, 4, 3], dtype=dtype) - gsr = cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("as_dtype", NUMERIC_TYPES) -def test_series_astype_numeric_to_numeric_nulls(dtype, as_dtype): - data = [1, 2, None, 3] - sr = cudf.Series(data, dtype=dtype) - got = sr.astype(as_dtype) - expect = cudf.Series([1, 2, None, 3], dtype=as_dtype) - assert_eq(expect, got) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize( - "as_dtype", - [ - "str", - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - ], -) -def test_series_astype_numeric_to_other(dtype, as_dtype): - psr = pd.Series([1, 2, 3], dtype=dtype) - gsr = cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize( - "as_dtype", - [ - "str", - "int32", - "uint32", - "float32", - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - ], -) -def test_series_astype_string_to_other(as_dtype): - if "datetime64" in as_dtype: - data = ["2001-01-01", "2002-02-02", "2000-01-05"] - else: - data = ["1", "2", "3"] - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize( - "as_dtype", - [ - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - ], -) -def test_series_astype_datetime_to_other(as_dtype): - data = ["2001-01-01", "2002-02-02", "2001-01-05"] - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize( - "inp", - [ - ("datetime64[ns]", "2011-01-01 00:00:00.000000000"), - ("datetime64[us]", "2011-01-01 00:00:00.000000"), - ("datetime64[ms]", "2011-01-01 00:00:00.000"), - ("datetime64[s]", "2011-01-01 00:00:00"), - ], -) -def test_series_astype_datetime_to_string(inp): - dtype, expect = inp - base_date = "2011-01-01" - sr = cudf.Series([base_date], dtype=dtype) - got = sr.astype(str)[0] - assert expect == got - - -@pytest.mark.parametrize( - "as_dtype", - [ - "int32", - "uint32", - "float32", - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - "str", - ], 
-) -def test_series_astype_categorical_to_other(as_dtype): - if "datetime64" in as_dtype: - data = ["2001-01-01", "2002-02-02", "2000-01-05", "2001-01-01"] - else: - data = [1, 2, 3, 1] - psr = pd.Series(data, dtype="category") - gsr = cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_series_astype_to_categorical_ordered(ordered): - psr = pd.Series([1, 2, 3, 1], dtype="category") - gsr = cudf.from_pandas(psr) - - ordered_dtype_pd = pd.CategoricalDtype( - categories=[1, 2, 3], ordered=ordered - ) - ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) - assert_eq( - psr.astype("int32").astype(ordered_dtype_pd).astype("int32"), - gsr.astype("int32").astype(ordered_dtype_gd).astype("int32"), - ) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_series_astype_cat_ordered_to_unordered(ordered): - pd_dtype = pd.CategoricalDtype(categories=[1, 2, 3], ordered=ordered) - pd_to_dtype = pd.CategoricalDtype( - categories=[1, 2, 3], ordered=not ordered - ) - gd_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) - gd_to_dtype = cudf.CategoricalDtype.from_pandas(pd_to_dtype) - - psr = pd.Series([1, 2, 3], dtype=pd_dtype) - gsr = cudf.Series([1, 2, 3], dtype=gd_dtype) - - expect = psr.astype(pd_to_dtype) - got = gsr.astype(gd_to_dtype) - - assert_eq(expect, got) - - -def test_series_astype_null_cases(): - data = [1, 2, None, 3] - - # numerical to other - assert_eq(cudf.Series(data, dtype="str"), cudf.Series(data).astype("str")) - - assert_eq( - cudf.Series(data, dtype="category"), - cudf.Series(data).astype("category"), - ) - - assert_eq( - cudf.Series(data, dtype="float32"), - cudf.Series(data, dtype="int32").astype("float32"), - ) - - assert_eq( - cudf.Series(data, dtype="float32"), - cudf.Series(data, dtype="uint32").astype("float32"), - ) - - assert_eq( - cudf.Series(data, dtype="datetime64[ms]"), - cudf.Series(data).astype("datetime64[ms]"), - ) - - # categorical to other - assert_eq( - cudf.Series(data, dtype="str"), - cudf.Series(data, dtype="category").astype("str"), - ) - - assert_eq( - cudf.Series(data, dtype="float32"), - cudf.Series(data, dtype="category").astype("float32"), - ) - - assert_eq( - cudf.Series(data, dtype="datetime64[ms]"), - cudf.Series(data, dtype="category").astype("datetime64[ms]"), - ) - - # string to other - assert_eq( - cudf.Series([1, 2, None, 3], dtype="int32"), - cudf.Series(["1", "2", None, "3"]).astype("int32"), - ) - - assert_eq( - cudf.Series( - ["2001-01-01", "2001-02-01", None, "2001-03-01"], - dtype="datetime64[ms]", - ), - cudf.Series(["2001-01-01", "2001-02-01", None, "2001-03-01"]).astype( - "datetime64[ms]" - ), - ) - - assert_eq( - cudf.Series(["a", "b", "c", None], dtype="category").to_pandas(), - cudf.Series(["a", "b", "c", None]).astype("category").to_pandas(), - ) - - # datetime to other - data = [ - "2001-01-01 00:00:00.000000", - "2001-02-01 00:00:00.000000", - None, - "2001-03-01 00:00:00.000000", - ] - assert_eq( - cudf.Series(data), - cudf.Series(data, dtype="datetime64[us]").astype("str"), - ) - - assert_eq( - pd.Series(data, dtype="datetime64[ns]").astype("category"), - cudf.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype( - "category" - ), - ) - - -def test_series_astype_null_categorical(): - sr = cudf.Series([None, None, None], dtype="category") - expect = cudf.Series([None, None, None], dtype="int32") - got = sr.astype("int32") - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ( - 
pd.Series([3, 3.0]), - pd.Series([2.3, 3.9]), - pd.Series([1.5, 3.9]), - pd.Series([1.0, 2]), - ), - [ - pd.Series([3, 3.0]), - pd.Series([2.3, 3.9]), - pd.Series([1.5, 3.9]), - pd.Series([1.0, 2]), - ], - ], -) -def test_create_dataframe_from_list_like(data): - pdf = pd.DataFrame(data, index=["count", "mean", "std", "min"]) - gdf = cudf.DataFrame(data, index=["count", "mean", "std", "min"]) - - assert_eq(pdf, gdf) - - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - assert_eq(pdf, gdf) - - -def test_create_dataframe_column(): - pdf = pd.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"]) - gdf = cudf.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"]) - - assert_eq(pdf, gdf) - - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": [2, 3, 5]}, - columns=["a", "b", "c"], - index=["A", "Z", "X"], - ) - gdf = cudf.DataFrame( - {"a": [1, 2, 3], "b": [2, 3, 5]}, - columns=["a", "b", "c"], - index=["A", "Z", "X"], - ) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "data", - [ - pd.DataFrame(np.eye(2)), - cudf.DataFrame(np.eye(2)), - np.eye(2), - cupy.eye(2), - None, - [[1, 0], [0, 1]], - [cudf.Series([0, 1]), cudf.Series([1, 0])], - ], -) -@pytest.mark.parametrize( - "columns", - [None, range(2), pd.RangeIndex(2), cudf.RangeIndex(2)], -) -def test_dataframe_columns_returns_rangeindex(data, columns): - if data is None and columns is None: - pytest.skip(f"{data=} and {columns=} not relevant.") - result = cudf.DataFrame(data=data, columns=columns).columns - expected = pd.RangeIndex(range(2)) - assert_eq(result, expected) - - -def test_dataframe_columns_returns_rangeindex_single_col(): - result = cudf.DataFrame([1, 2, 3]).columns - expected = pd.RangeIndex(range(1)) - assert_eq(result, expected) - - -@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"]) -@pytest.mark.parametrize("idx_data", [[], [1, 2]]) -@pytest.mark.parametrize("data", [None, [], {}]) -def test_dataframe_columns_empty_data_preserves_dtype(dtype, idx_data, data): - result = cudf.DataFrame( - data, columns=cudf.Index(idx_data, dtype=dtype) - ).columns - expected = pd.Index(idx_data, dtype=dtype) - assert_eq(result, expected) - - @pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"]) def test_dataframe_astype_preserves_column_dtype(dtype): result = cudf.DataFrame([1], columns=cudf.Index([1], dtype=dtype)) @@ -3227,45 +2847,6 @@ def test_tupleize_cols_False_set(): assert_eq(pdf.columns, gdf.columns) -def test_init_multiindex_from_dict(): - pdf = pd.DataFrame({("a", "b"): [1]}) - gdf = cudf.DataFrame({("a", "b"): [1]}) - assert_eq(pdf, gdf) - assert_eq(pdf.columns, gdf.columns) - - -def test_change_column_dtype_in_empty(): - pdf = pd.DataFrame({"a": [], "b": []}) - gdf = cudf.from_pandas(pdf) - assert_eq(pdf, gdf) - pdf["b"] = pdf["b"].astype("int64") - gdf["b"] = gdf["b"].astype("int64") - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("dtype", ["int64", "str"]) -def test_dataframe_from_dictionary_series_same_name_index(dtype): - pd_idx1 = pd.Index([1, 2, 0], name="test_index").astype(dtype) - pd_idx2 = pd.Index([2, 0, 1], name="test_index").astype(dtype) - pd_series1 = pd.Series([1, 2, 3], index=pd_idx1) - pd_series2 = pd.Series([1, 2, 3], index=pd_idx2) - - gd_idx1 = cudf.from_pandas(pd_idx1) - gd_idx2 = cudf.from_pandas(pd_idx2) - gd_series1 = cudf.Series([1, 2, 3], index=gd_idx1) - gd_series2 = cudf.Series([1, 2, 3], index=gd_idx2) - - expect = pd.DataFrame({"a": pd_series1, "b": pd_series2}) - got = cudf.DataFrame({"a": gd_series1, "b": gd_series2}) - - if dtype == "str": 
- # Pandas actually loses its index name erroneously here... - expect.index.name = "test_index" - - assert_eq(expect, got) - assert expect.index.names == got.index.names - - @pytest.mark.parametrize( "arg", [slice(2, 8, 3), slice(1, 20, 4), slice(-2, -6, -2)] ) @@ -3288,732 +2869,9 @@ def test_dataframe_strided_slice(arg): assert_eq(expect, got) +@pytest_unmark_spilling @pytest.mark.parametrize( - "data,condition,other,error", - [ - (pd.Series(range(5)), pd.Series(range(5)) > 0, None, None), - (pd.Series(range(5)), pd.Series(range(5)) > 1, None, None), - (pd.Series(range(5)), pd.Series(range(5)) > 1, 10, None), - ( - pd.Series(range(5)), - pd.Series(range(5)) > 1, - pd.Series(range(5, 10)), - None, - ), - ( - pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]), - ( - pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]) - % 3 - ) - == 0, - -pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]), - None, - ), - ( - pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}), - pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}) == 4, - None, - None, - ), - ( - pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}), - pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}) != 4, - None, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [True, True, True], - None, - ValueError, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [True, True, True, False], - None, - ValueError, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [[True, True, True, False], [True, True, True, False]], - None, - ValueError, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [[True, True], [False, True], [True, False], [False, True]], - None, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - cuda.to_device( - np.array( - [[True, True], [False, True], [True, False], [False, True]] - ) - ), - None, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - cupy.array( - [[True, True], [False, True], [True, False], [False, True]] - ), - 17, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [[True, True], [False, True], [True, False], [False, True]], - 17, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [ - [True, True, False, True], - [True, True, False, True], - [True, True, False, True], - [True, True, False, True], - ], - None, - ValueError, - ), - ( - pd.Series([1, 2, np.nan]), - pd.Series([1, 2, np.nan]) == 4, - None, - None, - ), - ( - pd.Series([1, 2, np.nan]), - pd.Series([1, 2, np.nan]) != 4, - None, - None, - ), - ( - pd.Series([4, np.nan, 6]), - pd.Series([4, np.nan, 6]) == 4, - None, - None, - ), - ( - pd.Series([4, np.nan, 6]), - pd.Series([4, np.nan, 6]) != 4, - None, - None, - ), - ( - pd.Series([4, np.nan, 6], dtype="category"), - pd.Series([4, np.nan, 6], dtype="category") != 4, - None, - None, - ), - ( - pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category"), - pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category") == "b", - None, - None, - ), - ( - pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category"), - pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category") == "b", - "s", - None, - ), - ( - pd.Series([1, 2, 3, 2, 5]), - pd.Series([1, 2, 3, 2, 5]) == 2, - pd.DataFrame( - { - "a": pd.Series([1, 2, 3, 2, 5]), - "b": pd.Series([1, 2, 3, 2, 5]), - } - ), - NotImplementedError, - ), - ], -) -@pytest.mark.parametrize("inplace", [True, 
False]) -def test_df_sr_mask_where(data, condition, other, error, inplace): - ps_where = data - gs_where = cudf.from_pandas(data) - - ps_mask = ps_where.copy(deep=True) - gs_mask = gs_where.copy(deep=True) - - if hasattr(condition, "__cuda_array_interface__"): - if type(condition).__module__.split(".")[0] == "cupy": - ps_condition = cupy.asnumpy(condition) - else: - ps_condition = np.array(condition).astype("bool") - else: - ps_condition = condition - - if type(condition).__module__.split(".")[0] == "pandas": - gs_condition = cudf.from_pandas(condition) - else: - gs_condition = condition - - ps_other = other - if type(other).__module__.split(".")[0] == "pandas": - gs_other = cudf.from_pandas(other) - else: - gs_other = other - - if error is None: - expect_where = ps_where.where( - ps_condition, other=ps_other, inplace=inplace - ) - got_where = gs_where.where( - gs_condition, other=gs_other, inplace=inplace - ) - - expect_mask = ps_mask.mask( - ps_condition, other=ps_other, inplace=inplace - ) - got_mask = gs_mask.mask(gs_condition, other=gs_other, inplace=inplace) - - if inplace: - expect_where = ps_where - got_where = gs_where - - expect_mask = ps_mask - got_mask = gs_mask - - if isinstance(expect_where, pd.Series) and isinstance( - expect_where.dtype, pd.CategoricalDtype - ): - np.testing.assert_array_equal( - expect_where.cat.codes, - got_where.cat.codes.astype(expect_where.cat.codes.dtype) - .fillna(-1) - .to_numpy(), - ) - assert_eq(expect_where.cat.categories, got_where.cat.categories) - - np.testing.assert_array_equal( - expect_mask.cat.codes, - got_mask.cat.codes.astype(expect_mask.cat.codes.dtype) - .fillna(-1) - .to_numpy(), - ) - assert_eq(expect_mask.cat.categories, got_mask.cat.categories) - else: - assert_eq( - expect_where.fillna(-1), - got_where.fillna(-1), - check_dtype=False, - ) - assert_eq( - expect_mask.fillna(-1), got_mask.fillna(-1), check_dtype=False - ) - else: - assert_exceptions_equal( - lfunc=ps_where.where, - rfunc=gs_where.where, - lfunc_args_and_kwargs=( - [ps_condition], - {"other": ps_other, "inplace": inplace}, - ), - rfunc_args_and_kwargs=( - [gs_condition], - {"other": gs_other, "inplace": inplace}, - ), - ) - - assert_exceptions_equal( - lfunc=ps_mask.mask, - rfunc=gs_mask.mask, - lfunc_args_and_kwargs=( - [ps_condition], - {"other": ps_other, "inplace": inplace}, - ), - rfunc_args_and_kwargs=( - [gs_condition], - {"other": gs_other, "inplace": inplace}, - ), - ) - - -@pytest.mark.parametrize( - "data,condition,other,has_cat", - [ - ( - pd.DataFrame( - { - "a": pd.Series(["a", "a", "b", "c", "a", "d", "d", "a"]), - "b": pd.Series(["o", "p", "q", "e", "p", "p", "a", "a"]), - } - ), - pd.DataFrame( - { - "a": pd.Series(["a", "a", "b", "c", "a", "d", "d", "a"]), - "b": pd.Series(["o", "p", "q", "e", "p", "p", "a", "a"]), - } - ) - != "a", - None, - None, - ), - ( - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ) - != "a", - None, - True, - ), - ( - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", 
"a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ) - == "a", - None, - True, - ), - ( - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ) - != "a", - "a", - True, - ), - ( - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ) - == "a", - "a", - True, - ), - ], -) -def test_df_string_cat_types_mask_where(data, condition, other, has_cat): - ps = data - gs = cudf.from_pandas(data) - - ps_condition = condition - if type(condition).__module__.split(".")[0] == "pandas": - gs_condition = cudf.from_pandas(condition) - else: - gs_condition = condition - - ps_other = other - if type(other).__module__.split(".")[0] == "pandas": - gs_other = cudf.from_pandas(other) - else: - gs_other = other - - expect_where = ps.where(ps_condition, other=ps_other) - got_where = gs.where(gs_condition, other=gs_other) - - expect_mask = ps.mask(ps_condition, other=ps_other) - got_mask = gs.mask(gs_condition, other=gs_other) - - if has_cat is None: - assert_eq( - expect_where.fillna(-1).astype("str"), - got_where.fillna(-1), - check_dtype=False, - ) - assert_eq( - expect_mask.fillna(-1).astype("str"), - got_mask.fillna(-1), - check_dtype=False, - ) - else: - assert_eq(expect_where, got_where, check_dtype=False) - assert_eq(expect_mask, got_mask, check_dtype=False) - - -@pytest.mark.parametrize( - "data,expected_upcast_type,error", - [ - ( - pd.Series([random.random() for _ in range(10)], dtype="float32"), - np.dtype("float32"), - None, - ), - ( - pd.Series([random.random() for _ in range(10)], dtype="float16"), - None, - TypeError, - ), - ( - pd.Series([random.random() for _ in range(10)], dtype="float64"), - np.dtype("float64"), - None, - ), - ( - pd.Series([random.random() for _ in range(10)], dtype="float128"), - None, - ValueError, - ), - ], -) -def test_from_pandas_unsupported_types(data, expected_upcast_type, error): - pdf = pd.DataFrame({"one_col": data}) - if error is not None: - with pytest.raises(error): - cudf.from_pandas(data) - - with pytest.raises(error): - cudf.Series(data) - - with pytest.raises(error): - cudf.from_pandas(pdf) - - with pytest.raises(error): - cudf.DataFrame(pdf) - else: - df = cudf.from_pandas(data) - - assert_eq(data, df, check_dtype=False) - assert df.dtype == expected_upcast_type - - df = cudf.Series(data) - assert_eq(data, df, check_dtype=False) - assert df.dtype == expected_upcast_type - - df = cudf.from_pandas(pdf) - assert_eq(pdf, df, check_dtype=False) - assert df["one_col"].dtype == expected_upcast_type - - df = cudf.DataFrame(pdf) - assert_eq(pdf, df, check_dtype=False) - assert df["one_col"].dtype == expected_upcast_type - - -@pytest.mark.parametrize("nan_as_null", [True, False]) -@pytest.mark.parametrize("index", [None, "a", ["a", "b"]]) -def test_from_pandas_nan_as_null(nan_as_null, index): - data 
= [np.nan, 2.0, 3.0] - - if index is None: - pdf = pd.DataFrame({"a": data, "b": data}) - expected = cudf.DataFrame( - { - "a": as_column(data, nan_as_null=nan_as_null), - "b": as_column(data, nan_as_null=nan_as_null), - } - ) - else: - pdf = pd.DataFrame({"a": data, "b": data}).set_index(index) - expected = cudf.DataFrame( - { - "a": as_column(data, nan_as_null=nan_as_null), - "b": as_column(data, nan_as_null=nan_as_null), - } - ) - expected = cudf.DataFrame( - { - "a": as_column(data, nan_as_null=nan_as_null), - "b": as_column(data, nan_as_null=nan_as_null), - } - ) - expected = expected.set_index(index) - - got = cudf.from_pandas(pdf, nan_as_null=nan_as_null) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_from_pandas_for_series_nan_as_null(nan_as_null): - data = [np.nan, 2.0, 3.0] - psr = pd.Series(data) - - expected = cudf.Series._from_column( - as_column(data, nan_as_null=nan_as_null) - ) - got = cudf.from_pandas(psr, nan_as_null=nan_as_null) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_df_series_dataframe_astype_copy(copy): - gdf = cudf.DataFrame({"col1": [1, 2], "col2": [3, 4]}) - pdf = gdf.to_pandas() - - assert_eq( - gdf.astype(dtype="float", copy=copy), - pdf.astype(dtype="float", copy=copy), - ) - assert_eq(gdf, pdf) - - gsr = cudf.Series([1, 2]) - psr = gsr.to_pandas() - - assert_eq( - gsr.astype(dtype="float", copy=copy), - psr.astype(dtype="float", copy=copy), - ) - assert_eq(gsr, psr) - - gsr = cudf.Series([1, 2]) - psr = gsr.to_pandas() - - actual = gsr.astype(dtype="int64", copy=copy) - expected = psr.astype(dtype="int64", copy=copy) - assert_eq(expected, actual) - assert_eq(gsr, psr) - actual[0] = 3 - expected[0] = 3 - assert_eq(gsr, psr) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_df_series_dataframe_astype_dtype_dict(copy): - gdf = cudf.DataFrame({"col1": [1, 2], "col2": [3, 4]}) - pdf = gdf.to_pandas() - - assert_eq( - gdf.astype(dtype={"col1": "float"}, copy=copy), - pdf.astype(dtype={"col1": "float"}, copy=copy), - ) - assert_eq(gdf, pdf) - - gsr = cudf.Series([1, 2]) - psr = gsr.to_pandas() - - assert_eq( - gsr.astype(dtype={None: "float"}, copy=copy), - psr.astype(dtype={None: "float"}, copy=copy), - ) - assert_eq(gsr, psr) - - assert_exceptions_equal( - lfunc=psr.astype, - rfunc=gsr.astype, - lfunc_args_and_kwargs=([], {"dtype": {"a": "float"}, "copy": copy}), - rfunc_args_and_kwargs=([], {"dtype": {"a": "float"}, "copy": copy}), - ) - - gsr = cudf.Series([1, 2]) - psr = gsr.to_pandas() - - actual = gsr.astype({None: "int64"}, copy=copy) - expected = psr.astype({None: "int64"}, copy=copy) - assert_eq(expected, actual) - assert_eq(gsr, psr) - - actual[0] = 3 - expected[0] = 3 - assert_eq(gsr, psr) - - -@pytest.mark.parametrize( - "data,columns", - [ - ([1, 2, 3, 100, 112, 35464], ["a"]), - (range(100), None), - ( - [], - None, - ), - ((-10, 21, 32, 32, 1, 2, 3), ["p"]), - ( - (), - None, - ), - ([[1, 2, 3], [1, 2, 3]], ["col1", "col2", "col3"]), - ([range(100), range(100)], ["range" + str(i) for i in range(100)]), - (((1, 2, 3), (1, 2, 3)), ["tuple0", "tuple1", "tuple2"]), - ([[1, 2, 3]], ["list col1", "list col2", "list col3"]), - ([[1, 2, 3]], pd.Index(["col1", "col2", "col3"], name="rapids")), - ([range(100)], ["range" + str(i) for i in range(100)]), - (((1, 2, 3),), ["k1", "k2", "k3"]), - ], -) -def test_dataframe_init_1d_list(data, columns): - expect = pd.DataFrame(data, columns=columns) - actual = cudf.DataFrame(data, columns=columns) - - 
assert_eq( - expect, - actual, - check_index_type=len(data) != 0, - ) - - expect = pd.DataFrame(data, columns=None) - actual = cudf.DataFrame(data, columns=None) - - assert_eq( - expect, - actual, - check_index_type=len(data) != 0, - ) - - -@pytest.mark.parametrize( - "data,cols,index", - [ - ( - np.ndarray(shape=(4, 2), dtype=float, order="F"), - ["a", "b"], - ["a", "b", "c", "d"], - ), - ( - np.ndarray(shape=(4, 2), dtype=float, order="F"), - ["a", "b"], - [0, 20, 30, 10], - ), - ( - np.ndarray(shape=(4, 2), dtype=float, order="F"), - ["a", "b"], - [0, 1, 2, 3], - ), - (np.array([11, 123, -2342, 232]), ["a"], [1, 2, 11, 12]), - (np.array([11, 123, -2342, 232]), ["a"], ["khsdjk", "a", "z", "kk"]), - ( - cupy.ndarray(shape=(4, 2), dtype=float, order="F"), - ["a", "z"], - ["a", "z", "a", "z"], - ), - (cupy.array([11, 123, -2342, 232]), ["z"], [0, 1, 1, 0]), - (cupy.array([11, 123, -2342, 232]), ["z"], [1, 2, 3, 4]), - (cupy.array([11, 123, -2342, 232]), ["z"], ["a", "z", "d", "e"]), - ( - np.random.default_rng(seed=0).standard_normal(size=(2, 4)), - ["a", "b", "c", "d"], - ["a", "b"], - ), - ( - np.random.default_rng(seed=0).standard_normal(size=(2, 4)), - ["a", "b", "c", "d"], - [1, 0], - ), - (cupy.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]), - (cupy.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]), - ], -) -def test_dataframe_init_from_arrays_cols(data, cols, index): - gd_data = data - if isinstance(data, cupy.ndarray): - # pandas can't handle cupy arrays in general - pd_data = data.get() - - # additional test for building DataFrame with gpu array whose - # cuda array interface has no `descr` attribute - numba_data = cuda.as_cuda_array(data) - else: - pd_data = data - numba_data = None - - # verify with columns & index - pdf = pd.DataFrame(pd_data, columns=cols, index=index) - gdf = cudf.DataFrame(gd_data, columns=cols, index=index) - - assert_eq(pdf, gdf, check_dtype=False) - - # verify with columns - pdf = pd.DataFrame(pd_data, columns=cols) - gdf = cudf.DataFrame(gd_data, columns=cols) - - assert_eq(pdf, gdf, check_dtype=False) - - pdf = pd.DataFrame(pd_data) - gdf = cudf.DataFrame(gd_data) - - assert_eq(pdf, gdf, check_dtype=False) - - if numba_data is not None: - gdf = cudf.DataFrame(numba_data) - assert_eq(pdf, gdf, check_dtype=False) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "col_data", + "col_data", [ range(5), ["a", "b", "x", "y", "z"], @@ -4309,223 +3167,57 @@ def test_dataframe_info_null_counts(): df = cudf.DataFrame( { - "a": [1, 2, 3, None, 10, 11, 12, None], - "b": ["a", "b", "c", "sd", "sdf", "sd", None, None], - } - ) - - str_cmp = textwrap.dedent( - """\ - - RangeIndex: 8 entries, 0 to 7 - Data columns (total 2 columns): - # Column Dtype - --- ------ ----- - 0 a int64 - 1 b object - dtypes: int64(1), object(1) - memory usage: 238.0+ bytes - """ - ) - pd.options.display.max_info_rows = 2 - df.info(buf=buffer, max_cols=2, null_counts=None) - pd.reset_option("display.max_info_rows") - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - str_cmp = textwrap.dedent( - """\ - - RangeIndex: 8 entries, 0 to 7 - Data columns (total 2 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 a 6 non-null int64 - 1 b 6 non-null object - dtypes: int64(1), object(1) - memory usage: 238.0+ bytes - """ - ) - - df.info(buf=buffer, max_cols=2, null_counts=None) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - 
df.info(buf=buffer, null_counts=True) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - -@pytest.mark.parametrize( - "orient", ["dict", "list", "split", "tight", "records", "index", "series"] -) -@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) -def test_dataframe_to_dict(orient, into): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [9, 5, 3]}, index=[10, 11, 12]) - pdf = df.to_pandas() - - actual = df.to_dict(orient=orient, into=into) - expected = pdf.to_dict(orient=orient, into=into) - if orient == "series": - assert actual.keys() == expected.keys() - for key in actual.keys(): - assert_eq(expected[key], actual[key]) - else: - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data, orient, dtype, columns", - [ - ( - {"col_1": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]}, - "columns", - None, - None, - ), - ({"col_1": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]}, "index", None, None), - ( - {"col_1": [None, 2, 1, 0], "col_2": [3, None, 1, 0]}, - "index", - None, - ["A", "B", "C", "D"], - ), - ( - { - "col_1": ["ab", "cd", "ef", "gh"], - "col_2": ["zx", "one", "two", "three"], - }, - "index", - None, - ["A", "B", "C", "D"], - ), - ( - { - "index": [("a", "b"), ("a", "c")], - "columns": [("x", 1), ("y", 2)], - "data": [[1, 3], [2, 4]], - "index_names": ["n1", "n2"], - "column_names": ["z1", "z2"], - }, - "tight", - "float64", - None, - ), - ], -) -def test_dataframe_from_dict(data, orient, dtype, columns): - expected = pd.DataFrame.from_dict( - data=data, orient=orient, dtype=dtype, columns=columns - ) - - actual = cudf.DataFrame.from_dict( - data=data, orient=orient, dtype=dtype, columns=columns - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("dtype", ["int64", "str", None]) -def test_dataframe_from_dict_transposed(dtype): - pd_data = {"a": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]} - gd_data = {key: cudf.Series(val) for key, val in pd_data.items()} - - expected = pd.DataFrame.from_dict(pd_data, orient="index", dtype=dtype) - actual = cudf.DataFrame.from_dict(gd_data, orient="index", dtype=dtype) - - gd_data = {key: cupy.asarray(val) for key, val in pd_data.items()} - actual = cudf.DataFrame.from_dict(gd_data, orient="index", dtype=dtype) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "pd_data, gd_data, orient, dtype, columns", - [ - ( - {"col_1": np.array([3, 2, 1, 0]), "col_2": np.array([3, 2, 1, 0])}, - { - "col_1": cupy.array([3, 2, 1, 0]), - "col_2": cupy.array([3, 2, 1, 0]), - }, - "columns", - None, - None, - ), - ( - {"col_1": np.array([3, 2, 1, 0]), "col_2": np.array([3, 2, 1, 0])}, - { - "col_1": cupy.array([3, 2, 1, 0]), - "col_2": cupy.array([3, 2, 1, 0]), - }, - "index", - None, - None, - ), - ( - { - "col_1": np.array([None, 2, 1, 0]), - "col_2": np.array([3, None, 1, 0]), - }, - { - "col_1": cupy.array([np.nan, 2, 1, 0]), - "col_2": cupy.array([3, np.nan, 1, 0]), - }, - "index", - None, - ["A", "B", "C", "D"], - ), - ( - { - "col_1": np.array(["ab", "cd", "ef", "gh"]), - "col_2": np.array(["zx", "one", "two", "three"]), - }, - { - "col_1": np.array(["ab", "cd", "ef", "gh"]), - "col_2": np.array(["zx", "one", "two", "three"]), - }, - "index", - None, - ["A", "B", "C", "D"], - ), - ( - { - "index": [("a", "b"), ("a", "c")], - "columns": [("x", 1), ("y", 2)], - "data": [np.array([1, 3]), np.array([2, 4])], - "index_names": ["n1", "n2"], - "column_names": ["z1", "z2"], - }, - { - "index": [("a", "b"), ("a", "c")], - "columns": [("x", 1), ("y", 2)], - "data": [cupy.array([1, 3]), cupy.array([2, 4])], - 
"index_names": ["n1", "n2"], - "column_names": ["z1", "z2"], - }, - "tight", - "float64", - None, - ), - ], -) -def test_dataframe_from_dict_cp_np_arrays( - pd_data, gd_data, orient, dtype, columns -): - expected = pd.DataFrame.from_dict( - data=pd_data, orient=orient, dtype=dtype, columns=columns + "a": [1, 2, 3, None, 10, 11, 12, None], + "b": ["a", "b", "c", "sd", "sdf", "sd", None, None], + } + ) + + str_cmp = textwrap.dedent( + """\ + + RangeIndex: 8 entries, 0 to 7 + Data columns (total 2 columns): + # Column Dtype + --- ------ ----- + 0 a int64 + 1 b object + dtypes: int64(1), object(1) + memory usage: 238.0+ bytes + """ ) + pd.options.display.max_info_rows = 2 + df.info(buf=buffer, max_cols=2, null_counts=None) + pd.reset_option("display.max_info_rows") + actual_string = buffer.getvalue() + assert str_cmp == actual_string + + buffer.truncate(0) + buffer.seek(0) - actual = cudf.DataFrame.from_dict( - data=gd_data, orient=orient, dtype=dtype, columns=columns + str_cmp = textwrap.dedent( + """\ + + RangeIndex: 8 entries, 0 to 7 + Data columns (total 2 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 6 non-null int64 + 1 b 6 non-null object + dtypes: int64(1), object(1) + memory usage: 238.0+ bytes + """ ) - assert_eq(expected, actual, check_dtype=dtype is not None) + df.info(buf=buffer, max_cols=2, null_counts=None) + actual_string = buffer.getvalue() + assert str_cmp == actual_string + + buffer.truncate(0) + buffer.seek(0) + + df.info(buf=buffer, null_counts=True) + actual_string = buffer.getvalue() + assert str_cmp == actual_string @pytest.mark.parametrize( @@ -6376,402 +5068,53 @@ def test_rename_for_level_is_None_MC(): ], ) def test_explode(data, labels, ignore_index, p_index, label_to_explode): - pdf = pd.DataFrame(data, index=p_index, columns=labels) - gdf = cudf.from_pandas(pdf) - - expect = pdf.explode(label_to_explode, ignore_index) - got = gdf.explode(label_to_explode, ignore_index) - - assert_eq(expect, got, check_dtype=False) - - -def test_explode_preserve_categorical(): - gdf = cudf.DataFrame( - { - "A": [[1, 2], None, [2, 3]], - "B": cudf.Series([0, 1, 2], dtype="category"), - } - ) - result = gdf.explode("A") - expected = cudf.DataFrame( - { - "A": [1, 2, None, 2, 3], - "B": cudf.Series([0, 0, 1, 2, 2], dtype="category"), - } - ) - expected.index = cudf.Index([0, 0, 1, 2, 2]) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "data,ascending,expected_data", - [ - ( - {"a": [10, 0, 2], "b": [-10, 10, 1]}, - True, - [1, 2, 0], - ), - ( - {"a": [10, 0, 2], "b": [-10, 10, 1]}, - False, - [0, 2, 1], - ), - ], -) -def test_dataframe_argsort(data, ascending, expected_data): - actual = cudf.DataFrame(data).argsort(ascending=ascending) - expected = cupy.array(expected_data, dtype="int32") - - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data,columns,index", - [ - (pd.Series([1, 2, 3]), None, None), - (pd.Series(["a", "b", None, "c"], name="abc"), None, None), - ( - pd.Series(["a", "b", None, "c"], name="abc"), - ["abc", "b"], - [1, 2, 3], - ), - ], -) -def test_dataframe_init_from_series(data, columns, index): - expected = pd.DataFrame(data, columns=columns, index=index) - actual = cudf.DataFrame(data, columns=columns, index=index) - - assert_eq( - expected, - actual, - check_index_type=len(expected) != 0, - ) - - -def test_frame_series_where(): - gdf = cudf.DataFrame( - {"a": [1.0, 2.0, None, 3.0, None], "b": [None, 10.0, 11.0, None, 23.0]} - ) - pdf = gdf.to_pandas() - expected = gdf.where(gdf.notna(), gdf.mean()) - 
actual = pdf.where(pdf.notna(), pdf.mean(), axis=1) - assert_eq(expected, actual) - - -def test_frame_series_where_other(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [1, 1, 0]}) - pdf = gdf.to_pandas() - - expected = gdf.where(gdf["b"] == 1, cudf.NA) - actual = pdf.where(pdf["b"] == 1, pd.NA) - assert_eq( - actual.fillna(-1).values, - expected.fillna(-1).values, - check_dtype=False, - ) - - expected = gdf.where(gdf["b"] == 1, 0) - actual = pdf.where(pdf["b"] == 1, 0) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data, gkey", - [ - ( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - }, - ["id", "val1", "val2"], - ), - ( - { - "id": [0] * 4 + [1] * 3, - "a": [10, 3, 4, 2, -3, 9, 10], - "b": [10, 23, -4, 2, -3, 9, 19], - }, - ["id", "a"], - ), - ( - { - "id": ["a", "a", "b", "b", "c", "c"], - "val": pa.array( - [None, None, None, None, None, None], type=pa.float64() - ), - }, - ["id"], - ), - ( - { - "id": ["a", "a", "b", "b", "c", "c"], - "val1": [None, 4, 6, 8, None, 2], - "val2": [4, 5, None, 2, 9, None], - }, - ["id"], - ), - ({"id": [1.0], "val1": [2.0], "val2": [3.0]}, ["id"]), - ], -) -@pytest.mark.parametrize("min_per", [0, 1, 2]) -def test_pearson_corr_passing(data, gkey, min_per): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - actual = gdf.groupby(gkey).corr(method="pearson", min_periods=min_per) - expected = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("method", ["kendall", "spearman"]) -def test_pearson_corr_unsupported_methods(method): - gdf = cudf.DataFrame( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - } - ) - - with pytest.raises( - NotImplementedError, - match="Only pearson correlation is currently supported", - ): - gdf.groupby("id").corr(method) - - -def test_pearson_corr_empty_columns(): - gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) - pdf = gdf.to_pandas() - - actual = gdf.groupby("id").corr("pearson") - expected = pdf.groupby("id").corr("pearson") - - assert_eq( - expected, - actual, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "data", - [ - ["v", "n", "k", "l", "m", "i", "y", "r", "w"], - [1, 1, 1, 2, 2, 2, 3, 3, 3], - ], -) -@pytest.mark.parametrize("gkey", ["id", "val1", "val2"]) -def test_pearson_corr_invalid_column_types(data, gkey): - gdf = cudf.DataFrame( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": data, - "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], - } - ) - with pytest.raises( - TypeError, - match="Correlation accepts only numerical column-pairs", - ): - gdf.groupby(gkey).corr("pearson") - - -def test_pearson_corr_multiindex_dataframe(): - gdf = cudf.DataFrame( - {"a": [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [2, 3, 4, 5]} - ).set_index(["a", "b"]) - - actual = gdf.groupby(level="a").corr("pearson") - expected = gdf.to_pandas().groupby(level="a").corr("pearson") - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [np.nan, 1, 2], "b": [None, None, None]}, - {"a": [1, 2, np.nan, 2], "b": [np.nan, np.nan, np.nan, np.nan]}, - { - "a": [1, 2, np.nan, 2, None], - "b": [np.nan, np.nan, None, np.nan, np.nan], - }, - {"a": [1, 2, 2, None, 1.1], "b": [1, 2.2, 3, None, 5]}, - ], -) 
-@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_dataframe_constructor_nan_as_null(data, nan_as_null): - actual = cudf.DataFrame(data, nan_as_null=nan_as_null) - - if nan_as_null: - assert ( - not ( - actual.astype("float").replace( - cudf.Series([np.nan], nan_as_null=False), cudf.Series([-1]) - ) - == -1 - ) - .any() - .any() - ) - else: - actual = actual.select_dtypes(exclude=["object"]) - assert (actual.replace(np.nan, -1) == -1).any().any() - - -def test_dataframe_add_prefix(): - cdf = cudf.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) - pdf = cdf.to_pandas() - - got = cdf.add_prefix("item_") - expected = pdf.add_prefix("item_") - - assert_eq(got, expected) - - -def test_dataframe_add_suffix(): - cdf = cudf.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) - pdf = cdf.to_pandas() - - got = cdf.add_suffix("_item") - expected = pdf.add_suffix("_item") - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data, gkey", - [ - ( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - }, - ["id"], - ), - ( - { - "id": [0, 0, 0, 0, 1, 1, 1], - "a": [10.0, 3, 4, 2.0, -3.0, 9.0, 10.0], - "b": [10.0, 23, -4.0, 2, -3.0, 9, 19.0], - }, - ["id", "a"], - ), - ], -) -@pytest.mark.parametrize("min_periods", [0, 3]) -@pytest.mark.parametrize("ddof", [1, 2]) -def test_groupby_covariance(data, gkey, min_periods, ddof): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - actual = gdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) - # We observe a warning if there are too few observations to generate a - # non-singular covariance matrix _and_ there are enough that pandas will - # actually attempt to compute a value. Groups with fewer than min_periods - # inputs will be skipped altogether, so no warning occurs. 
- with expect_warning_if( - (pdf.groupby(gkey).count() < 2).all().all() - and (pdf.groupby(gkey).count() > min_periods).all().all(), - RuntimeWarning, - ): - expected = pdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) - - assert_eq(expected, actual) - - -def test_groupby_covariance_multiindex_dataframe(): - gdf = cudf.DataFrame( - { - "a": [1, 1, 2, 2], - "b": [1, 1, 2, 2], - "c": [2, 3, 4, 5], - "d": [6, 8, 9, 1], - } - ).set_index(["a", "b"]) - - actual = gdf.groupby(level=["a", "b"]).cov() - expected = gdf.to_pandas().groupby(level=["a", "b"]).cov() - - assert_eq(expected, actual) - - -def test_groupby_covariance_empty_columns(): - gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) - pdf = gdf.to_pandas() - - actual = gdf.groupby("id").cov() - expected = pdf.groupby("id").cov() - - assert_eq( - expected, - actual, - check_dtype=False, - check_index_type=False, - ) + pdf = pd.DataFrame(data, index=p_index, columns=labels) + gdf = cudf.from_pandas(pdf) + expect = pdf.explode(label_to_explode, ignore_index) + got = gdf.explode(label_to_explode, ignore_index) -def test_groupby_cov_invalid_column_types(): - gdf = cudf.DataFrame( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], - "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], - }, - ) - with pytest.raises( - TypeError, - match="Covariance accepts only numerical column-pairs", - ): - gdf.groupby("id").cov() + assert_eq(expect, got, check_dtype=False) -def test_groupby_cov_positive_semidefinite_matrix(): - # Refer to discussions in PR #9889 re "pair-wise deletion" strategy - # being used in pandas to compute the covariance of a dataframe with - # rows containing missing values. - # Note: cuDF currently matches pandas behavior in that the covariance - # matrices are not guaranteed PSD (positive semi definite). - # https://github.com/rapidsai/cudf/pull/9889#discussion_r794158358 +def test_explode_preserve_categorical(): gdf = cudf.DataFrame( - [[1, 2], [None, 4], [5, None], [7, 8]], columns=["v0", "v1"] + { + "A": [[1, 2], None, [2, 3]], + "B": cudf.Series([0, 1, 2], dtype="category"), + } ) - actual = gdf.groupby(by=cudf.Series([1, 1, 1, 1])).cov() - actual.reset_index(drop=True, inplace=True) - - pdf = gdf.to_pandas() - expected = pdf.groupby(by=pd.Series([1, 1, 1, 1])).cov() - expected.reset_index(drop=True, inplace=True) - - assert_eq( - expected, - actual, - check_dtype=False, + result = gdf.explode("A") + expected = cudf.DataFrame( + { + "A": [1, 2, None, 2, 3], + "B": cudf.Series([0, 0, 1, 2, 2], dtype="category"), + } ) + expected.index = cudf.Index([0, 0, 1, 2, 2]) + assert_eq(result, expected) -@pytest_xfail -def test_groupby_cov_for_pandas_bug_case(): - # Handles case: pandas bug using ddof with missing data. 
- # Filed an issue in Pandas on GH, link below: - # https://github.com/pandas-dev/pandas/issues/45814 - pdf = pd.DataFrame( - {"id": ["a", "a"], "val1": [1.0, 2.0], "val2": [np.nan, np.nan]} - ) - expected = pdf.groupby("id").cov(ddof=2) - - gdf = cudf.from_pandas(pdf) - actual = gdf.groupby("id").cov(ddof=2) +@pytest.mark.parametrize( + "data,ascending,expected_data", + [ + ( + {"a": [10, 0, 2], "b": [-10, 10, 1]}, + True, + [1, 2, 0], + ), + ( + {"a": [10, 0, 2], "b": [-10, 10, 1]}, + False, + [0, 2, 1], + ), + ], +) +def test_dataframe_argsort(data, ascending, expected_data): + actual = cudf.DataFrame(data).argsort(ascending=ascending) + expected = cupy.array(expected_data, dtype="int32") - assert_eq(expected, actual) + assert_eq(actual, expected) @pytest.mark.parametrize( @@ -7262,24 +5605,6 @@ def test_dataframe_duplicated(data, subset, keep): assert_eq(expected, actual) -@pytest.mark.parametrize( - "data", - [ - {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, - {"a": [[{"b": 567}], None] * 10}, - {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, - ], -) -def test_dataframe_transpose_complex_types(data): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expected = pdf.T - actual = gdf.T - - assert_eq(expected, actual) - - @pytest.mark.parametrize( "data", [ @@ -7308,21 +5633,6 @@ def test_dataframe_from_arrow_slice(): assert_eq(expected, actual) -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 4}, - {"c": 4, "a": [1, 2, 3], "b": ["x", "y", "z"]}, - {"a": [1, 2, 3], "c": 4}, - ], -) -def test_dataframe_init_from_scalar_and_lists(data): - actual = cudf.DataFrame(data) - expected = pd.DataFrame(data) - - assert_eq(expected, actual) - - @pytest.mark.parametrize( "data,index", [ @@ -7428,91 +5738,6 @@ def test_dataframe_binop_with_datetime_index(): assert_eq(expected, got) -@pytest.mark.parametrize( - "columns", - ( - [], - ["c", "a"], - ["a", "d", "b", "e", "c"], - ["a", "b", "c"], - pd.Index(["b", "a", "c"], name="custom_name"), - ), -) -@pytest.mark.parametrize("index", (None, [4, 5, 6])) -def test_dataframe_dict_like_with_columns(columns, index): - data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} - expect = pd.DataFrame(data, columns=columns, index=index) - actual = cudf.DataFrame(data, columns=columns, index=index) - if index is None and len(columns) == 0: - # We make an empty range index, pandas makes an empty index - expect = expect.reset_index(drop=True) - assert_eq(expect, actual) - - -def test_dataframe_init_columns_named_multiindex(): - rng = np.random.default_rng(seed=0) - data = rng.standard_normal(size=(2, 2)) - columns = cudf.MultiIndex.from_tuples( - [("A", "one"), ("A", "two")], names=["y", "z"] - ) - gdf = cudf.DataFrame(data, columns=columns) - pdf = pd.DataFrame(data, columns=columns.to_pandas()) - - assert_eq(gdf, pdf) - - -def test_dataframe_init_columns_named_index(): - rng = np.random.default_rng(seed=0) - data = rng.standard_normal(size=(2, 2)) - columns = pd.Index(["a", "b"], name="custom_name") - gdf = cudf.DataFrame(data, columns=columns) - pdf = pd.DataFrame(data, columns=columns) - - assert_eq(gdf, pdf) - - -def test_dataframe_from_pandas_sparse(): - pdf = pd.DataFrame(range(2), dtype=pd.SparseDtype(np.int64, 0)) - with pytest.raises(NotImplementedError): - cudf.DataFrame(pdf) - - -def test_dataframe_constructor_unbounded_sequence(): - class A: - def __getitem__(self, key): - return 1 - - with pytest.raises(TypeError): - cudf.DataFrame([A()]) - - with pytest.raises(TypeError): - 
cudf.DataFrame({"a": A()}) - - -def test_dataframe_constructor_dataframe_list(): - df = cudf.DataFrame(range(2)) - with pytest.raises(TypeError): - cudf.DataFrame([df]) - - -def test_dataframe_constructor_from_namedtuple(): - Point1 = namedtuple("Point1", ["a", "b", "c"]) - Point2 = namedtuple("Point1", ["x", "y"]) - - data = [Point1(1, 2, 3), Point2(4, 5)] - idx = ["a", "b"] - gdf = cudf.DataFrame(data, index=idx) - pdf = pd.DataFrame(data, index=idx) - - assert_eq(gdf, pdf) - - data = [Point2(4, 5), Point1(1, 2, 3)] - with pytest.raises(ValueError): - cudf.DataFrame(data, index=idx) - with pytest.raises(ValueError): - pd.DataFrame(data, index=idx) - - @pytest.mark.parametrize( "dtype", ["datetime64[ns]", "timedelta64[ns]", "int64", "float32"] ) @@ -7550,114 +5775,6 @@ def test_dataframe_reindex_with_index_names(index_data, name): assert_eq(actual, expected) -@pytest.mark.parametrize("attr", ["nlargest", "nsmallest"]) -def test_dataframe_nlargest_nsmallest_str_error(attr): - gdf = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - pdf = gdf.to_pandas() - - assert_exceptions_equal( - getattr(gdf, attr), - getattr(pdf, attr), - ([], {"n": 1, "columns": ["a", "b"]}), - ([], {"n": 1, "columns": ["a", "b"]}), - ) - - -def test_series_data_no_name_with_columns(): - gdf = cudf.DataFrame(cudf.Series([1]), columns=[1]) - pdf = pd.DataFrame(pd.Series([1]), columns=[1]) - assert_eq(gdf, pdf) - - -def test_series_data_no_name_with_columns_more_than_one_raises(): - with pytest.raises(ValueError): - cudf.DataFrame(cudf.Series([1]), columns=[1, 2]) - with pytest.raises(ValueError): - pd.DataFrame(pd.Series([1]), columns=[1, 2]) - - -def test_series_data_with_name_with_columns_matching(): - gdf = cudf.DataFrame(cudf.Series([1], name=1), columns=[1]) - pdf = pd.DataFrame(pd.Series([1], name=1), columns=[1]) - assert_eq(gdf, pdf) - - -def test_series_data_with_name_with_columns_not_matching(): - gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1]) - pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1]) - assert_eq(gdf, pdf) - - -def test_series_data_with_name_with_columns_matching_align(): - gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1, 2]) - pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1, 2]) - assert_eq(gdf, pdf) - - -@pytest.mark.parametrize("digits", [0, 1, 4]) -def test_dataframe_round_builtin(digits): - pdf = pd.DataFrame( - { - "a": [1.2234242333234, 323432.3243423, np.nan], - "b": ["a", "b", "c"], - "c": pd.Series([34224, 324324, 324342], dtype="datetime64[ns]"), - "d": pd.Series([224.242, None, 2424.234324], dtype="category"), - "e": [ - decimal.Decimal("342.3243234234242"), - decimal.Decimal("89.32432497687622"), - None, - ], - } - ) - gdf = cudf.from_pandas(pdf, nan_as_null=False) - - expected = round(pdf, digits) - actual = round(gdf, digits) - - assert_eq(expected, actual) - - -def test_dataframe_init_from_nested_dict(): - ordered_dict = OrderedDict( - [ - ("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])), - ("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])), - ("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])), - ] - ) - pdf = pd.DataFrame(ordered_dict) - gdf = cudf.DataFrame(ordered_dict) - - assert_eq(pdf, gdf) - regular_dict = {key: dict(value) for key, value in ordered_dict.items()} - - pdf = pd.DataFrame(regular_dict) - gdf = cudf.DataFrame(regular_dict) - assert_eq(pdf, gdf) - - -def test_init_from_2_categoricalindex_series_diff_categories(): - s1 = cudf.Series( - [39, 6, 4], 
index=cudf.CategoricalIndex(["female", "male", "unknown"]) - ) - s2 = cudf.Series( - [2, 152, 2, 242, 150], - index=cudf.CategoricalIndex(["f", "female", "m", "male", "unknown"]), - ) - result = cudf.DataFrame([s1, s2]) - expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()]) - # TODO: Remove once https://github.com/pandas-dev/pandas/issues/57592 - # is adressed - expected.columns = result.columns - assert_eq(result, expected, check_dtype=False) - - -def test_data_frame_values_no_cols_but_index(): - result = cudf.DataFrame(index=range(5)).values - expected = pd.DataFrame(index=range(5)).values - assert_eq(result, expected) - - def test_dataframe_reduction_error(): gdf = cudf.DataFrame( { @@ -7676,11 +5793,6 @@ def test_dataframe_from_generator(): assert_eq(pdf, gdf) -def test_dataframe_from_ndarray_dup_columns(): - with pytest.raises(ValueError): - cudf.DataFrame(np.eye(2), columns=["A", "A"]) - - @pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]]) @@ -7785,55 +5897,6 @@ def test_dataframe_duplicate_index_reindex(): ) -def test_dataframe_columns_set_none_raises(): - df = cudf.DataFrame({"a": [0]}) - with pytest.raises(TypeError): - df.columns = None - - -@pytest.mark.parametrize( - "columns", - [cudf.RangeIndex(1, name="foo"), pd.RangeIndex(1, name="foo"), range(1)], -) -def test_dataframe_columns_set_rangeindex(columns): - df = cudf.DataFrame([1], columns=["a"]) - df.columns = columns - result = df.columns - expected = pd.RangeIndex(1, name=getattr(columns, "name", None)) - pd.testing.assert_index_equal(result, expected, exact=True) - - -@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) -def test_dataframe_columns_set_multiindex(klass): - columns = klass.from_arrays([[10]], names=["foo"]) - df = cudf.DataFrame([1], columns=["a"]) - df.columns = columns - result = df.columns - expected = pd.MultiIndex.from_arrays([[10]], names=["foo"]) - pd.testing.assert_index_equal(result, expected, exact=True) - - -@pytest.mark.parametrize( - "klass", - [ - functools.partial(cudf.Index, name="foo"), - functools.partial(cudf.Series, name="foo"), - functools.partial(pd.Index, name="foo"), - functools.partial(pd.Series, name="foo"), - np.array, - ], -) -def test_dataframe_columns_set_preserve_type(klass): - df = cudf.DataFrame([1], columns=["a"]) - columns = klass([10], dtype="int8") - df.columns = columns - result = df.columns - expected = pd.Index( - [10], dtype="int8", name=getattr(columns, "name", None) - ) - pd.testing.assert_index_equal(result, expected) - - @pytest.mark.parametrize( "expected", [ @@ -7890,53 +5953,6 @@ def test_dataframe_to_pandas_arrow_type(scalar): pd.testing.assert_frame_equal(result, expected) -@pytest.mark.parametrize("axis", [None, 0, "index", 1, "columns"]) -@pytest.mark.parametrize("data", [[[1, 2], [2, 3]], [1, 2], [1]]) -def test_squeeze(axis, data): - df = cudf.DataFrame(data) - result = df.squeeze(axis=axis) - expected = df.to_pandas().squeeze(axis=axis) - assert_eq(result, expected) - - -@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)]) -@pytest.mark.parametrize( - "operation", - [ - lambda df: df.where(df < 2, 2), - lambda df: df.nans_to_nulls(), - lambda df: df.isna(), - lambda df: df.notna(), - lambda df: abs(df), - lambda df: -df, - lambda df: ~df, - lambda df: df.cumsum(), - lambda df: df.replace(1, 2), - lambda df: df.replace(10, 20), - lambda df: df.clip(0, 10), - 
lambda df: df.rolling(1).mean(), - lambda df: df.interpolate(), - lambda df: df.shift(), - lambda df: df.sort_values(1), - lambda df: df.round(), - lambda df: df.rank(), - ], -) -def test_op_preserves_column_metadata(column, operation): - df = cudf.DataFrame([1], columns=cudf.Index(column)) - result = operation(df).columns - expected = pd.Index(column) - pd.testing.assert_index_equal(result, expected, exact=True) - - -def test_dataframe_init_with_nans(): - with cudf.option_context("mode.pandas_compatible", True): - gdf = cudf.DataFrame({"a": [1, 2, 3, np.nan]}) - assert gdf["a"].dtype == np.dtype("float64") - pdf = pd.DataFrame({"a": [1, 2, 3, np.nan]}) - assert_eq(pdf, gdf) - - @pytest.mark.parametrize("dtype1", ["int16", "float32"]) @pytest.mark.parametrize("dtype2", ["int16", "float32"]) def test_dataframe_loc_int_float(dtype1, dtype2): @@ -7955,66 +5971,6 @@ def test_dataframe_loc_int_float(dtype1, dtype2): assert_eq(actual, expected, check_index_type=True, check_dtype=True) -@pytest.mark.parametrize( - "data", - [ - cudf.DataFrame(range(2)), - None, - [cudf.Series(range(2))], - [[0], [1]], - {1: range(2)}, - cupy.arange(2), - ], -) -def test_init_with_index_no_shallow_copy(data): - idx = cudf.RangeIndex(2) - df = cudf.DataFrame(data, index=idx) - assert df.index is idx - - -def test_from_records_with_index_no_shallow_copy(): - idx = cudf.RangeIndex(2) - data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", " Date: Thu, 21 Aug 2025 20:35:23 -0400 Subject: [PATCH 194/366] Support `nan` in non-floating point column in cudf-polars (#19742) Closes #19741 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - James Lamb (https://github.com/jameslamb) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19742 --- ci/test_narwhals.sh | 4 +- .../cudf_polars/dsl/expressions/boolean.py | 54 ++++++++++++------- .../tests/expressions/test_booleanfunction.py | 31 +++++------ 3 files changed, 51 insertions(+), 38 deletions(-) diff --git a/ci/test_narwhals.sh b/ci/test_narwhals.sh index d30d0d92c05..c40e49184c0 100755 --- a/ci/test_narwhals.sh +++ b/ci/test_narwhals.sh @@ -39,8 +39,10 @@ PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 python -m pytest \ --constructors=cudf # test_datetime[polars[lazy]]: Fixed in the next narwhals release >2.0.1 +# test_nan[polars[lazy]]: Passes as of https://github.com/rapidsai/cudf/pull/19742 TESTS_THAT_NEED_NARWHALS_FIX_FOR_CUDF_POLARS=" \ -test_datetime[polars[lazy]] \ +test_datetime[polars[lazy]] or \ +test_nan[polars[lazy]] \ " rapids-logger "Run narwhals tests for cuDF Polars" diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index ce8c4fc3276..dc16a18eaf3 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -153,13 +153,18 @@ def do_evaluate( ): # Avoid evaluating the child if the dtype tells us it's unnecessary. 
(child,) = self.children + needles = child.evaluate(df, context=context) + is_float = needles.obj.type().id() in ( + plc.TypeId.FLOAT32, + plc.TypeId.FLOAT64, + ) is_finite = self.name is BooleanFunction.Name.IsFinite - if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - value = plc.Scalar.from_py(is_finite) - return Column( - plc.Column.from_scalar(value, df.num_rows), dtype=self.dtype + if not is_float: + base = plc.Column.from_scalar( + plc.Scalar.from_py(py_val=is_finite), needles.size ) - needles = child.evaluate(df, context=context) + out = base.with_mask(needles.obj.null_mask(), needles.null_count) + return Column(out, dtype=self.dtype) to_search = [-float("inf"), float("inf")] if is_finite: # NaN is neither finite not infinite @@ -171,7 +176,10 @@ def do_evaluate( result = plc.search.contains(haystack, needles.obj) if is_finite: result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) - return Column(result, dtype=self.dtype) + return Column( + result.with_mask(needles.obj.null_mask(), needles.null_count), + dtype=self.dtype, + ) columns = [child.evaluate(df, context=context) for child in self.children] # Kleene logic for Any (OR) and All (AND) if ignore_nulls is # False @@ -206,22 +214,28 @@ def do_evaluate( elif self.name is BooleanFunction.Name.IsNotNull: (column,) = columns return Column(plc.unary.is_valid(column.obj), dtype=self.dtype) - elif self.name is BooleanFunction.Name.IsNan: - (column,) = columns - return Column( - plc.unary.is_nan(column.obj).with_mask( - column.obj.null_mask(), column.null_count - ), - dtype=self.dtype, - ) - elif self.name is BooleanFunction.Name.IsNotNan: + elif self.name in (BooleanFunction.Name.IsNan, BooleanFunction.Name.IsNotNan): (column,) = columns - return Column( - plc.unary.is_not_nan(column.obj).with_mask( - column.obj.null_mask(), column.null_count - ), - dtype=self.dtype, + is_float = column.obj.type().id() in ( + plc.TypeId.FLOAT32, + plc.TypeId.FLOAT64, ) + if is_float: + op = ( + plc.unary.is_nan + if self.name is BooleanFunction.Name.IsNan + else plc.unary.is_not_nan + ) + base = op(column.obj) + else: + base = plc.Column.from_scalar( + plc.Scalar.from_py( + py_val=self.name is not BooleanFunction.Name.IsNan + ), + column.size, + ) + out = base.with_mask(column.obj.null_mask(), column.null_count) + return Column(out, dtype=self.dtype) elif self.name is BooleanFunction.Name.IsFirstDistinct: (column,) = columns return self._distinct( diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 236a35935b8..6a6926b9d50 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -12,7 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils.versions import POLARS_VERSION_LT_130, POLARS_VERSION_LT_132 +from cudf_polars.utils.versions import POLARS_VERSION_LT_132 if TYPE_CHECKING: from collections.abc import Callable @@ -91,7 +91,15 @@ def test_boolean_function_unary( assert_gpu_result_equal(q) -def test_nan_in_non_floating_point_column(): +@pytest.mark.parametrize( + "expr", + [ + pytest.param(lambda e: e.is_nan(), id="is_nan"), + pytest.param(lambda e: e.is_not_nan(), id="is_not_nan"), + pytest.param(lambda e: e.is_finite(), id="is_finite"), + ], +) +def test_nan_in_non_floating_point_column(expr): ldf = pl.LazyFrame({"int": [-1, 1, None]}).with_columns( float=pl.col("int").cast(pl.Float64), 
float_na=pl.col("int") ** 0.5, @@ -99,24 +107,13 @@ def test_nan_in_non_floating_point_column(): q = ldf.select( [ - pl.col("int").is_nan().alias("int"), - pl.col("float").is_nan().alias("float"), - pl.col("float_na").is_nan().alias("float_na"), + expr(pl.col("int")), + expr(pl.col("float")), + expr(pl.col("float_na")), ] ) - if POLARS_VERSION_LT_130: - with pytest.raises( - pl.exceptions.ComputeError, - match="NAN is not supported in a Non-floating point type column", - ): - assert_gpu_result_equal(q) - else: - with pytest.raises( - RuntimeError, - match="NAN is not supported in a Non-floating point type column", - ): - assert_gpu_result_equal(q) + assert_gpu_result_equal(q) @pytest.mark.parametrize( From 21d062d691252c8f259ac3888f0b77422609c75d Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 22 Aug 2025 11:56:15 -0500 Subject: [PATCH 195/366] Avoid using multiple `Cache` nodes with the same hash (#19769) Closes https://github.com/rapidsai/cudf/issues/19766 Adds simple caching logic to avoid using multiple `Cache` objects with the same hash in the translated logical plan. (I'm very open to other ideas as well) Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19769 --- .../cudf_polars/cudf_polars/dsl/translate.py | 18 +++++++++++------ python/cudf_polars/tests/test_cache.py | 20 +++++++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 2e0401d006c..6cadbcc4927 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -60,6 +60,7 @@ def __init__(self, visitor: NodeTraverser, engine: GPUEngine): self.visitor = visitor self.config_options = config.ConfigOptions.from_polars_engine(engine) self.errors: list[Exception] = [] + self._cache_nodes: dict[int, ir.Cache] = {} def translate_ir(self, *, n: int | None = None) -> ir.IR: """ @@ -278,12 +279,17 @@ def _(node: pl_ir.Cache, translator: Translator, schema: Schema) -> ir.IR: refcount = node.cache_hits else: refcount = None - return ir.Cache( - schema, - node.id_, - refcount, - translator.translate_ir(n=node.input), - ) + + # Make sure Cache nodes with the same id_ + # are actually the same object. 
+    if node.id_ not in translator._cache_nodes:
+        translator._cache_nodes[node.id_] = ir.Cache(
+            schema,
+            node.id_,
+            refcount,
+            translator.translate_ir(n=node.input),
+        )
+    return translator._cache_nodes[node.id_]
 
 
 @_translate_ir.register
diff --git a/python/cudf_polars/tests/test_cache.py b/python/cudf_polars/tests/test_cache.py
index e242cd52cb2..afb067738a4 100644
--- a/python/cudf_polars/tests/test_cache.py
+++ b/python/cudf_polars/tests/test_cache.py
@@ -59,3 +59,23 @@ def __getitem__(self, key):
     qir.evaluate(cache=node_cache, timer=None)
     assert len(node_cache) == 0
     assert node_cache.hits == 3
+
+
+def test_union_cache_nodes():
+    df = pl.LazyFrame({"a": [7, 8], "b": [12, 13]})
+    q = pl.concat([df, df])
+    qir = Translator(q._ldf.visit(), pl.GPUEngine()).translate_ir()
+    # Logical plan:
+    # UNION ('x', 'y', 'z')
+    #   CACHE ('x', 'y', 'z')
+    #     PROJECTION ('x', 'y', 'z')
+    #       DATAFRAMESCAN ('x', 'y', 'z')
+    # (repeated 2 times)
+
+    # Check that the concatenated Cache nodes are the same object
+    # See: https://github.com/rapidsai/cudf/issues/19766
+    assert isinstance(qir, ir.Union)
+    assert isinstance(qir.children[0], ir.Cache)
+    assert isinstance(qir.children[1], ir.Cache)
+    assert hash(qir.children[0]) == hash(qir.children[1])
+    assert hash(qir.children[0].children[0]) == hash(qir.children[1].children[0])

From 38fbfb245b1c8e63e023a9389d1a42a69512c1cf Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 22 Aug 2025 13:27:16 -0400
Subject: [PATCH 196/366] Add libcudf segmented_top_k APIs (#19597)

Adds the following libcudf APIs:
```
std::unique_ptr<column> segmented_top_k(
  column_view const& col,
  column_view const& segment_offsets,
  size_type k,
  order sort_order,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);

std::unique_ptr<column> segmented_top_k_order(
  column_view const& col,
  column_view const& segment_offsets,
  size_type k,
  order sort_order,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/19597
---
 cpp/CMakeLists.txt                      |   1 +
 cpp/benchmarks/CMakeLists.txt           |  13 +-
 cpp/benchmarks/sort/segmented_top_k.cpp |  65 +++++++++
 cpp/include/cudf/sorting.hpp            |  72 ++++++++++
 cpp/src/sort/segmented_top_k.cu         | 182 ++++++++++++++++++++++++
 cpp/src/sort/top_k.cu                   |  18 ++-
 cpp/tests/CMakeLists.txt                |   4 +-
 cpp/tests/sort/sort_test.cpp            |  29 ----
 cpp/tests/sort/top_k_tests.cpp          | 141 ++++++++++++++++++
 9 files changed, 486 insertions(+), 39 deletions(-)
 create mode 100644 cpp/benchmarks/sort/segmented_top_k.cpp
 create mode 100644 cpp/src/sort/segmented_top_k.cu
 create mode 100644 cpp/tests/sort/top_k_tests.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5b4c34ccd09..8ee45a34df7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -697,6 +697,7 @@ add_library(
   src/sort/is_sorted.cu
   src/sort/rank.cu
   src/sort/segmented_sort.cu
+  src/sort/segmented_top_k.cu
   src/sort/sort.cu
   src/sort/sort_column.cu
   src/sort/sort_radix.cu
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index c16f0789795..4f07dd72ed4 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -210,8 +210,17 @@ ConfigureNVBench(SEARCH_NVBENCH search/contains_scalar.cpp search/contains_table
 # ##################################################################################################
 # * sort benchmark --------------------------------------------------------------------------------
 ConfigureNVBench(
-  SORT_NVBENCH sort/rank.cpp sort/rank_lists.cpp sort/rank_structs.cpp sort/segmented_sort.cpp
-  sort/sort.cpp sort/sort_lists.cpp sort/sort_strings.cpp sort/sort_structs.cpp sort/top_k.cpp
+  SORT_NVBENCH
+  sort/rank.cpp
+  sort/rank_lists.cpp
+  sort/rank_structs.cpp
+  sort/segmented_top_k.cpp
+  sort/segmented_sort.cpp
+  sort/sort.cpp
+  sort/sort_lists.cpp
+  sort/sort_strings.cpp
+  sort/sort_structs.cpp
+  sort/top_k.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/sort/segmented_top_k.cpp b/cpp/benchmarks/sort/segmented_top_k.cpp
new file mode 100644
index 00000000000..6e6507d2f51
--- /dev/null
+++ b/cpp/benchmarks/sort/segmented_top_k.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+template <typename T>
+void bench_segmented_top_k(nvbench::state& state, nvbench::type_list<T>)
+{
+  auto const ordered = static_cast<bool>(state.get_int64("ordered"));
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const segment = static_cast<cudf::size_type>(state.get_int64("segment"));
+  auto const k = static_cast<cudf::size_type>(state.get_int64("k"));
+  auto const data_type = cudf::type_to_id<T>();
+
+  data_profile const profile = data_profile_builder().no_validity().distribution(
+    data_type, distribution_id::UNIFORM, 0, segment);
+  auto const input = create_random_column(data_type, row_count{num_rows}, profile);
+
+  auto const segments = cudf::sequence((num_rows / segment) + 1,
+                                       cudf::numeric_scalar<cudf::size_type>(0),
+                                       cudf::numeric_scalar<cudf::size_type>(segment));
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.add_global_memory_reads<T>(num_rows);
+  state.add_global_memory_writes<T>(segments->size() * k);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    if (ordered) {
+      cudf::segmented_top_k_order(input->view(), segments->view(), k);
+    } else {
+      cudf::segmented_top_k(input->view(), segments->view(), k);
+    }
+  });
+}
+
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_s, "time_s", "time_s");
+
+using Types = nvbench::type_list;
+
+NVBENCH_BENCH_TYPES(bench_segmented_top_k, NVBENCH_TYPE_AXES(Types))
+  .set_name("segmented_top_k")
+  .add_int64_axis("num_rows", {262144, 2097152, 16777216, 67108864})
+  .add_int64_axis("segment", {1024, 2048})
+  .add_int64_axis("k", {100, 1000})
+  .add_int64_axis("ordered", {0, 1});
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
index 2ef5f66aa42..532cec53e8f 100644
--- a/cpp/include/cudf/sorting.hpp
+++ b/cpp/include/cudf/sorting.hpp
@@ -390,5 +390,77 @@ std::unique_ptr<column> top_k_order(
   rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Computes the top k values within each segment of a column
+ *
+ * Returns the top k values (largest or smallest) within each segment of the given column.
+ * The values within each segment may not necessarily be sorted.
+ * If a segment contains fewer than k elements, then all values for that segment are returned.
+ *
+ * @code{.pseudo}
+ * Example:
+ * col = [ 3, 4, 5, 4, 1, 2, 3, 5, 6, 7, 8, 9, 10 ]
+ * offsets = [0, 3, 7, 13]
+ * result = cudf::segmented_top_k(col, offsets, 3);
+ * result is [[5,4,3], [4,3,2], [10,8,9]] // each segment may not be sorted
+ * @endcode
+ *
+ * @throw std::invalid_argument if k is less than or equal to zero
+ * @throw cudf::data_type_error if segment_offsets is not size_type
+ * @throw std::invalid_argument if segment_offsets is empty or contains nulls
+ *
+ * @param col Column to compute top k
+ * @param segment_offsets Start offset index for each contiguous segment
+ * @param k Number of values to return for each segment
+ * @param sort_order DESCENDING is the largest k values (default).
+ *                   ASCENDING is the smallest k values.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return A column with the top k values of the input column.
+ */
+std::unique_ptr<column> segmented_top_k(
+  column_view const& col,
+  column_view const& segment_offsets,
+  size_type k,
+  order sort_order = order::DESCENDING,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
+/**
+ * @brief Computes the indices of the top k values within each segment of a column
+ *
+ * The indices will represent the top k elements within each segment but may not represent
+ * those elements as k sorted values.
+ * If a segment contains fewer than k elements, then all values for that segment are returned.
+ *
+ * @code{.pseudo}
+ * Example:
+ * col = [ 3, 4, 5, 4, 1, 2, 3, 5, 6, 7, 8, 9, 10 ]
+ * offsets = [0, 3, 7, 13]
+ * result = cudf::segmented_top_k_order(col, offsets, 3);
+ * result is [[2,1,0], [3,6,5], [12,10,11]] // each segment may not be sorted
+ * @endcode
+ *
+ * @throw std::invalid_argument if k is less than or equal to zero
+ * @throw cudf::data_type_error if segment_offsets is not size_type
+ * @throw std::invalid_argument if segment_offsets is empty or contains nulls
+ *
+ * @param col Column to compute top k
+ * @param segment_offsets Start offset index for each contiguous segment
+ * @param k Number of values to return for each segment
+ * @param sort_order DESCENDING is the indices of the largest k values (default).
+ *                   ASCENDING is the indices of the smallest k values.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Indices of the top k values of the input column
+ */
+std::unique_ptr<column> segmented_top_k_order(
+  column_view const& col,
+  column_view const& segment_offsets,
+  size_type k,
+  order sort_order = order::DESCENDING,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/sort/segmented_top_k.cu b/cpp/src/sort/segmented_top_k.cu
new file mode 100644
index 00000000000..ef27f2cc9f9
--- /dev/null
+++ b/cpp/src/sort/segmented_top_k.cu
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "sort_column_impl.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+
+namespace cudf {
+namespace detail {
+namespace {
+
+/**
+ * @brief Resolves the k indices per segment
+ *
+ * Sets values outside the k range to -1 so they can be removed in a separate step.
+ * Also computes the total number of valid indices for each segment.
+ * All elements of a segment are used if it has fewer than k total elements.
+ *
+ * @param d_offsets Offsets for each segment
+ * @param k Number of values to keep in each segment
+ * @param d_indices Mark these indices to be removed
+ * @param d_segment_sizes Store actual sizes of each segment
+ */
+CUDF_KERNEL void resolve_segment_indices(device_span<size_type const> d_offsets,
+                                         size_type k,
+                                         device_span<size_type> d_indices,
+                                         size_type* d_segment_sizes)
+{
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  if (tid >= d_indices.size()) { return; }
+
+  auto const sitr = thrust::upper_bound(thrust::seq, d_offsets.begin(), d_offsets.end(), tid);
+  auto const segment_start = *(sitr - 1);
+  auto const segment_end = *sitr;
+  auto const index = tid - segment_start;
+  if (index >= k) { d_indices[tid] = -1; }  // mark values outside of top k
+
+  if (index == 0) {
+    auto const segment_size = segment_end - segment_start;
+    auto const segment_index = thrust::distance(d_offsets.begin(), sitr) - 1;
+    // segment has k or fewer elements
+    d_segment_sizes[segment_index] = cuda::std::min(k, segment_size);
+  }
+}
+}  // namespace
+
+std::unique_ptr<column> segmented_top_k_order(column_view const& col,
+                                              column_view const& segment_offsets,
+                                              size_type k,
+                                              order sort_order,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(k >= 0, "k must be greater than or equal to 0", std::invalid_argument);
+
+  auto const size_data_type = data_type{type_to_id<size_type>()};
+  if (k == 0 || col.is_empty()) {
+    return cudf::make_empty_lists_column(size_data_type, stream, mr);
+  }
+
+  CUDF_EXPECTS(segment_offsets.size() > 0,
+               "segment_offsets must have at least one element",
+               std::invalid_argument);
+
+  CUDF_EXPECTS(segment_offsets.type() == size_data_type,
+               "segment_offsets must be of type INT32",
+               cudf::data_type_error);
+  CUDF_EXPECTS(segment_offsets.null_count() == 0,
+               "segment_offsets must not have nulls",
+               std::invalid_argument);
+
+  auto const nulls = sort_order == order::ASCENDING ? null_order::AFTER : null_order::BEFORE;
+  auto const temp_mr = cudf::get_current_device_resource_ref();
+  auto const indices = cudf::detail::segmented_sorted_order(
+    cudf::table_view({col}), segment_offsets, {sort_order}, {nulls}, stream, temp_mr);
+  auto const d_indices = indices->mutable_view().begin<size_type>();
+
+  auto segment_sizes = rmm::device_uvector<size_type>(segment_offsets.size() - 1, stream);
+  auto span_indices = device_span<size_type>{d_indices, static_cast<std::size_t>(indices->size())};
+  auto const grid = cudf::detail::grid_1d(indices->size(), 256);
+  resolve_segment_indices<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    segment_offsets, k, span_indices, segment_sizes.data());
+  auto [offsets, total_elements] =
+    cudf::detail::make_offsets_child_column(segment_sizes.begin(), segment_sizes.end(), stream, mr);
+
+  auto result = cudf::make_fixed_width_column(
+    size_data_type, total_elements, mask_state::UNALLOCATED, stream, mr);
+  auto d_result = result->mutable_view().begin<size_type>();
+  // remove the indices marked by resolve_segment_indices
+  thrust::remove_copy(
+    rmm::exec_policy_nosync(stream), d_indices, d_indices + indices->size(), d_result, -1);
+
+  auto const num_rows = static_cast<size_type>(offsets->size() - 1);
+  return make_lists_column(
+    num_rows, std::move(offsets), std::move(result), 0, rmm::device_buffer{}, stream, mr);
+}
+
+std::unique_ptr<column> segmented_top_k(column_view const& col,
+                                        column_view const& segment_offsets,
+                                        size_type k,
+                                        order sort_order,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::device_async_resource_ref mr)
+{
+  if (col.is_empty()) { return cudf::make_empty_column(col.type()); }
+
+  auto ordered =
+    cudf::detail::segmented_top_k_order(col, segment_offsets, k, sort_order, stream, mr);
+  auto lv = cudf::lists_column_view(ordered->view());
+  if (lv.is_empty()) { return cudf::make_empty_lists_column(col.type(), stream, mr); }
+
+  auto result = cudf::detail::gather(cudf::table_view({col}),
+                                     lv.child(),
+                                     out_of_bounds_policy::DONT_CHECK,
+                                     negative_index_policy::NOT_ALLOWED,
+                                     stream,
+                                     mr);
+  auto offsets = std::move(ordered->release().children.front());
+  auto const num_rows = static_cast<size_type>(offsets->size() - 1);
+  return make_lists_column(num_rows,
+                           std::move(offsets),
+                           std::move(result->release().front()),
+                           0,
+                           rmm::device_buffer{},
+                           stream,
+                           mr);
+}
+
+}  // namespace detail
+
+std::unique_ptr<column> segmented_top_k(column_view const& col,
+                                        column_view const& segment_offsets,
+                                        size_type k,
+                                        order sort_order,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::segmented_top_k(col, segment_offsets, k, sort_order, stream, mr);
+}
+
+std::unique_ptr<column> segmented_top_k_order(column_view const& col,
+                                              column_view const& segment_offsets,
+                                              size_type k,
+                                              order sort_order,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::segmented_top_k_order(col, segment_offsets, k, sort_order, stream, mr);
+}
+}  // namespace cudf
diff --git a/cpp/src/sort/top_k.cu b/cpp/src/sort/top_k.cu
index 62d7b5f6842..3263144d000 100644
--- a/cpp/src/sort/top_k.cu
+++ b/cpp/src/sort/top_k.cu
@@ -17,10 +17,13 @@
 #include "sort_column_impl.cuh"
 
 #include
+#include
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
@@ -37,9 +40,9 @@ std::unique_ptr<column> top_k(column_view const& col,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(k <= col.size(),
-               "k must be less than or equal to the number of rows in the column",
-               std::invalid_argument);
+  CUDF_EXPECTS(k >= 0, "k must be non-negative", std::invalid_argument);
+  if (k == 0 || col.is_empty()) { return empty_like(col); }
+  if (k >= col.size()) { return std::make_unique<column>(col, stream, mr); }
 
   // code will be specialized for fixed-width types once CUB topk function is available
   auto const nulls = sort_order == order::ASCENDING ? null_order::AFTER : null_order::BEFORE;
@@ -60,9 +63,12 @@ std::unique_ptr<column> top_k_order(column_view const& col,
                                     rmm::cuda_stream_view stream,
                                     rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(k <= col.size(),
-               "k must be less than or equal to the number of rows in the column",
-               std::invalid_argument);
+  CUDF_EXPECTS(k >= 0, "k must be non-negative", std::invalid_argument);
+  if (k == 0 || col.is_empty()) { return make_empty_column(cudf::type_to_id<size_type>()); }
+  if (k >= col.size()) {
+    return cudf::detail::sequence(
+      col.size(), numeric_scalar<size_type>(0, true, stream), stream, mr);
+  }
 
   auto const nulls   = sort_order == order::ASCENDING ? null_order::AFTER : null_order::BEFORE;
   auto const indices = sorted_order(col, sort_order, nulls, stream, mr);
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 65e3a608660..63b41e55f51 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -365,8 +365,8 @@ ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu)
 # ##################################################################################################
 # * sort tests ------------------------------------------------------------------------------------
 ConfigureTest(
-  SORT_TEST sort/segmented_sort_tests.cpp sort/sort_nested_types_tests.cpp sort/sort_test.cpp
-  sort/stable_sort_tests.cpp sort/rank_test.cpp
+  SORT_TEST sort/rank_test.cpp sort/segmented_sort_tests.cpp sort/sort_nested_types_tests.cpp
+  sort/sort_test.cpp sort/stable_sort_tests.cpp sort/top_k_tests.cpp
   GPUS 1 PERCENT 70
 )
diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp
index 05a632c4f2b..9473e181c05 100644
--- a/cpp/tests/sort/sort_test.cpp
+++ b/cpp/tests/sort/sort_test.cpp
@@ -1113,33 +1113,4 @@ TEST_F(SortDouble, InfinityAndNan)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
-TYPED_TEST(Sort, TopK)
-{
-  using T = TypeParam;
-  if constexpr (std::is_same_v<T, bool>) { GTEST_SKIP(); }
-
-  auto itr   = thrust::counting_iterator<int32_t>(0);
-  auto input = cudf::test::fixed_width_column_wrapper<T, int32_t>(
-    itr, itr + 100, cudf::test::iterators::null_at(4));
-  auto expected =
-    cudf::test::fixed_width_column_wrapper<T, int32_t>({99, 98, 97, 96, 95, 94, 93, 92, 91, 90});
-  auto result = cudf::top_k(input, 10);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
-  auto expected_order = cudf::test::fixed_width_column_wrapper<cudf::size_type>(
-    {99, 98, 97, 96, 95, 94, 93, 92, 91, 90});
-  result = cudf::top_k_order(input, 10);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_order, result->view());
-
-  result   = cudf::top_k(input, 10, cudf::order::ASCENDING);
-  expected = cudf::test::fixed_width_column_wrapper<T, int32_t>({0, 1, 2, 3, 5, 6, 7, 8, 9, 10});
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
-  expected_order =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 1, 2, 3, 5, 6, 7, 8, 9, 10});
-  result = cudf::top_k_order(input, 10, cudf::order::ASCENDING);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_order, result->view());
-
-  EXPECT_THROW(cudf::top_k(input, 101), std::invalid_argument);
-  EXPECT_THROW(cudf::top_k_order(input, 101), std::invalid_argument);
-}
-
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/sort/top_k_tests.cpp b/cpp/tests/sort/top_k_tests.cpp
new file mode 100644
index 00000000000..2f087612833
--- /dev/null
+++ b/cpp/tests/sort/top_k_tests.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+using TestTypes = cudf::test::
+  Concat;
+
+template <typename T>
+struct TopKTypes : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(TopKTypes, TestTypes);
+
+TYPED_TEST(TopKTypes, TopK)
+{
+  using T = TypeParam;
+
+  auto itr   = thrust::counting_iterator<int32_t>(0);
+  auto input = cudf::test::fixed_width_column_wrapper<T, int32_t>(
+    itr, itr + 100, cudf::test::iterators::null_at(4));
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<T, int32_t>({99, 98, 97, 96, 95, 94, 93, 92, 91, 90});
+  auto result = cudf::top_k(input, 10);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+  auto expected_order = cudf::test::fixed_width_column_wrapper<cudf::size_type>(
+    {99, 98, 97, 96, 95, 94, 93, 92, 91, 90});
+  result = cudf::top_k_order(input, 10);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_order, result->view());
+
+  result   = cudf::top_k(input, 10, cudf::order::ASCENDING);
+  expected = cudf::test::fixed_width_column_wrapper<T, int32_t>({0, 1, 2, 3, 5, 6, 7, 8, 9, 10});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+  expected_order =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 1, 2, 3, 5, 6, 7, 8, 9, 10});
+  result = cudf::top_k_order(input, 10, cudf::order::ASCENDING);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_order, result->view());
+}
+
+TYPED_TEST(TopKTypes, TopKSegmented)
+{
+  using T    = TypeParam;
+  using LCW  = cudf::test::lists_column_wrapper<T, int32_t>;
+  using LCWO = cudf::test::lists_column_wrapper<cudf::size_type>;
+
+  auto itr   = thrust::counting_iterator<int32_t>(0);
+  auto input = cudf::test::fixed_width_column_wrapper<T, int32_t>(
+    itr, itr + 100, cudf::test::iterators::null_at(4));
+  auto offsets = cudf::test::fixed_width_column_wrapper<cudf::size_type>(
+    {0, 15, 20, 23, 40, 42, 60, 70, 80, 90, 100});
+  {
+    // clang-format off
+    LCW expected({
+      {14, 13, 12}, {19, 18, 17}, {22, 21, 20}, {39, 38, 37}, {41, 40},
+      {59, 58, 57}, {69, 68, 67}, {79, 78, 77}, {89, 88, 87}, {99, 98, 97}});
+    LCWO expected_order({
+      {14, 13, 12}, {19, 18, 17}, {22, 21, 20}, {39, 38, 37}, {41, 40},
+      {59, 58, 57}, {69, 68, 67}, {79, 78, 77}, {89, 88, 87}, {99, 98, 97}});
+    // clang-format on
+    auto result = cudf::segmented_top_k(input, offsets, 3);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+    result = cudf::segmented_top_k_order(input, offsets, 3);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_order, result->view());
+  }
+
+  {
+    // clang-format off
+    LCW expected({
+      {0, 1, 2}, {15, 16, 17}, {20, 21, 22}, {23, 24, 25}, {40, 41},
+      {42, 43, 44}, {60, 61, 62}, {70, 71, 72}, {80, 81, 82}, {90, 91, 92}});
+    LCWO expected_order({
+      {0, 1, 2}, {15, 16, 17}, {20, 21, 22}, {23, 24, 25}, {40, 41},
+      {42, 43, 44}, {60, 61, 62}, {70, 71, 72}, {80, 81, 82}, {90, 91, 92}});
+    // clang-format on
+    auto result = cudf::segmented_top_k(input, offsets, 3, cudf::order::ASCENDING);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
+    result = cudf::segmented_top_k_order(input, offsets, 3, cudf::order::ASCENDING);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_order, result->view());
+  }
+}
+
+struct TopK : public cudf::test::BaseFixture {};
+
+TEST_F(TopK, Empty)
+{
+  auto input = cudf::test::fixed_width_column_wrapper<int32_t>({0, 1, 2, 3});
+
+  auto result = cudf::top_k(input, 0);
+  EXPECT_EQ(result->size(), 0);
+  result = cudf::top_k_order(input, 0);
+  EXPECT_EQ(result->size(), 0);
+  result = cudf::segmented_top_k(input, input, 0);
+  EXPECT_EQ(result->size(), 0);
+  result = cudf::segmented_top_k_order(input, input, 0);
+  EXPECT_EQ(result->size(), 0);
+}
+
+TEST_F(TopK, Errors)
+{
+  auto itr   = thrust::counting_iterator(0);
+  auto input = cudf::test::fixed_width_column_wrapper(itr, itr + 100);
+
+  EXPECT_THROW(cudf::top_k(input, -1), std::invalid_argument);
+  EXPECT_THROW(cudf::top_k_order(input, -1), std::invalid_argument);
+
+  auto offsets = cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 15, 20, 23, 40, 42});
+  EXPECT_THROW(cudf::segmented_top_k(input, offsets, -1), std::invalid_argument);
+  EXPECT_THROW(cudf::segmented_top_k_order(input, offsets, -1), std::invalid_argument);
+  offsets = cudf::test::fixed_width_column_wrapper<cudf::size_type>({});
+  EXPECT_THROW(cudf::segmented_top_k(input, offsets, 10), std::invalid_argument);
+  EXPECT_THROW(cudf::segmented_top_k_order(input, offsets, 10), std::invalid_argument);
+  offsets = cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 15}, {1, 0});
+  EXPECT_THROW(cudf::segmented_top_k(input, offsets, 10), std::invalid_argument);
+  EXPECT_THROW(cudf::segmented_top_k_order(input, offsets, 10), std::invalid_argument);
+
+  EXPECT_THROW(cudf::segmented_top_k(input, input, 10), cudf::data_type_error);
+  EXPECT_THROW(cudf::segmented_top_k_order(input, input, 10), cudf::data_type_error);
+}
From b761f86c7deb96e1cad656e7a63abc64ae88620b Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 22 Aug 2025 12:30:59 -0500
Subject: [PATCH 197/366] Optionally capture Shuffle Stats in cudf-polars pdsh
 benchmarks (#19762)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This updates the PDSH benchmarks to gather and persist the shuffle stats
from rapidsmpf after each query iteration. For example:

```
❯ python python/cudf_polars/cudf_polars/experimental/benchmarks/pdsh.py \
    --path /datasets/toaugspurger/tpch/scale-100/ --no-print-results \
    --executor streaming --scheduler distributed --n-workers=8 --protocol=ucx \
    --shuffle=rapidsmpf --rapidsmpf-dask-statistics --no-rapidsmpf-print-statistics \
    --iterations=1 --rmm-async 3
```

This will write the shuffle stats to the `pdsh_results.jsonl` file:

```
❯ tail -n 1 pdsh_results.jsonl | jq '.records."3"'
[
  {
    "query": 3,
    "duration": 5.394909042050131,
    "shuffle_stats": {
      "event-loop-check-future-finish": {
        "count": 561426,
        "value": 0.07810913599999943
      },
      ...
    }
  }
]
```

Additionally, we use the new configuration option from
https://github.com/rapidsai/rapidsmpf/pull/448 to control whether
statistics are printed to stdout.

Blocked by https://github.com/rapidsai/rapidsmpf/pull/452, which adds the
ability to clear statistics (which we do between each query iteration).
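For downstream analysis, the persisted stats can be read back from the JSONL
file. A minimal sketch (the file name and record layout are taken from the
example output above; treat the field handling as illustrative):

```python
import json

# Load the most recent benchmark run from the results file.
with open("pdsh_results.jsonl") as f:
    run = json.loads(f.readlines()[-1])

# "records" maps each query id to a list of per-iteration records.
for query_id, records in run["records"].items():
    for record in records:
        stats = record.get("shuffle_stats") or {}
        # Sum the "value" entries across all collected shuffle statistics.
        total = sum(entry.get("value", 0) for entry in stats.values())
        print(query_id, record["duration"], total)
```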
Authors: - Tom Augspurger (https://github.com/TomAugspurger) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19762 --- .../experimental/benchmarks/utils.py | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py index 8074488024c..53b492e2e82 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py @@ -52,6 +52,7 @@ class Record: query: int duration: float + shuffle_stats: dict[str, dict[str, int | float]] | None = None @dataclasses.dataclass @@ -187,7 +188,8 @@ class RunConfig: records: dict[int, list[Record]] = dataclasses.field(default_factory=dict) dataset_path: Path scale_factor: int | float - shuffle: str | None = None + shuffle: Literal["rapidsmpf", "tasks"] | None = None + gather_shuffle_stats: bool = False broadcast_join_limit: int | None = None blocksize: int | None = None max_rows_per_partition: int | None = None @@ -203,6 +205,12 @@ class RunConfig: spill_device: float query_set: str + def __post_init__(self) -> None: # noqa: D105 + if self.gather_shuffle_stats and self.shuffle != "rapidsmpf": + raise ValueError( + "gather_shuffle_stats is only supported when shuffle='rapidsmpf'." + ) + @classmethod def from_args(cls, args: argparse.Namespace) -> RunConfig: """Create a RunConfig from command line arguments.""" @@ -251,6 +259,7 @@ def from_args(cls, args: argparse.Namespace) -> RunConfig: scheduler=scheduler, n_workers=args.n_workers, shuffle=args.shuffle, + gather_shuffle_stats=args.rapidsmpf_dask_statistics, broadcast_join_limit=args.broadcast_join_limit, dataset_path=path, scale_factor=scale_factor, @@ -414,6 +423,7 @@ def initialize_dask_cluster(run_config: RunConfig, args: argparse.Namespace): # { "dask_spill_device": str(run_config.spill_device), "dask_statistics": str(args.rapidsmpf_dask_statistics), + "dask_print_statistics": str(args.rapidsmpf_print_statistics), "oom_protection": str(args.rapidsmpf_oom_protection), } ), @@ -630,6 +640,12 @@ def parse_args( "--rapidsmpf-dask-statistics", action=argparse.BooleanOptionalAction, default=False, + help="Collect rapidsmpf shuffle statistics. 
The output will be stored in the 'shuffle_stats' field of each record.",
+    )
+    parser.add_argument(
+        "--rapidsmpf-print-statistics",
+        action=argparse.BooleanOptionalAction,
+        default=False,
         help="Print rapidsmpf shuffle statistics on each Dask worker upon completion.",
     )
     parser.add_argument(
@@ -729,6 +745,17 @@ def run_polars(
 
         result = execute_query(q_id, i, q, run_config, args, engine)
 
+        if run_config.shuffle == "rapidsmpf" and run_config.gather_shuffle_stats:
+            from rapidsmpf.integrations.dask.shuffler import (
+                clear_shuffle_statistics,
+                gather_shuffle_statistics,
+            )
+
+            shuffle_stats = gather_shuffle_statistics(client)  # type: ignore[arg-type]
+            clear_shuffle_statistics(client)  # type: ignore[arg-type]
+        else:
+            shuffle_stats = None
+
         if args.validate and run_config.executor != "cpu":
             try:
                 assert_gpu_result_equal(
@@ -743,7 +770,7 @@ def run_polars(
                 print(f"❌ Query {q_id} failed validation!\n{e}")
 
         t1 = time.monotonic()
-        record = Record(query=q_id, duration=t1 - t0)
+        record = Record(query=q_id, duration=t1 - t0, shuffle_stats=shuffle_stats)
 
         if args.print_results:
             print(result)
From 4760b4bcdab36cc8cdb7292b65e3580e00d7a48d Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Fri, 22 Aug 2025 11:06:44 -0700
Subject: [PATCH 198/366] Expand compression codec coverage in ORC and Parquet
 benchmarks (#19760)

Added ZSTD and ZLIB to the list of covered compression codecs in the reader
and writer benchmarks, since their support in nvCOMP is now stable. Also
converted the IO and compression type template parameters into string axes,
to match the Parquet reader benchmarks and to allow changing the values used
at runtime.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/19760
---
 cpp/benchmarks/io/orc/orc_reader_input.cpp | 68 ++++++++-----------
 cpp/benchmarks/io/orc/orc_writer.cpp | 35 ++++------
 .../io/parquet/parquet_reader_input.cpp | 2 +-
 cpp/benchmarks/io/parquet/parquet_writer.cpp | 35 ++++------
 4 files changed, 59 insertions(+), 81 deletions(-)

diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index cafd3cc5c39..6a6b48bebac 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -87,14 +87,14 @@ void orc_read_common(cudf::size_type num_rows_to_read, } // namespace -template -void BM_orc_read_data(nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void BM_orc_read_data(nvbench::state& state, nvbench::type_list>) { auto const d_type = get_type_or_group(static_cast(DataType)); cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - cuio_source_sink_pair source_sink(IOType); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + cuio_source_sink_pair source_sink(source_type); auto const num_rows_written = [&]() { auto const tbl = create_random_table( @@ -112,16 +112,18 @@ void BM_orc_read_data(nvbench::state& state, orc_read_common(num_rows_written, source_sink, state); } -template +template void orc_read_io_compression(nvbench::state& state) { - auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), - static_cast(data_type::FLOAT), - static_cast(data_type::DECIMAL), - static_cast(data_type::TIMESTAMP), - static_cast(data_type::STRING), - static_cast(data_type::LIST), - static_cast(data_type::STRUCT)}); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); + auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), + static_cast(data_type::FLOAT), + static_cast(data_type::DECIMAL), + static_cast(data_type::TIMESTAMP), + static_cast(data_type::STRING), + static_cast(data_type::LIST), + static_cast(data_type::STRUCT)}); auto const [cardinality, run_length] = [&]() -> std::pair { if constexpr (chunked_read) { @@ -131,7 +133,7 @@ void orc_read_io_compression(nvbench::state& state) static_cast(state.get_int64("run_length"))}; } }(); - cuio_source_sink_pair source_sink(IOType); + cuio_source_sink_pair source_sink(source_type); auto const num_rows_written = [&]() { auto const tbl = create_random_table( @@ -142,7 +144,7 @@ void orc_read_io_compression(nvbench::state& state) cudf::io::orc_writer_options opts = cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view) - .compression(Compression); + .compression(compression); cudf::io::write_orc(opts); return view.num_rows(); }(); @@ -150,20 +152,14 @@ void orc_read_io_compression(nvbench::state& state) orc_read_common(num_rows_written, source_sink, state); } -template -void BM_orc_read_io_compression( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +void BM_orc_read_io_compression(nvbench::state& state) { - return orc_read_io_compression(state); + return orc_read_io_compression(state); } -template -void BM_orc_chunked_read_io_compression(nvbench::state& state, - nvbench::type_list>) +void BM_orc_chunked_read_io_compression(nvbench::state& state) { - // Only run benchmark using HOST_BUFFER IO. 
- return orc_read_io_compression(state); + return orc_read_io_compression(state); } using d_type_list = nvbench::enum_type_list; -using io_list = - nvbench::enum_type_list; - -using compression_list = - nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(BM_orc_read_data, - NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list)) +NVBENCH_BENCH_TYPES(BM_orc_read_data, NVBENCH_TYPE_AXES(d_type_list)) .set_name("orc_read_decode") - .set_type_axes_names({"data_type", "io"}) + .set_type_axes_names({"data_type"}) + .add_string_axis("io_type", {"DEVICE_BUFFER"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}); -NVBENCH_BENCH_TYPES(BM_orc_read_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) +NVBENCH_BENCH(BM_orc_read_io_compression) .set_name("orc_read_io_compression") - .set_type_axes_names({"io", "compression"}) + .add_string_axis("io_type", {"FILEPATH", "HOST_BUFFER", "DEVICE_BUFFER"}) + .add_string_axis("compression_type", {"SNAPPY", "ZSTD", "ZLIB", "NONE"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}); // Should have the same parameters as `BM_orc_read_io_compression` for comparison. -NVBENCH_BENCH_TYPES(BM_orc_chunked_read_io_compression, NVBENCH_TYPE_AXES(compression_list)) +NVBENCH_BENCH(BM_orc_chunked_read_io_compression) .set_name("orc_chunked_read_io_compression") - .set_type_axes_names({"compression"}) + .add_string_axis("io_type", {"DEVICE_BUFFER"}) + .add_string_axis("compression_type", {"SNAPPY", "ZSTD", "ZLIB", "NONE"}) .set_min_samples(4) // The input has approximately 520MB and 127K rows. // The limits below are given in MBs. diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index b795f3e3164..2021ed9e48d 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -82,10 +82,7 @@ void BM_orc_write_encode(nvbench::state& state, nvbench::type_list -void BM_orc_write_io_compression( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +void BM_orc_write_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), @@ -97,8 +94,8 @@ void BM_orc_write_io_compression( cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - auto const compression = Compression; - auto const sink_type = IO; + auto const sink_type = retrieve_io_type_enum(state.get_string("io_type")); + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); auto const tbl = create_random_table(cycle_dtypes(d_type, num_cols), @@ -131,10 +128,9 @@ void BM_orc_write_io_compression( state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size"); } -template -void BM_orc_write_statistics( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void BM_orc_write_statistics(nvbench::state& state, + nvbench::type_list>) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), static_cast(data_type::FLOAT), @@ -143,7 +139,7 @@ void BM_orc_write_statistics( static_cast(data_type::STRING), static_cast(data_type::LIST)}); - auto const compression = Compression; + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); auto const stats_freq = Statistics; auto const tbl = create_random_table(d_type, table_size_bytes{data_size}); @@ -183,11 +179,6 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; - -using compression_list = - nvbench::enum_type_list; - using stats_list = nvbench::enum_type_list; @@ -199,14 +190,16 @@ NVBENCH_BENCH_TYPES(BM_orc_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}); -NVBENCH_BENCH_TYPES(BM_orc_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) +NVBENCH_BENCH(BM_orc_write_io_compression) .set_name("orc_write_io_compression") - .set_type_axes_names({"io", "compression"}) + .add_string_axis("io_type", {"FILEPATH", "HOST_BUFFER", "VOID"}) + .add_string_axis("compression_type", {"SNAPPY", "ZSTD", "ZLIB", "NONE"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}); -NVBENCH_BENCH_TYPES(BM_orc_write_statistics, NVBENCH_TYPE_AXES(stats_list, compression_list)) +NVBENCH_BENCH_TYPES(BM_orc_write_statistics, NVBENCH_TYPE_AXES(stats_list)) .set_name("orc_write_statistics") - .set_type_axes_names({"statistics", "compression"}) + .set_type_axes_names({"statistics"}) + .add_string_axis("compression_type", {"SNAPPY", "ZSTD", "ZLIB", "NONE"}) .set_min_samples(4); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index d1699daff04..d6ae56aa2e4 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -420,7 +420,7 @@ NVBENCH_BENCH_TYPES(BM_parquet_read_data, NVBENCH_TYPE_AXES(d_type_list)) NVBENCH_BENCH(BM_parquet_read_io_compression) .set_name("parquet_read_io_compression") .add_string_axis("io_type", {"FILEPATH", "HOST_BUFFER", "DEVICE_BUFFER"}) - .add_string_axis("compression_type", {"SNAPPY", "NONE"}) + .add_string_axis("compression_type", {"SNAPPY", 
"ZSTD", "NONE"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 84e4b8b93c0..a81ae82cae4 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,10 +82,7 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list -void BM_parq_write_io_compression( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +void BM_parq_write_io_compression(nvbench::state& state) { auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL), static_cast(data_type::FLOAT), @@ -99,8 +96,8 @@ void BM_parq_write_io_compression( cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - auto const compression = Compression; - auto const sink_type = IO; + auto const sink_type = retrieve_io_type_enum(state.get_string("io_type")); + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); auto const tbl = create_random_table(cycle_dtypes(data_types, num_cols), @@ -133,13 +130,12 @@ void BM_parq_write_io_compression( state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size"); } -template -void BM_parq_write_varying_options( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void BM_parq_write_varying_options(nvbench::state& state, + nvbench::type_list>) { auto const enable_stats = Statistics; - auto const compression = Compression; + auto const compression = retrieve_compression_type_enum(state.get_string("compression_type")); auto const file_path = state.get_string("file_path"); auto const data_types = get_type_or_group({static_cast(data_type::INTEGRAL_SIGNED), @@ -191,11 +187,6 @@ using d_type_list = nvbench::enum_type_list; -using io_list = nvbench::enum_type_list; - -using compression_list = - nvbench::enum_type_list; - using stats_list = nvbench::enum_type_list Date: Fri, 22 Aug 2025 16:45:53 -0400 Subject: [PATCH 199/366] Support rank expression in cudf-polars (#19340) - Contributes to https://github.com/rapidsai/cudf/issues/19200 - Depends on https://github.com/pola-rs/polars/pull/23512 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19340 --- .../cudf_polars/dsl/expressions/unary.py | 56 +++++++++++++++++-- .../cudf_polars/dsl/utils/aggregations.py | 5 ++ .../expressions/test_numeric_unaryops.py | 45 ++++++++++++++- python/cudf_polars/tests/test_groupby.py | 6 ++ python/cudf_polars/tests/test_rolling.py | 7 +++ 5 files changed, 113 insertions(+), 6 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index e763775e37c..9eeb7eae609 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -107,14 +107,15 @@ class UnaryFunction(Expr): "as_struct", "drop_nulls", "fill_null", + "fill_null_with_strategy", "mask_nans", + 
"null_count", + "rank", "round", "set_sorted", + "top_k", "unique", "value_counts", - "fill_null_with_strategy", - "null_count", - "top_k", } ) _supported_cum_aggs = frozenset( @@ -138,13 +139,14 @@ def __init__( self.children = children self.is_pointwise = self.name not in ( "as_struct", - "cum_min", "cum_max", + "cum_min", "cum_prod", "cum_sum", "drop_nulls", - "unique", + "rank", "top_k", + "unique", ) if self.name not in UnaryFunction._supported_fns: @@ -159,6 +161,12 @@ def __init__( raise NotImplementedError( "Filling null values with limit specified is not yet supported." ) + if self.name == "rank": + method, _, _ = self.options + if method not in {"average", "min", "max", "dense", "ordinal"}: + raise NotImplementedError( + f"ranking with {method=} is not yet supported" + ) def do_evaluate( self, df: DataFrame, *, context: ExecutionContext = ExecutionContext.FRAME @@ -401,6 +409,44 @@ def do_evaluate( ), dtype=self.dtype, ) + elif self.name == "rank": + (column,) = (child.evaluate(df, context=context) for child in self.children) + method_str, descending, _ = self.options + + method = { + "average": plc.aggregation.RankMethod.AVERAGE, + "min": plc.aggregation.RankMethod.MIN, + "max": plc.aggregation.RankMethod.MAX, + "dense": plc.aggregation.RankMethod.DENSE, + "ordinal": plc.aggregation.RankMethod.FIRST, + }[method_str] + + order = ( + plc.types.Order.DESCENDING if descending else plc.types.Order.ASCENDING + ) + + ranked: plc.Column = plc.sorting.rank( + column.obj, + method, + order, + plc.types.NullPolicy.EXCLUDE, + plc.types.NullOrder.BEFORE if descending else plc.types.NullOrder.AFTER, + percentage=False, + ) + + # Min/Max/Dense/Ordinal -> IDX_DTYPE + # See https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/rank.rs + if method_str in {"min", "max", "dense", "ordinal"}: + dest = self.dtype.plc.id() + src = ranked.type().id() + if dest == plc.TypeId.UINT32 and src != plc.TypeId.UINT32: + ranked = plc.unary.cast(ranked, plc.DataType(plc.TypeId.UINT32)) + elif ( + dest == plc.TypeId.UINT64 and src != plc.TypeId.UINT64 + ): # pragma: no cover + ranked = plc.unary.cast(ranked, plc.DataType(plc.TypeId.UINT64)) + + return Column(ranked, dtype=self.dtype) elif self.name == "top_k": (column, k) = ( child.evaluate(df, context=context) for child in self.children diff --git a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py index 2cd7cde44ef..5ddf060c41c 100644 --- a/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py +++ b/python/cudf_polars/cudf_polars/dsl/utils/aggregations.py @@ -89,6 +89,11 @@ def decompose_single_agg( """ agg = named_expr.value name = named_expr.name + if isinstance(agg, expr.UnaryFunction) and agg.name in {"rank"}: + name = agg.name + raise NotImplementedError( + f"UnaryFunction {name=} not supported in groupby context" + ) if isinstance(agg, expr.UnaryFunction) and agg.name == "null_count": (child,) = agg.children diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py index dd3c443223c..8df62b18526 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py @@ -8,7 +8,11 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) +from cudf_polars.utils.versions import 
POLARS_VERSION_LT_132 @pytest.fixture( @@ -112,3 +116,42 @@ def test_null_count(): pl.col("baz").is_null().sum(), ) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("method", ["ordinal", "dense", "min", "max", "average"]) +@pytest.mark.parametrize("descending", [False, True]) +def test_rank_supported(request, ldf: pl.LazyFrame, method: str, *, descending: bool): + request.applymarker( + pytest.mark.xfail(condition=POLARS_VERSION_LT_132, reason="nested loop join") + ) + expr = pl.col("a").rank(method=method, descending=descending) + q = ldf.select(expr) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("method", ["ordinal", "dense", "min", "max", "average"]) +@pytest.mark.parametrize("descending", [False, True]) +@pytest.mark.parametrize("test", ["with_nulls", "with_ties"]) +def test_rank_methods_with_nulls_or_ties( + request, ldf: pl.LazyFrame, method: str, *, descending: bool, test: str +) -> None: + request.applymarker( + pytest.mark.xfail(condition=POLARS_VERSION_LT_132, reason="nested loop join") + ) + + base = pl.col("a") + if test == "with_nulls": + expr = pl.when((base % 2) == 0).then(None).otherwise(base) + else: + expr = pl.when((base % 2) == 0).then(pl.lit(-5)).otherwise(base) + + q = ldf.select(expr.rank(method=method, descending=descending)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("seed", [42]) +@pytest.mark.parametrize("method", ["random"]) +def test_rank_unsupported(ldf: pl.LazyFrame, method: str, seed: int) -> None: + expr = pl.col("a").rank(method=method, seed=seed) + q = ldf.select(expr) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index d900737c62b..3261afa7b7c 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -413,3 +413,9 @@ def test_groupby_fill_null_with_strategy(strategy): q = lf.group_by("key").agg(pl.col("val").fill_null(strategy=strategy)) assert_ir_translation_raises(q, NotImplementedError) + + +def test_groupby_rank_raises(df: pl.LazyFrame) -> None: + q = df.group_by("key1").agg(pl.col("int").rank()) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_rolling.py b/python/cudf_polars/tests/test_rolling.py index 9dbe75e14ba..53cae2180fe 100644 --- a/python/cudf_polars/tests/test_rolling.py +++ b/python/cudf_polars/tests/test_rolling.py @@ -272,3 +272,10 @@ def test_rolling_ternary_supported(df, expr): def test_rolling_ternary_unsupported(df, expr): q = df.rolling("dt", period="48h", closed="both").agg(expr.alias("out")) assert_ir_translation_raises(q, NotImplementedError) + + +def test_rolling_rank_unsupported(df): + q = df.rolling("dt", period="48h", closed="both").agg( + pl.col("values").rank(method="dense", descending=False) + ) + assert_ir_translation_raises(q, NotImplementedError) From 24403ca74808797e6043c14c4dceafff786a5ace Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 22 Aug 2025 14:30:22 -0700 Subject: [PATCH 200/366] Remove validation on import (#19775) This validation can be slow and adds unnecessary import overhead. It was added when cudf and cudf packaging was much less mature. These days if the install environment is not compatible for any reason the user should see comprehensible errors later from lower-level libraries without us doing an extra check ourselves. 
Contributes to #627 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19775 --- python/cudf/cudf/__init__.py | 5 - python/cudf/cudf/utils/gpu_utils.py | 162 ---------------------------- 2 files changed, 167 deletions(-) delete mode 100644 python/cudf/cudf/utils/gpu_utils.py diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index b56fe69568c..8f18c3609d0 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -10,11 +10,6 @@ libcudf.load_library() del libcudf -from cudf.utils.gpu_utils import validate_setup - -validate_setup() - -del validate_setup import cupy from numba import cuda diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py deleted file mode 100644 index 1ff638d8830..00000000000 --- a/python/cudf/cudf/utils/gpu_utils.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - - -def validate_setup(): - import os - - # TODO: Remove the following check once we arrive at a solution for #4827 - # This is a temporary workaround to unblock internal testing - # related issue: https://github.com/rapidsai/cudf/issues/4827 - if ( - "RAPIDS_NO_INITIALIZE" in os.environ - or "CUDF_NO_INITIALIZE" in os.environ - ): - return - - import warnings - - from cuda.bindings.runtime import ( - cudaDeviceAttr, - cudaError_t, - ) - - from rmm._cuda.gpu import ( - CUDARuntimeError, - deviceGetName, - driverGetVersion, - getDeviceAttribute, - getDeviceCount, - runtimeGetVersion, - ) - - from cudf.errors import UnsupportedCUDAError - - notify_caller_errors = { - cudaError_t.cudaErrorInitializationError, - cudaError_t.cudaErrorInvalidDeviceFunction, - cudaError_t.cudaErrorInvalidDevice, - cudaError_t.cudaErrorStartupFailure, - cudaError_t.cudaErrorInvalidKernelImage, - cudaError_t.cudaErrorAlreadyAcquired, - cudaError_t.cudaErrorOperatingSystem, - cudaError_t.cudaErrorNotPermitted, - cudaError_t.cudaErrorNotSupported, - cudaError_t.cudaErrorSystemNotReady, - cudaError_t.cudaErrorSystemDriverMismatch, - cudaError_t.cudaErrorCompatNotSupportedOnDevice, - cudaError_t.cudaErrorDeviceUninitialized, - cudaError_t.cudaErrorTimeout, - cudaError_t.cudaErrorUnknown, - cudaError_t.cudaErrorApiFailureBase, - } - - try: - gpus_count = getDeviceCount() - except CUDARuntimeError as e: - if e.status in notify_caller_errors: - raise e - - # We must distinguish between "CPU only" and "the driver is - # insufficient for the runtime". - if e.status == cudaError_t.cudaErrorInsufficientDriver: - # cudaDriverGetVersion() returns 0 when ``libcuda.so`` is - # missing. Otherwise there is a CUDA driver but it is - # insufficient for the runtime, so we re-raise the original - # exception - if driverGetVersion() != 0: - raise e - - # If there is no GPU detected, set `gpus_count` to -1 - gpus_count = -1 - except RuntimeError as e: - # When using cuda-python < 12.9, getDeviceCount() can raise a - # RuntimeError if ``libcuda.so`` is missing. We don't want this to - # propagate up to the user. 
- warnings.warn(str(e)) - return - - if gpus_count > 0: - # Cupy throws RunTimeException to get GPU count, - # hence obtaining GPU count by in-house cpp api above - - major_version = getDeviceAttribute( - cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, 0 - ) - - if major_version < 7: - # A GPU with NVIDIA Volta™ architecture or newer is required. - # Reference: https://developer.nvidia.com/cuda-gpus - # Hardware Generation Compute Capability - # Hopper 9.x - # Ampere 8.x - # Turing 7.5 - # Volta 7.0, 7.2 - # Pascal 6.x - # Maxwell 5.x - # Kepler 3.x - # Fermi 2.x - device_name = deviceGetName(0) - minor_version = getDeviceAttribute( - cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, 0 - ) - raise UnsupportedCUDAError( - "A GPU with NVIDIA Volta™ (Compute Capability 7.0) " - "or newer architecture is required.\n" - f"Detected GPU 0: {device_name}\n" - f"Detected Compute Capability: {major_version}.{minor_version}" - ) - - cuda_runtime_version = runtimeGetVersion() - - if cuda_runtime_version < 12000: - # Require CUDA Runtime version 12.0 or greater. - major_version = cuda_runtime_version // 1000 - minor_version = (cuda_runtime_version % 1000) // 10 - raise UnsupportedCUDAError( - "Detected CUDA Runtime version is " - f"{major_version}.{minor_version}. " - "Please update your CUDA Runtime to 12.0 or above." - ) - - cuda_driver_supported_rt_version = driverGetVersion() - - # Though Yes, Externally driver version is represented like `418.39` - # and cuda runtime version like `10.1`. It is not the similar case - # at cuda api's level. Coming down to APIs they follow a uniform - # convention of an integer which corresponds to the versioning - # like (1000 major + 10 minor) for 10.1 Driver version API doesn't - # actually indicate driver version, it indicates only the latest - # CUDA version supported by the driver. - # For reference : - # https://docs.nvidia.com/deploy/cuda-compatibility/index.html - - if cuda_driver_supported_rt_version == 0: - raise UnsupportedCUDAError( - "We couldn't detect the GPU driver properly. Please follow " - "the installation guide to ensure your driver is properly " - "installed: " - "https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" - ) - elif cuda_driver_supported_rt_version >= cuda_runtime_version: - # CUDA Driver Version Check: - # Driver Runtime version is >= Runtime version - pass - elif ( - cuda_driver_supported_rt_version >= 12000 - and cuda_runtime_version >= 12000 - ): - # With cuda enhanced compatibility any code compiled - # with 12.x version of cuda can now run on any - # driver >= 525.60.13. 12000 is the minimum cuda - # version 525.60.13 supports. - pass - else: - raise UnsupportedCUDAError( - "Please update your NVIDIA GPU Driver to support CUDA " - "Runtime.\n" - f"Detected CUDA Runtime version : {cuda_runtime_version}\n" - "Latest version of CUDA supported by current " - f"NVIDIA GPU Driver : {cuda_driver_supported_rt_version}" - ) - else: - warnings.warn("No NVIDIA GPU detected") From 3a4a9ff4e622da6849cfa19d0fc870cbb774a561 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 22 Aug 2025 17:28:23 -0500 Subject: [PATCH 201/366] Vendor libnvcomp in libcudf (#19743) Now that kvikio no longer uses nvcomp, we need to vendor libnvcomp in the libcudf wheels so that nvcomp can be fully removed as a dependency of kvikio. Merge this just before https://github.com/rapidsai/kvikio/pull/805. 
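As a quick post-install check (a hedged sketch, not part of the change
itself), the vendored library can be exercised through the loader updated
below: `libcudf.load_library()` now dlopens the bundled `libnvcomp.so.5`
before loading `libcudf.so`.

```python
# Minimal sanity check after installing the libcudf wheel.
import libcudf

# Loads libnvcomp.so.5 first (vendored under libcudf/lib64/), then returns
# the handle for libcudf.so, or None if loading is deferred to RPATHs.
handle = libcudf.load_library()
print(handle)
```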
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Mike Sarahan (https://github.com/msarahan) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19743 --- build.sh | 6 ---- ci/build_wheel_cudf.sh | 2 +- ci/build_wheel_libcudf.sh | 6 +--- ci/build_wheel_pylibcudf.sh | 2 +- cpp/CMakeLists.txt | 1 - cpp/cmake/thirdparty/get_nvcomp.cmake | 4 +-- python/libcudf/CMakeLists.txt | 50 ++++++++++++++++----------- python/libcudf/libcudf/load.py | 22 +++++++----- 8 files changed, 47 insertions(+), 46 deletions(-) diff --git a/build.sh b/build.sh index d4443695347..28a3a30738c 100755 --- a/build.sh +++ b/build.sh @@ -73,7 +73,6 @@ BUILD_PER_THREAD_DEFAULT_STREAM=OFF BUILD_REPORT_METRICS=OFF BUILD_REPORT_INCL_CACHE_STATS=OFF BUILD_DISABLE_LARGE_STRINGS=OFF -USE_PROPRIETARY_NVCOMP=ON PYTHON_ARGS_FOR_INSTALL=("-m" "pip" "install" "--no-build-isolation" "--no-deps" "--config-settings" "rapidsai.disable-cuda=true") # Set defaults for vars that may not have been defined externally @@ -153,7 +152,6 @@ function buildLibCudfJniInDocker { -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} \ -DCMAKE_INSTALL_PREFIX=/usr/local/rapids \ -DUSE_NVTX=ON \ - -DCUDF_USE_PROPRIETARY_NVCOMP=ON \ -DCUDF_USE_ARROW_STATIC=ON \ -DCUDF_ENABLE_ARROW_S3=OFF \ -DBUILD_TESTS=OFF \ @@ -221,9 +219,6 @@ fi if hasArg --disable_nvtx; then BUILD_NVTX="OFF" fi -if hasArg --opensource_nvcomp; then - USE_PROPRIETARY_NVCOMP="OFF" -fi if hasArg --show_depr_warn; then BUILD_DISABLE_DEPRECATION_WARNINGS=OFF fi @@ -292,7 +287,6 @@ if buildAll || hasArg libcudf; then -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" \ -DCMAKE_CUDA_ARCHITECTURES="${CUDF_CMAKE_CUDA_ARCHITECTURES}" \ -DUSE_NVTX=${BUILD_NVTX} \ - -DCUDF_USE_PROPRIETARY_NVCOMP=${USE_PROPRIETARY_NVCOMP} \ -DBUILD_TESTS=${BUILD_TESTS} \ -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \ diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index af289fe7229..8b75a01479f 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -24,7 +24,7 @@ echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${PYLIBCUDF_WHEELHOUSE} # repair wheels and write to the location that artifact-uploading code expects to find them python -m auditwheel repair \ --exclude libcudf.so \ - --exclude libnvcomp.so \ + --exclude libnvcomp.so.* \ --exclude libkvikio.so \ --exclude librapids_logger.so \ --exclude librmm.so \ diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index ae0ab29c7f8..ea2a818d97f 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -27,11 +27,7 @@ rapids-pip-retry install \ # 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) export PIP_NO_BUILD_ISOLATION=0 -# TODO(nvcomp): when `nvcomp` supports Python 3.13 and we de-vendor `nvcomp` from `kvikio` -# this should be switched back to using the nvcomp runtime wheel -# https://github.com/rapidsai/build-planning/issues/171 -# export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_FROM_LIBKVIKIO_WHEEL=ON" +export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=OFF" ./ci/build_wheel.sh "${package_name}" "${package_dir}" # repair wheels and write to the location that artifact-uploading code expects to find them diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 3f99f75ceb4..4c09752626f 100755 --- a/ci/build_wheel_pylibcudf.sh 
+++ b/ci/build_wheel_pylibcudf.sh @@ -22,7 +22,7 @@ echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${LIBCUDF_WHEELHOUSE}/lib # repair wheels and write to the location that artifact-uploading code expects to find them python -m auditwheel repair \ --exclude libcudf.so \ - --exclude libnvcomp.so \ + --exclude libnvcomp.so.* \ --exclude libkvikio.so \ --exclude librapids_logger.so \ --exclude librmm.so \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8ee45a34df7..5eb59323caa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -53,7 +53,6 @@ option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) -option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 33b1b45fb44..bb5c0c3c215 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -20,7 +20,7 @@ function(find_and_configure_nvcomp) if(CUDF_EXPORT_NVCOMP) set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) endif() - rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ON) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index ca737af08f3..6e4c525edbd 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -15,18 +15,18 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) include(../../cmake/rapids_config.cmake) +include(rapids-cpm) +include(rapids-cuda) +rapids_cuda_init_architectures(libcudf-python) +rapids_cpm_init() project( libcudf-python VERSION "${RAPIDS_VERSION}" - LANGUAGES CXX + LANGUAGES CXX CUDA ) -option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) - -# TODO(nvcomp): when `nvcomp` supports Python 3.13 and we de-vendor `nvcomp` from `kvikio` this -# option should be removed https://github.com/rapidsai/build-planning/issues/171 -option(USE_NVCOMP_FROM_LIBKVIKIO_WHEEL "Use nvcomp bundled with libkvikio" OFF) +option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of vendoring nvcomp" OFF) # Check if cudf is already available. If so, it is the user's responsibility to ensure that the # CMake package is also available at build time of the Python cudf package. 
@@ -42,12 +42,31 @@ set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +set(CUDF_EXPORT_NVCOMP OFF) + +include(../../cpp/cmake/thirdparty/get_nvcomp.cmake) -# TODO(nvcomp): when `nvcomp` supports Python 3.13 and we de-vendor `nvcomp` from `kvikio` the -# libkvikio branch should be removed -if(USE_NVCOMP_RUNTIME_WHEEL OR USE_NVCOMP_FROM_LIBKVIKIO_WHEEL) - set(CUDF_EXPORT_NVCOMP OFF) +# Install only the specific libnvcomp.so.* library instead of all nvcomp targets +if(TARGET nvcomp::nvcomp) + get_target_property(is_imported nvcomp::nvcomp IMPORTED) + if(is_imported) + get_target_property(nvcomp_lib_path nvcomp::nvcomp IMPORTED_LOCATION_RELEASE) + # Compute the SOVERSION from the library path + get_filename_component(nvcomp_lib_dir ${nvcomp_lib_path} DIRECTORY) + get_filename_component(nvcomp_lib_name ${nvcomp_lib_path} NAME) + string(REGEX REPLACE "libnvcomp\\.so\\.([0-9]+)" "\\1" nvcomp_soversion ${nvcomp_lib_name}) + install( + FILES ${nvcomp_lib_path} + DESTINATION ${SKBUILD_PLATLIB_DIR}/libcudf/lib64/ + RENAME libnvcomp.so.${nvcomp_soversion} + ) + else() + message(FATAL_ERROR "nvcomp target must be imported") + endif() +else() + message(FATAL_ERROR "nvcomp target not found") endif() + set(CUDA_STATIC_RUNTIME ON) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) @@ -62,14 +81,3 @@ if(USE_NVCOMP_RUNTIME_WHEEL) APPEND ) endif() - -# TODO(nvcomp): when `nvcomp` supports Python 3.13 and we de-vendor `nvcomp` from `kvikio` this -# block should be removed -if(USE_NVCOMP_FROM_LIBKVIKIO_WHEEL) - set(rpaths "$ORIGIN/../../libkvikio/lib64") - set_property( - TARGET cudf - PROPERTY INSTALL_RPATH ${rpaths} - APPEND - ) -endif() diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py index 4198fcbe385..d32d139d945 100644 --- a/python/libcudf/libcudf/load.py +++ b/python/libcudf/libcudf/load.py @@ -62,29 +62,33 @@ def load_library(): # we assume the library is discoverable on system paths. pass + _load_library("libnvcomp.so.5") + return _load_library("libcudf.so") + + +def _load_library(soname): prefer_system_installation = ( os.getenv("RAPIDS_LIBCUDF_PREFER_SYSTEM_LIBRARY", "false").lower() != "false" ) - soname = "libcudf.so" - libcudf_lib = None + found_lib = None if prefer_system_installation: # Prefer a system library if one is present to # avoid clobbering symbols that other packages might expect, but if no # other library is present use the one in the wheel. try: - libcudf_lib = _load_system_installation(soname) + found_lib = _load_system_installation(soname) except OSError: - libcudf_lib = _load_wheel_installation(soname) + found_lib = _load_wheel_installation(soname) else: # Prefer the libraries bundled in this package. If they aren't found # (which might be the case in builds where the library was prebuilt before # packaging the wheel), look for a system installation. try: - libcudf_lib = _load_wheel_installation(soname) - if libcudf_lib is None: - libcudf_lib = _load_system_installation(soname) + found_lib = _load_wheel_installation(soname) + if found_lib is None: + found_lib = _load_system_installation(soname) except OSError: # If none of the searches above succeed, just silently return None # and rely on other mechanisms (like RPATHs on other DSOs) to @@ -93,5 +97,5 @@ def load_library(): # The caller almost never needs to do anything with this library, but no # harm in offering the option since this object at least provides a handle - # to inspect where libcudf was loaded from. 
- return libcudf_lib + # to inspect where the library was loaded from. + return found_lib From ac044f14ecf41de2c1957b5d78a8a79908984f2f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 22 Aug 2025 15:39:22 -0700 Subject: [PATCH 202/366] Moves test_options to cudf testing directory, clean up old, stubbed testing files in directory (#19698) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19698 --- python/cudf/cudf/tests/conftest.py | 33 ++ .../datetimeindex/methods/test_tz_convert.py | 6 + .../indexes/datetimeindex/test_attributes.py | 27 ++ .../cudf/tests/indexes/test_categorical.py | 1 - .../cudf/cudf/tests/indexes/test_combining.py | 1 - .../cudf/tests/indexes/test_computation.py | 1 - .../cudf/tests/indexes/test_constructing.py | 1 - .../cudf/tests/indexes/test_conversion.py | 1 - .../cudf/tests/indexes/test_memory_usage.py | 1 - .../cudf/cudf/tests/indexes/test_missing.py | 1 - .../cudf/cudf/tests/indexes/test_modifying.py | 1 - .../tests/indexes/test_multiindex_compat.py | 1 - .../cudf/cudf/tests/indexes/test_numeric.py | 1 - .../cudf/tests/indexes/test_properties.py | 1 - .../cudf/cudf/tests/indexes/test_selecting.py | 1 - .../cudf/cudf/tests/indexes/test_sorting.py | 1 - .../cudf/tests/indexes/test_time_specific.py | 1 - python/cudf/cudf/tests/lists/__init__.py | 0 .../cudf/tests/lists/test_list_methods.py | 1 - .../cudf/cudf/tests/options/test_options.py | 131 +++++++- .../cudf/tests/series/accessors/test_dt.py | 157 +++++++++ .../cudf/tests/series/methods/test_astype.py | 25 ++ .../test_convert_dtypes.py} | 6 +- .../tests/series/methods/test_to_pandas.py | 13 + .../cudf/tests/series/test_constructors.py | 37 +++ .../cudf/tests/series/test_datetimelike.py | 300 ------------------ python/cudf/cudf/tests/strings/__init__.py | 0 .../cudf/tests/strings/test_string_methods.py | 1 - python/cudf/cudf/tests/test_options.py | 129 -------- 29 files changed, 429 insertions(+), 451 deletions(-) delete mode 100644 python/cudf/cudf/tests/indexes/test_categorical.py delete mode 100644 python/cudf/cudf/tests/indexes/test_combining.py delete mode 100644 python/cudf/cudf/tests/indexes/test_computation.py delete mode 100644 python/cudf/cudf/tests/indexes/test_constructing.py delete mode 100644 python/cudf/cudf/tests/indexes/test_conversion.py delete mode 100644 python/cudf/cudf/tests/indexes/test_memory_usage.py delete mode 100644 python/cudf/cudf/tests/indexes/test_missing.py delete mode 100644 python/cudf/cudf/tests/indexes/test_modifying.py delete mode 100644 python/cudf/cudf/tests/indexes/test_multiindex_compat.py delete mode 100644 python/cudf/cudf/tests/indexes/test_numeric.py delete mode 100644 python/cudf/cudf/tests/indexes/test_properties.py delete mode 100644 python/cudf/cudf/tests/indexes/test_selecting.py delete mode 100644 python/cudf/cudf/tests/indexes/test_sorting.py delete mode 100644 python/cudf/cudf/tests/indexes/test_time_specific.py delete mode 100644 python/cudf/cudf/tests/lists/__init__.py delete mode 100644 python/cudf/cudf/tests/lists/test_list_methods.py rename python/cudf/cudf/tests/series/{test_conversion.py => methods/test_convert_dtypes.py} (90%) delete mode 100644 python/cudf/cudf/tests/series/test_datetimelike.py delete mode 100644 python/cudf/cudf/tests/strings/__init__.py delete mode 100644 
python/cudf/cudf/tests/strings/test_string_methods.py delete mode 100644 python/cudf/cudf/tests/test_options.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index b9c21a67c43..f4157430185 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -5,9 +5,11 @@ import operator import os import pathlib +import zoneinfo import cupy as cp import numpy as np +import pandas as pd import pytest import rmm # noqa: F401 @@ -176,6 +178,37 @@ def pytest_runtest_makereport(item, call): setattr(item, "report", {rep.when: rep}) +def _get_all_zones(): + zones = [] + for zone in zoneinfo.available_timezones(): + # TODO: pandas 3.0 defaults to zoneinfo, + # so all_zone_names can use zoneinfo.available_timezones() + try: + pd.DatetimeTZDtype("ns", zone) + except KeyError: + continue + else: + zones.append(zone) + return sorted(zones) + + +# NOTE: _get_all_zones is a very large list; we likely do NOT want to +# use it for more than a handful of tests +@pytest.fixture(params=_get_all_zones()) +def all_timezones(request): + return request.param + + +@pytest.fixture( + params=["America/New_York", "Asia/Tokyo", "CET", "Etc/GMT+1", "UTC"] +) +def limited_timezones(request): + """ + Small representative set of timezones for testing. + """ + return request.param + + @pytest.fixture( params=[ { diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_convert.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_convert.py index 1c026224da3..816b2e108b1 100644 --- a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_convert.py +++ b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_tz_convert.py @@ -2,6 +2,7 @@ import zoneinfo import pandas as pd +import pytest import cudf from cudf.testing import assert_eq @@ -14,3 +15,8 @@ def test_tz_convert(): pidx = pidx.tz_localize("UTC") idx = idx.tz_localize("UTC") assert_eq(pidx.tz_convert(tz), idx.tz_convert(tz)) + + +def test_tz_convert_naive_typeerror(): + with pytest.raises(TypeError, match="Cannot convert tz-naive timestamps"): + cudf.date_range("2020", periods=2, freq="D").tz_convert(None) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py b/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py index 8d230451886..d5b24a6a88e 100644 --- a/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py @@ -9,6 +9,33 @@ from cudf.testing import assert_eq +@pytest.mark.parametrize( + "item, expected", + [ + ["2020-01-01", False], + ["2020-01-01T00:00:00+00:00", True], + ["2020-01-01T00:00:00-08:00", False], + ["2019-12-31T16:00:00-08:00", True], + ], +) +def test_contains_tz_aware(item, expected): + dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC") + result = item in dti + assert result == expected + + +def test_tz_aware_attributes_local(): + data = [ + "2008-05-12 13:50:00", + "2008-12-12 14:50:35", + "2009-05-12 13:50:32", + ] + dti = cudf.DatetimeIndex(data).tz_localize("UTC").tz_convert("US/Eastern") + result = dti.hour + expected = cudf.Index([9, 9, 9], dtype="int16") + assert_eq(result, expected) + + @pytest.mark.parametrize( "field", [ diff --git a/python/cudf/cudf/tests/indexes/test_categorical.py b/python/cudf/cudf/tests/indexes/test_categorical.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_categorical.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA 
CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_combining.py b/python/cudf/cudf/tests/indexes/test_combining.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_combining.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_computation.py b/python/cudf/cudf/tests/indexes/test_computation.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_computation.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_constructing.py b/python/cudf/cudf/tests/indexes/test_constructing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_constructing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_conversion.py b/python/cudf/cudf/tests/indexes/test_conversion.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_conversion.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_memory_usage.py b/python/cudf/cudf/tests/indexes/test_memory_usage.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_memory_usage.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_missing.py b/python/cudf/cudf/tests/indexes/test_missing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_missing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_modifying.py b/python/cudf/cudf/tests/indexes/test_modifying.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_modifying.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_multiindex_compat.py b/python/cudf/cudf/tests/indexes/test_multiindex_compat.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_multiindex_compat.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_numeric.py b/python/cudf/cudf/tests/indexes/test_numeric.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_numeric.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_properties.py b/python/cudf/cudf/tests/indexes/test_properties.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_properties.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_selecting.py b/python/cudf/cudf/tests/indexes/test_selecting.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_selecting.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
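The conftest.py hunk earlier in this patch replaces walking /usr/share/zoneinfo with the stdlib zoneinfo module. A minimal standalone sketch of that filtering pattern, with an illustrative helper name that is not part of the patch:

import zoneinfo

import pandas as pd


def timezones_pandas_accepts() -> list[str]:
    # Keep only the zones pandas can build a DatetimeTZDtype for; pandas
    # (before 3.0) may not recognize every zone that zoneinfo ships.
    accepted = []
    for zone in zoneinfo.available_timezones():
        try:
            pd.DatetimeTZDtype("ns", zone)
        except KeyError:
            continue
        accepted.append(zone)
    return sorted(accepted)

Sorting the result keeps the pytest parametrization order deterministic across runs, which matters when the all_timezones fixture fans out into hundreds of test cases.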
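The new options/test_options.py below exercises cudf's public options API. For orientation, a short sketch that uses only calls appearing in those tests:

import cudf

# Print an option's description, default, and current value.
cudf.describe_option("default_integer_bitwidth")

# Temporarily override an option; the previous value is restored on exit.
with cudf.option_context("default_integer_bitwidth", 32):
    assert cudf.get_option("default_integer_bitwidth") == 32

# Or set it for the rest of the session; the tests show None is a valid value.
cudf.set_option("default_integer_bitwidth", None)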
diff --git a/python/cudf/cudf/tests/indexes/test_sorting.py b/python/cudf/cudf/tests/indexes/test_sorting.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_sorting.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/test_time_specific.py b/python/cudf/cudf/tests/indexes/test_time_specific.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/test_time_specific.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/lists/__init__.py b/python/cudf/cudf/tests/lists/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/lists/test_list_methods.py b/python/cudf/cudf/tests/lists/test_list_methods.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/lists/test_list_methods.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/options/test_options.py b/python/cudf/cudf/tests/options/test_options.py index 06777c8e6af..3f994959ec9 100644 --- a/python/cudf/cudf/tests/options/test_options.py +++ b/python/cudf/cudf/tests/options/test_options.py @@ -1 +1,130 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. + +from contextlib import redirect_stdout +from io import StringIO + +import pytest + +import cudf + + +@pytest.fixture(scope="class", autouse=False) +def empty_option_environment(): + old_option_environment = cudf.options._OPTIONS + cudf.options._OPTIONS = {} + yield + cudf.options._OPTIONS = old_option_environment + + +@pytest.fixture(scope="function") +def odd_option(empty_option_environment): + def validator(x): + if not x % 2 == 1: + raise ValueError(f"Invalid option value {x}") + + cudf.options._register_option( + "odd_option", + 1, + "An odd option.", + validator, + ) + yield + del cudf.options._OPTIONS["odd_option"] + + +@pytest.fixture(scope="function") +def even_option(empty_option_environment): + def validator(x): + if not x % 2 == 0: + raise ValueError(f"Invalid option value {x}") + + cudf.options._register_option( + "even_option", 0, "An even option.", validator + ) + yield + del cudf.options._OPTIONS["even_option"] + + +@pytest.mark.usefixtures("odd_option", "even_option") +class TestCleanOptions: + def test_option_get_set(odd_option): + assert cudf.get_option("odd_option") == 1 + cudf.set_option("odd_option", 101) + assert cudf.get_option("odd_option") == 101 + + def test_option_set_invalid(odd_option): + with pytest.raises(ValueError, match="Invalid option value 0"): + cudf.set_option("odd_option", 0) + + def test_option_description(odd_option): + s = StringIO() + with redirect_stdout(s): + cudf.describe_option("odd_option") + s.seek(0) + expected = ( + "odd_option:\n\tAn odd option.\n\t[Default: 1] [Current: 1]\n" + ) + assert expected == s.read() + + def test_option_description_all(odd_option, even_option): + s = StringIO() + with redirect_stdout(s): + cudf.describe_option() + s.seek(0) + expected = ( + "odd_option:\n\tAn odd option.\n\t[Default: 1] [Current: 1]\n" + "even_option:\n\tAn even option.\n\t[Default: 0] [Current: 0]\n" + ) + assert expected == s.read() + + +@pytest.mark.parametrize("default_integer_bitwidth", [32, 64, None]) +def test_empty_option_context(default_integer_bitwidth): + with cudf.option_context( + "default_integer_bitwidth", default_integer_bitwidth + ): + with 
cudf.option_context(): + assert ( + cudf.get_option("default_integer_bitwidth") + == default_integer_bitwidth + ) + + assert ( + cudf.get_option("default_integer_bitwidth") + == default_integer_bitwidth + ) + + +@pytest.mark.parametrize("pandas_compatible", [True, False]) +@pytest.mark.parametrize("default_integer_bitwidth", [32, 64]) +def test_option_context(pandas_compatible, default_integer_bitwidth): + prev_pandas_compatible_setting = cudf.get_option("mode.pandas_compatible") + prev_width_setting = cudf.get_option("default_integer_bitwidth") + + with cudf.option_context( + "mode.pandas_compatible", + pandas_compatible, + "default_integer_bitwidth", + default_integer_bitwidth, + ): + assert cudf.get_option("mode.pandas_compatible") is pandas_compatible + assert ( + cudf.get_option("default_integer_bitwidth") + is default_integer_bitwidth + ) + + assert ( + cudf.get_option("mode.pandas_compatible") + is prev_pandas_compatible_setting + ) + assert cudf.get_option("default_integer_bitwidth") is prev_width_setting + + +def test_options_context_error(): + with pytest.raises(ValueError): + with cudf.option_context("mode.pandas_compatible"): + pass + + with pytest.raises(ValueError): + with cudf.option_context("mode.pandas_compatible", 1, 2): + pass diff --git a/python/cudf/cudf/tests/series/accessors/test_dt.py b/python/cudf/cudf/tests/series/accessors/test_dt.py index 401f89aed65..a3f30b8a180 100644 --- a/python/cudf/cudf/tests/series/accessors/test_dt.py +++ b/python/cudf/cudf/tests/series/accessors/test_dt.py @@ -1,4 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import datetime +import zoneinfo import cupy as cp import numpy as np @@ -571,3 +573,158 @@ def test_dt_series_datetime_fields(data, field): base = getattr(pd_data.dt, field) test = getattr(gdf_data.dt, field) assert_eq(base, test, check_dtype=False) + + +@pytest.mark.parametrize("fmt", ["%Y-%m-%dT%H:%M%z", "%Y-%m-%dT%H:%M"]) +def test_strftime_tz_aware_as_utc(fmt): + data = [datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)] + cudf_pacific = cudf.Series(data).dt.tz_convert("US/Pacific") + pd_utc = pd.Series(data) + assert cudf_pacific.dtype != pd_utc.dtype + result = cudf_pacific.dt.strftime(fmt) + expected = pd_utc.dt.strftime(fmt) + assert_eq(result, expected) + + +def test_tz_localize(datetime_types_as_str, all_timezones): + s = cudf.Series(cudf.date_range("2001-01-01", "2001-01-02", freq="1s")) + s = s.astype(datetime_types_as_str) + s = s.dt.tz_localize(all_timezones) + assert isinstance(s.dtype, pd.DatetimeTZDtype) + assert s.dtype.unit == datetime_types_as_str.removeprefix( + "datetime64[" + ).removesuffix("]") + assert str(s.dtype.tz) == all_timezones + + +def test_localize_ambiguous(request, datetime_types_as_str, all_timezones): + request.applymarker( + pytest.mark.xfail( + condition=(all_timezones == "America/Metlakatla"), + reason="https://www.timeanddate.com/news/time/metlakatla-quits-dst.html", + ) + ) + s = cudf.Series( + [ + "2018-11-04 00:30:00", + "2018-11-04 01:00:00", + "2018-11-04 01:30:00", + "2018-11-04 02:00:00", + None, + "2018-11-04 02:30:00", + ], + dtype=datetime_types_as_str, + ) + expect = s.to_pandas().dt.tz_localize( + zoneinfo.ZoneInfo(all_timezones), ambiguous="NaT", nonexistent="NaT" + ) + got = s.dt.tz_localize(all_timezones) + assert_eq(expect, got) + + +def test_localize_nonexistent(request, datetime_types_as_str, all_timezones): + request.applymarker( + pytest.mark.xfail( + condition=all_timezones == "America/Grand_Turk", + 
reason="https://www.worldtimezone.com/dst_news/dst_news_turkscaicos03.html", + ) + ) + s = cudf.Series( + [ + "2018-03-11 01:30:00", + "2018-03-11 02:00:00", + "2018-03-11 02:30:00", + "2018-03-11 03:00:00", + None, + "2018-03-11 03:30:00", + ], + dtype=datetime_types_as_str, + ) + expect = s.to_pandas().dt.tz_localize( + zoneinfo.ZoneInfo(all_timezones), ambiguous="NaT", nonexistent="NaT" + ) + got = s.dt.tz_localize(all_timezones) + assert_eq(expect, got) + + +def test_delocalize(datetime_types_as_str, limited_timezones): + psr = pd.Series( + pd.date_range("2001-01-01", "2001-01-02", freq="1s") + ).astype(datetime_types_as_str) + sr = cudf.from_pandas(psr) + + expect = psr.dt.tz_localize(limited_timezones).dt.tz_localize(None) + got = sr.dt.tz_localize(limited_timezones).dt.tz_localize(None) + assert_eq(expect, got) + + +def test_delocalize_naive(): + # delocalizing naive datetimes should be a no-op + psr = pd.Series(["2001-01-01"], dtype="datetime64[ns]") + sr = cudf.from_pandas(psr) + + expect = psr.dt.tz_localize(None) + got = sr.dt.tz_localize(None) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "from_tz", ["Europe/London", "America/Chicago", "UTC"] +) +@pytest.mark.parametrize( + "to_tz", ["Europe/London", "America/Chicago", "UTC", None] +) +def test_convert(from_tz, to_tz): + from_tz = zoneinfo.ZoneInfo(from_tz) + if to_tz is not None: + to_tz = zoneinfo.ZoneInfo(to_tz) + ps = pd.Series(pd.date_range("2023-01-01", periods=3, freq="h")) + gs = cudf.from_pandas(ps) + ps = ps.dt.tz_localize(from_tz) + gs = gs.dt.tz_localize(from_tz) + expect = ps.dt.tz_convert(to_tz) + got = gs.dt.tz_convert(to_tz) + assert_eq(expect, got) + + +def test_convert_from_naive(): + gs = cudf.Series(cudf.date_range("2023-01-01", periods=3, freq="h")) + with pytest.raises(TypeError): + gs.dt.tz_convert("America/New_York") + + +@pytest.mark.parametrize( + "data,original_timezone,target_timezone", + [ + # DST transition: + (["2023-03-12 01:30:00"], "America/New_York", "America/Los_Angeles"), + # crossing the international date line: + (["2023-05-17 23:30:00"], "Pacific/Auckland", "America/Los_Angeles"), + # timezone with non-integer offset: + (["2023-05-17 12:00:00"], "Asia/Kolkata", "Australia/Eucla"), + # timezone with negative offset: + (["2023-05-17 09:00:00"], "America/Los_Angeles", "Pacific/Auckland"), + # conversion across multiple days: + (["2023-05-16 23:30:00"], "America/New_York", "Asia/Kolkata"), + # timezone with half-hour offset: + (["2023-05-17 12:00:00"], "Asia/Kolkata", "Australia/Adelaide"), + # timezone conversion with a timestamp in the future: + (["2025-01-01 00:00:00"], "America/New_York", "Europe/London"), + # timezone conversion with a timestamp in the past: + (["2000-01-01 12:00:00"], "Europe/Paris", "America/Los_Angeles"), + # timezone conversion with a timestamp at midnight: + (["2023-05-17 00:00:00"], "Asia/Tokyo", "Europe/Paris"), + ], +) +def test_convert_edge_cases(data, original_timezone, target_timezone): + original_timezone = zoneinfo.ZoneInfo(original_timezone) + target_timezone = zoneinfo.ZoneInfo(target_timezone) + ps = pd.Series(data, dtype="datetime64[s]").dt.tz_localize( + original_timezone + ) + gs = cudf.Series(data, dtype="datetime64[s]").dt.tz_localize( + original_timezone + ) + expect = ps.dt.tz_convert(target_timezone) + got = gs.dt.tz_convert(target_timezone) + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py b/python/cudf/cudf/tests/series/methods/test_astype.py index 18b13f7c3d1..609ad8efedd 100644 --- 
a/python/cudf/cudf/tests/series/methods/test_astype.py +++ b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -1,5 +1,8 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import datetime +import zoneinfo + import cupy as cp import numpy as np import pandas as pd @@ -531,6 +534,28 @@ def test_datetime_infer_format(data, timezone, datetime_types_as_str): sr.astype(datetime_types_as_str) +@pytest.mark.parametrize("unit", ["ns", "us"]) +def test_astype_aware_to_aware(unit): + ser = cudf.Series( + [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)] + ) + result = ser.astype(f"datetime64[{unit}, US/Pacific]") + expected = ser.to_pandas().astype(f"datetime64[{unit}, US/Pacific]") + zoneinfo_type = pd.DatetimeTZDtype( + expected.dtype.unit, zoneinfo.ZoneInfo(str(expected.dtype.tz)) + ) + expected = ser.astype(zoneinfo_type) + assert_eq(result, expected) + + +def test_astype_naive_to_aware_raises(): + ser = cudf.Series([datetime.datetime(2020, 1, 1)]) + with pytest.raises(TypeError): + ser.astype("datetime64[ns, UTC]") + with pytest.raises(TypeError): + ser.to_pandas().astype("datetime64[ns, UTC]") + + @pytest.mark.parametrize( "np_dtype,pd_dtype", [ diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/methods/test_convert_dtypes.py similarity index 90% rename from python/cudf/cudf/tests/series/test_conversion.py rename to python/cudf/cudf/tests/series/methods/test_convert_dtypes.py index 1d680d7860d..e3216d01e77 100644 --- a/python/cudf/cudf/tests/series/test_conversion.py +++ b/python/cudf/cudf/tests/series/methods/test_convert_dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. import pandas as pd import pytest @@ -42,7 +42,3 @@ def test_convert_integer_false_convert_floating_true(): .to_pandas(nullable=True) ) assert_eq(result, expected) - - -# Now write the same test, but construct a DataFrame -# as input instead of parametrizing: diff --git a/python/cudf/cudf/tests/series/methods/test_to_pandas.py b/python/cudf/cudf/tests/series/methods/test_to_pandas.py index 1768d6ccc0e..c49a4bfc7f3 100644 --- a/python/cudf/cudf/tests/series/methods/test_to_pandas.py +++ b/python/cudf/cudf/tests/series/methods/test_to_pandas.py @@ -13,6 +13,19 @@ from cudf.testing import assert_eq +def test_to_pandas_index_true_timezone(): + data = [ + "2008-05-12", + "2008-12-12", + "2009-05-12", + ] + dti = cudf.DatetimeIndex(data).tz_localize("UTC") + ser = cudf.Series(dti, index=list("abc")) + result = ser.to_pandas(index=True) + expected = pd.Series(pd.to_datetime(data, utc=True), index=list("abc")) + assert_eq(result, expected) + + @pytest.mark.parametrize( "sr_data,expected_psr", [ diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index c0aaf5a1b22..b5b03f4955b 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -2,6 +2,7 @@ import datetime import decimal import types +import zoneinfo import cupy as cp import numba.cuda @@ -1337,3 +1338,39 @@ def test_timezone_pyarrow_array(): result = cudf.Series(pa_array) expected = pa_array.to_pandas() assert_eq(result, expected) + + +@pytest.mark.parametrize( + "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"] +) +def test_pandas_compatible_non_zoneinfo_raises(klass): + pytz = pytest.importorskip("pytz") + tz = pytz.timezone("US/Pacific") + tz_aware_data = [pd.Timestamp("2020-01-01", 
tz="UTC").tz_convert(tz)] + pandas_obj = getattr(pd, klass)(tz_aware_data) + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + cudf.from_pandas(pandas_obj) + + +@pytest.mark.parametrize( + "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"] +) +def test_from_pandas_obj_tz_aware(klass): + tz = zoneinfo.ZoneInfo("US/Pacific") + tz_aware_data = [pd.Timestamp("2020-01-01", tz="UTC").tz_convert(tz)] + pandas_obj = getattr(pd, klass)(tz_aware_data) + result = cudf.from_pandas(pandas_obj) + expected = getattr(cudf, klass)(tz_aware_data) + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"] +) +def test_from_pandas_obj_tz_aware_unsupported(klass): + tz = datetime.timezone(datetime.timedelta(hours=1)) + tz_aware_data = [pd.Timestamp("2020-01-01", tz="UTC").tz_convert(tz)] + pandas_obj = getattr(pd, klass)(tz_aware_data) + with pytest.raises(NotImplementedError): + cudf.from_pandas(pandas_obj) diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py deleted file mode 100644 index 400777e46e1..00000000000 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. - -import datetime -import os -import zoneinfo - -import pandas as pd -import pytest - -import cudf -from cudf import date_range -from cudf.testing import assert_eq - - -def _get_all_zones(): - zones = [] - for root, dirs, files in os.walk("/usr/share/zoneinfo"): - for f in files: - zone_name = ("/".join([root, f])).lstrip("/usr/share/zoneinfo") - try: - _ = pd.DatetimeTZDtype("ns", zone_name) - except Exception: - continue - zones.append(zone_name) - return zones - - -# NOTE: _get_all_zones is a very large list; we likely do NOT want to -# use it for more than a handful of tests -@pytest.fixture(params=_get_all_zones()) -def zone_name(request): - return request.param - - -@pytest.fixture(params=["ns", "us", "ms", "s"]) -def unit(request): - return request.param - - -@pytest.fixture( - params=["America/New_York", "Asia/Tokyo", "CET", "Etc/GMT+1", "UTC"] -) -def tz(request): - return request.param - - -def test_tz_localize(unit, zone_name): - s = cudf.Series(date_range("2001-01-01", "2001-01-02", freq="1s")) - s = s.astype(f" Date: Fri, 22 Aug 2025 19:19:32 -0400 Subject: [PATCH 203/366] remove initial memset of values in parquet reader (#19643) In the parquet reader we pre-fill the output column data/offset buffers with zeroes to make sure we don't have any uninitialized reads. This is slow and seems to be unnecessary, because the parquet reader should write over that memory anyway when reading the data. However, the reader doesn't write to the value buffer's memory where there are nulls, and without zero-initialization, reading this memory leads to crashes. When do we read nulled values/offsets? You'd think it'd be never, but we do so when we: * check_non_empty_nulls() * Trying to debug-print a column to screen * Trying to compare/print any differences between expected/result in tests * element() reads AND USES invalid offsets * etc. So we still remove the pre-zeroing, but instead write zeroes in data_out in the parquet reader decode functions for the entries that are null. This makes it so we only spend time zeroing what we need to (which is probably sparse) rather than the entire column. 
Also, because we write the input int32 TIME_MILLIS to an output int64 DURATION_MILLISECONDS, we have to fill in the last 4 bytes with zeroes. The performance improvement on my machine is about 5% for most column types, though it is negligible for strings (because perf is dominated by the string copy, not the offset write). Further work can be done to try to remove the null mask zeroing as well. Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - https://github.com/nvdbaranec - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/19643 --- .../io/parquet/parquet_reader_input.cpp | 1 + .../cudf/detail/utilities/batched_memset.hpp | 3 + cpp/src/io/parquet/decode_fixed.cu | 27 ++++- cpp/src/io/parquet/decode_preprocess.cu | 3 + cpp/src/io/parquet/page_data.cu | 30 ++++- cpp/src/io/parquet/page_decode.cuh | 109 ++++++++++++++++++ cpp/src/io/parquet/page_delta_decode.cu | 39 +++++++ cpp/src/io/parquet/reader_impl.cpp | 3 + cpp/src/io/parquet/reader_impl_preprocess.cu | 12 +- 9 files changed, 214 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index d6ae56aa2e4..5bcae5cab23 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -488,5 +488,6 @@ NVBENCH_BENCH(BM_parquet_read_long_strings) .add_string_axis("io_type", {"DEVICE_BUFFER"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("data_size", {512 << 20}) .add_int64_power_of_two_axis("avg_string_length", nvbench::range(4, 16, 2)); // 16, 64, ... -> 64k diff --git a/cpp/include/cudf/detail/utilities/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp index bba50d61274..55e8821b59f 100644 --- a/cpp/include/cudf/detail/utilities/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -46,6 +47,8 @@ void batched_memset(cudf::host_span const> host_buffers, T const value, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); + // Copy buffer spans into device memory and then get sizes auto buffers = cudf::detail::make_device_uvector_async( host_buffers, stream, cudf::get_current_device_resource_ref()); diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 051dd79b5db..71c44f915fa 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -164,7 +164,10 @@ __device__ void decode_fixed_width_values( // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS // TIME_MILLIS is the only duration type stored as int32: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype - read_fixed_width_value_fast(s, sb, src_pos, static_cast(dst)); + auto const dst_ptr = static_cast(dst); + read_fixed_width_value_fast(s, sb, src_pos, dst_ptr); + // zero out most significant bytes + cuda::std::memset(dst_ptr + 1, 0, sizeof(int32_t)); } else if (s->ts_scale) { read_int64_timestamp(s, sb, src_pos, static_cast(dst)); } else { @@ -1148,9 +1151,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) // - valid_count: number of non-null values we have decoded so far. In each iteration of the // loop below, we look at the number of valid items (which could be all for non-nullable), // and valid_count is that running count.
- int processed_count = 0; - int valid_count = 0; - size_t string_output_offset = 0; + int processed_count = 0; + int valid_count = 0; + size_t string_output_offset = 0; + int const init_valid_map_offset = s->nesting_info[s->col.max_nesting_depth - 1].valid_map_offset; // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists) auto const skipped_leaf_values = s->page.skipped_leaf_values; @@ -1253,6 +1257,21 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) valid_count = next_valid_count; } + // Zero-fill null positions after decoding valid values + if (should_process_nulls) { + uint32_t const dtype_len = has_strings_t ? sizeof(cudf::size_type) : s->dtype_len; + int const num_values = [&]() { + if constexpr (has_lists_t) { + auto const& ni = s->nesting_info[s->col.max_nesting_depth - 1]; + return ni.valid_map_offset - init_valid_map_offset; + } else { + return s->num_rows; + } + }(); + zero_fill_null_positions_shared( + s, dtype_len, init_valid_map_offset, num_values, t); + } + if constexpr (has_strings_t) { // For large strings, update the initial string buffer offset to be used during large string // column construction. Otherwise, convert string sizes to final offsets. diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 756f334ebe5..d0da5006780 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -18,6 +18,7 @@ #include "io/utilities/column_buffer.hpp" #include "page_decode.cuh" +#include #include #include @@ -518,6 +519,8 @@ void compute_page_sizes(cudf::detail::hostdevice_span pages, int level_type_size, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); + dim3 dim_block(preprocess_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index cc215e272c0..9f391588c1b 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -105,6 +105,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) __shared__ level_t rep[rolling_buf_size]; // circular buffer of repetition level values __shared__ level_t def[rolling_buf_size]; // circular buffer of definition level values + // Capture initial valid_map_offset before any processing that might modify it + int const init_valid_map_offset = s->nesting_info[s->col.max_nesting_depth - 1].valid_map_offset; + // skipped_leaf_values will always be 0 for flat hierarchies. 
uint32_t skipped_leaf_values = s->page.skipped_leaf_values; while (s->error == 0 && @@ -216,6 +219,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } block.sync(); } + + // Zero-fill null positions after decoding valid values + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const& ni = s->nesting_info[leaf_level_index]; + if (ni.valid_map != nullptr) { + int const num_values = ni.valid_map_offset - init_valid_map_offset; + zero_fill_null_positions_shared( + s, s->dtype_len, init_valid_map_offset, num_values, static_cast(block.thread_rank())); + } + if (block.thread_rank() == 0 and s->error != 0) { set_error(s->error, error_code); } } @@ -308,6 +321,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) PageNestingDecodeInfo* nesting_info_base = s->nesting_info; + // Capture initial valid_map_offset before any processing that might modify it + int const init_valid_map_offset = s->nesting_info[s->col.max_nesting_depth - 1].valid_map_offset; + if (s->dict_base) { out_warp_id = (s->dict_bits > 0) ? 2 : 1; } else { @@ -447,7 +463,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS // TIME_MILLIS is the only duration type stored as int32: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype - read_fixed_width_value_fast(s, sb, val_src_pos, static_cast(dst)); + auto const dst_ptr = static_cast(dst); + read_fixed_width_value_fast(s, sb, val_src_pos, dst_ptr); + // zero out most significant bytes + cuda::std::memset(dst_ptr + 1, 0, sizeof(int32_t)); } else if (s->ts_scale) { read_int64_timestamp(s, sb, val_src_pos, static_cast(dst)); } else { @@ -464,6 +483,15 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } __syncthreads(); } + + // Zero-fill null positions after decoding valid values + auto const& ni = s->nesting_info[s->col.max_nesting_depth - 1]; + if (ni.valid_map != nullptr) { + int const num_values = ni.valid_map_offset - init_valid_map_offset; + zero_fill_null_positions_shared( + s, s->dtype_len, init_valid_map_offset, num_values, static_cast(block.thread_rank())); + } + if (block.thread_rank() == 0 and s->error != 0) { set_error(s->error, error_code); } } diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index 6c987230709..0ab76a5fd4b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -1506,4 +1506,113 @@ inline __device__ bool setup_local_page_info(page_state_s* const s, return true; } +/** + * @brief Zero-fill null positions in output data using parallel per-validity-block processing + * + * This function processes the validity bitmap and zero-fills all positions in the output + * data that correspond to null values. It uses a parallel approach where each thread + * handles one 32-bit validity block at a time, looping only over the zero bits (null positions) + * within that block. 
+ * + * @tparam block_size CUDA block size for the kernel + * @param s Page state containing all necessary information + * @param dtype_len Size of each data element in bytes + * @param valid_map_offset Starting bit offset in the validity map + * @param num_values Number of values to process + * @param t Thread index within the block + */ +template +__device__ void zero_fill_null_positions_shared( + page_state_s* s, uint32_t dtype_len, int valid_map_offset, int num_values, int t) +{ + auto const block = cg::this_thread_block(); + auto const warp = cg::tiled_partition(block); + + // nesting level that is storing actual leaf values + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto const& ni = s->nesting_info[leaf_level_index]; + + // Check if we have nulls to fill + if ((ni.valid_map == nullptr) || (num_values == 0)) { return; } + + auto const data_out = ni.data_out; + + constexpr int bits_per_mask = cudf::detail::size_in_bits(); + using cudf::detail::warp_size; + constexpr int num_warps = block_size / warp_size; + + // Calculate the range of validity blocks we need to process + int const start_bit_idx = valid_map_offset; + int const end_bit_idx = valid_map_offset + num_values; + int const start_block = start_bit_idx / bits_per_mask; + int const end_block = cudf::util::div_rounding_up_safe(end_bit_idx, bits_per_mask); + int const num_blocks = end_block - start_block; + + int const warp_id = t / warp_size; + int const lane_id = warp.thread_rank(); + + // Helper lambda for warp-parallel bit processing + auto process_block_parallel = [&](int block_idx) { + static_assert(bits_per_mask == warp_size, "if 64bit mask, use 2 warps per mask"); + + cudf::bitmask_type validity_word = ni.valid_map[block_idx]; + int const block_start_bit = block_idx * bits_per_mask; + + // Each thread in the warp processes one bit + int const bit_idx = block_start_bit + lane_id; + int const dst_pos = bit_idx - valid_map_offset; + + // Check if this bit is within our range + bool in_range = (bit_idx >= start_bit_idx && bit_idx < end_bit_idx); + + // Check if this bit is null (0 in validity mask) + bool const is_null = not cudf::bit_is_set(&validity_word, lane_id); + + if (in_range && is_null) { + void* const dst = data_out + (static_cast(dst_pos) * dtype_len); + cuda::std::memset(dst, 0, dtype_len); + } + }; + + // Helper lambda for sequential bit processing (fallback for remaining blocks) + auto process_block_sequential = [&](int block_idx) { + cudf::bitmask_type validity_word = ni.valid_map[block_idx]; + cudf::bitmask_type null_positions = ~validity_word; + int const dst_pos_first_bit = block_idx * bits_per_mask - valid_map_offset; + + while (null_positions != 0) { + int const bit_pos = __ffs(null_positions) - 1; + int const dst_pos = dst_pos_first_bit + bit_pos; + + void* const dst = data_out + (static_cast(dst_pos) * dtype_len); + cuda::std::memset(dst, 0, dtype_len); + + null_positions &= (null_positions - 1); + } + }; + + // Phase 1: Assign specific blocks to warps for warp-parallel processing + if (warp_id == 0) { + // Warp 0: Process first block + process_block_parallel(start_block); + } else if (warp_id == 1 && num_blocks > 1) { + // Warp 1: Process last block (if different from first) + process_block_parallel(end_block - 1); + } else if (warp_id >= 2) { + // Warps 2+: Process additional blocks from the beginning + int const block_idx = start_block + (warp_id - 1); + if (block_idx < (end_block - 1)) { process_block_parallel(block_idx); } + } + + // Phase 2: All warps cooperatively process 
remaining middle blocks + auto const last_block_processed = static_cast(num_blocks > 1); + int const remaining_start = start_block + num_warps - last_block_processed; + int const remaining_end = end_block - last_block_processed; + for (int block_idx = remaining_start + t; block_idx < remaining_end; block_idx += block_size) { + process_block_sequential(block_idx); + } + + __syncthreads(); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 49f21bb2a15..660f7f96876 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -342,6 +342,9 @@ CUDF_KERNEL void __launch_bounds__(decode_delta_binary_block_size) // Must be evaluated after setup_local_page_info bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // Capture initial valid_map_offset before any processing that might modify it + int const init_valid_map_offset = s->nesting_info[s->col.max_nesting_depth - 1].valid_map_offset; + // Write list offsets and exit if the page does not need to be decoded if (not page_mask[page_idx]) { auto& page = pages[page_idx]; @@ -434,6 +437,14 @@ CUDF_KERNEL void __launch_bounds__(decode_delta_binary_block_size) block.sync(); } + // Zero-fill null positions after decoding valid values + auto const& ni = s->nesting_info[s->col.max_nesting_depth - 1]; + if (ni.valid_map != nullptr) { + int const num_values = ni.valid_map_offset - init_valid_map_offset; + zero_fill_null_positions_shared( + s, s->dtype_len, init_valid_map_offset, num_values, static_cast(block.thread_rank())); + } + if (block.thread_rank() == 0 and s->error != 0) { set_error(s->error, error_code); } } @@ -488,6 +499,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // Capture initial valid_map_offset before any processing that might modify it + int const init_valid_map_offset = s->nesting_info[s->col.max_nesting_depth - 1].valid_map_offset; + // Write list/string offsets and exit if the page does not need to be decoded if (not page_mask[page_idx]) { auto page = &pages[page_idx]; @@ -615,6 +629,17 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) block.sync(); } + // Zero-fill null positions after decoding valid values + auto const& ni = s->nesting_info[leaf_level_index]; + if (ni.valid_map != nullptr) { + int const num_values = ni.valid_map_offset - init_valid_map_offset; + zero_fill_null_positions_shared(s, + sizeof(size_type), + init_valid_map_offset, + num_values, + static_cast(block.thread_rank())); + } + // For large strings, update the initial string buffer offset to be used during large string // column construction. Otherwise, convert string sizes to final offsets. 
if (s->col.is_large_string_col) { @@ -684,6 +709,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // Capture initial valid_map_offset before any processing that might modify it + int const init_valid_map_offset = s->nesting_info[s->col.max_nesting_depth - 1].valid_map_offset; + // Write list/string offsets and exit if the page does not need to be decoded if (not page_mask[page_idx]) { auto page = &pages[page_idx]; @@ -806,6 +834,17 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) block.sync(); } + // Zero-fill null positions after decoding valid values + auto const& ni = nesting_info_base[leaf_level_index]; + if (ni.valid_map != nullptr) { + int const num_values = ni.valid_map_offset - init_valid_map_offset; + zero_fill_null_positions_shared(s, + sizeof(size_type), + init_valid_map_offset, + num_values, + static_cast(block.thread_rank())); + } + // For large strings, update the initial string buffer offset to be used during large string // column construction. Otherwise, convert string sizes to final offsets. if (s->col.is_large_string_col) { diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 96aa416d27b..ccc75fbd293 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -18,6 +18,7 @@ #include "error.hpp" +#include #include #include #include @@ -49,6 +50,8 @@ inline bool is_treat_fixed_length_as_string(std::optional const& lo void reader_impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num_rows) { + CUDF_FUNC_RANGE(); + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 1c5c1f2641c..d617b13165d 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -497,6 +497,8 @@ void reader_impl::generate_list_column_row_counts(is_estimate_row_counts is_esti void reader_impl::preprocess_subpass_pages(read_mode mode, size_t chunk_read_limit) { + CUDF_FUNC_RANGE(); + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; @@ -639,6 +641,8 @@ void reader_impl::preprocess_subpass_pages(read_mode mode, size_t chunk_read_lim void reader_impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num_rows) { + CUDF_FUNC_RANGE(); + auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; @@ -665,8 +669,6 @@ void reader_impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num_ // buffers if they are not part of a list hierarchy. mark down // if we have any list columns that need further processing. 
bool has_lists = false; - // Casting to std::byte since data buffer pointer is void * - std::vector> memset_bufs; // Validity Buffer is a uint32_t pointer std::vector> nullmask_bufs; @@ -694,8 +696,6 @@ void reader_impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num_ std::overflow_error); out_buf.create_with_mask( out_buf_size, cudf::mask_state::UNINITIALIZED, false, _stream, _mr); - memset_bufs.emplace_back(static_cast(out_buf.data()), - out_buf.data_size()); nullmask_bufs.emplace_back( out_buf.null_mask(), cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) / @@ -813,8 +813,6 @@ void reader_impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num_ // we're going to start null mask as all valid and then turn bits off if necessary out_buf.create_with_mask( buffer_size, cudf::mask_state::UNINITIALIZED, false, _stream, _mr); - memset_bufs.emplace_back(static_cast(out_buf.data()), - out_buf.data_size()); nullmask_bufs.emplace_back( out_buf.null_mask(), cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) / @@ -824,8 +822,6 @@ void reader_impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num_ } } - cudf::detail::batched_memset( - memset_bufs, static_cast(0), _stream); // Need to set null mask bufs to all high bits cudf::detail::batched_memset( nullmask_bufs, std::numeric_limits::max(), _stream); From 5be7bebf6d650d4606e545c9f90908fef050446a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 25 Aug 2025 10:33:46 -0400 Subject: [PATCH 204/366] Add examples of null handling to doxygen for cudf::rank (#19774) Updates the doxygen for `cudf::rank` to illustrate how the null-policy, null-order, and order parameter combinations behave for a given input. Closes #19772 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Murray (https://github.com/Matt711) - Bradley Dice (https://github.com/bdice) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/19774 --- cpp/include/cudf/sorting.hpp | 47 ++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 532cec53e8f..432d8b6f332 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -177,28 +177,55 @@ std::unique_ptr
stable_sort_by_key( * value starts from 1. * * @code{.pseudo} - * input = { 3, 4, 5, 4, 1, 2} - * Result for different rank_method are + * Using default order::ASCENDING + * input = {3, 4, 5, 4, 1, 2} + * Results for different rank_methods * FIRST = {3, 4, 6, 5, 1, 2} * AVERAGE = {3, 4.5, 6, 4.5, 1, 2} * MIN = {3, 4, 6, 4, 1, 2} * MAX = {3, 5, 6, 5, 1, 2} * DENSE = {3, 4, 5, 4, 1, 2} + * + * For null_policy::INCLUDE, null_order::AFTER + * input = {3, 4, null, 4, 1, 2} + * The results are the same as above. + * + * For null_policy::INCLUDE, null_order::BEFORE + * input = {3, 4, null, 4, 1, 2} + * Results for different rank_methods + * FIRST = {4, 5, 1, 6, 2, 3} + * AVERAGE = {4, 5.5, 1, 5.5, 2, 3} + * MIN = {4, 5, 1, 5, 2, 3} + * MAX = {4, 6, 1, 6, 2, 3} + * DENSE = {4, 5, 1, 5, 2, 3} + * + * For null_policy::EXCLUDE (null_order::AFTER only) + * input = {3, 4, null, 4, 1, 2} + * Results for different rank_methods + * FIRST = {3, 4, null, 5, 1, 2} + * AVERAGE = {3, 4.5, null, 4.5, 1, 2} + * MIN = {3, 4, null, 4, 1, 2} + * MAX = {3, 5, null, 5, 1, 2} + * DENSE = {3, 4, null, 4, 1, 2} * @endcode * + * For null_policy::EXCLUDE with null_order::BEFORE, using column_order::ASCENDING + * will result in undefined behavior. Likewise for null_policy::EXCLUDE with + * null_order::AFTER and column_order::DESCENDING. + * + * The output column type will be `double` when `method=rank_method::AVERAGE` or `percentage=True` + * and `size_type` otherwise. + * * @param input The column to rank * @param method The ranking method used for tie breaking (same values) * @param column_order The desired sort order for ranking - * @param null_handling flag to include nulls during ranking. If nulls are not - * included, corresponding rank will be null. - * @param null_precedence The desired order of null compared to other elements - * for column - * @param percentage flag to convert ranks to percentage in range (0,1] + * @param null_handling Flag to include nulls during ranking. + * If nulls are excluded, the corresponding rank will be null. + * @param null_precedence The desired order of null rows compared to other elements + * @param percentage Flag to convert ranks to percentage in range (0,1] * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return A column of containing the rank of the each element of the column of `input`. 
The output - * column type will be `size_type`column by default or else `double` when - * `method=rank_method::AVERAGE` or `percentage=True` + * @return A column of containing the rank of the each element of the column of `input` */ std::unique_ptr rank( column_view const& input, From c79567ab24d2b53d84768a7b9ccb9edf47206e60 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Aug 2025 10:02:56 -0700 Subject: [PATCH 205/366] Move test_buffer/column/column_accesor/cuda_apply.py to new cudf classic testing directory (#19737) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19737 --- .../methods/test_apply_chunks.py} | 166 +++--------------- .../dataframe/methods/test_apply_rows.py | 135 ++++++++++++++ .../tests/{ => input_output}/test_dlpack.py | 0 .../cudf/tests/private_objects/__init__.py | 0 .../{ => private_objects}/test_buffer.py | 0 .../{ => private_objects}/test_column.py | 89 +++------- .../test_column_accessor.py | 0 .../{ => private_objects}/test_compile_udf.py | 5 +- .../test_extension_compilation.py | 0 .../test_memory_records_report.py} | 0 .../{ => private_objects}/test_nrt_stats.py | 0 .../cudf/tests/series/methods/test_astype.py | 23 +++ 12 files changed, 212 insertions(+), 206 deletions(-) rename python/cudf/cudf/tests/{test_cuda_apply.py => dataframe/methods/test_apply_chunks.py} (50%) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_apply_rows.py rename python/cudf/cudf/tests/{ => input_output}/test_dlpack.py (100%) create mode 100644 python/cudf/cudf/tests/private_objects/__init__.py rename python/cudf/cudf/tests/{ => private_objects}/test_buffer.py (100%) rename python/cudf/cudf/tests/{ => private_objects}/test_column.py (87%) rename python/cudf/cudf/tests/{ => private_objects}/test_column_accessor.py (100%) rename python/cudf/cudf/tests/{ => private_objects}/test_compile_udf.py (96%) rename python/cudf/cudf/tests/{ => private_objects}/test_extension_compilation.py (100%) rename python/cudf/cudf/tests/{test_performance_tracking.py => private_objects/test_memory_records_report.py} (100%) rename python/cudf/cudf/tests/{ => private_objects}/test_nrt_stats.py (100%) diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/dataframe/methods/test_apply_chunks.py similarity index 50% rename from python/cudf/cudf/tests/test_cuda_apply.py rename to python/cudf/cudf/tests/dataframe/methods/test_apply_chunks.py index f7b0af9e51a..f215a29de89 100644 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_apply_chunks.py @@ -1,105 +1,11 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. - -""" -Test method that apply GPU kernel to a frame. -""" +# Copyright (c) 2025, NVIDIA CORPORATION. import numpy as np import pytest from numba import cuda -import cudf from cudf import DataFrame -from cudf.core.column import column from cudf.testing import assert_eq -from cudf.testing._utils import gen_rand_series - - -def _kernel_multiply(a, b, out): - # numba doesn't support zip(..., strict=True), so we must tell ruff to ignore it. 
- for i, (x, y) in enumerate(zip(a, b)): # noqa: B905 - out[i] = x * y - - -@pytest.mark.parametrize("dtype", [np.dtype("float32"), np.dtype("float64")]) -@pytest.mark.parametrize("has_nulls", [False, True]) -@pytest.mark.parametrize("pessimistic", [False, True]) -def test_dataframe_apply_rows(dtype, has_nulls, pessimistic): - count = 1000 - gdf_series_a = gen_rand_series(dtype, count, has_nulls=has_nulls) - gdf_series_b = gen_rand_series(dtype, count, has_nulls=has_nulls) - gdf_series_c = gen_rand_series(dtype, count, has_nulls=has_nulls) - - if pessimistic: - # pessimistically combine the null masks - gdf_series_expected = gdf_series_a * gdf_series_b - else: - # optimistically ignore the null masks - a = cudf.Series._from_column( - column.build_column(gdf_series_a.data, dtype) - ) - b = cudf.Series._from_column( - column.build_column(gdf_series_b.data, dtype) - ) - gdf_series_expected = a * b - - df_expected = cudf.DataFrame( - { - "a": gdf_series_a, - "b": gdf_series_b, - "c": gdf_series_c, - "out": gdf_series_expected, - } - ) - - df_original = cudf.DataFrame( - {"a": gdf_series_a, "b": gdf_series_b, "c": gdf_series_c} - ) - - with pytest.warns(FutureWarning): - df_actual = df_original.apply_rows( - _kernel_multiply, - ["a", "b"], - {"out": dtype}, - {}, - pessimistic_nulls=pessimistic, - ) - - assert_eq(df_expected, df_actual) - - -def test_df_apply_rows(): - nelem = 20 - - def kernel(in1, in2, in3, out1, out2, extra1, extra2): - for i, (x, y, z) in enumerate(zip(in1, in2, in3)): # noqa: B905 - out1[i] = extra2 * x - extra1 * y - out2[i] = y - extra1 * z - - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) - - extra1 = 2.3 - extra2 = 3.4 - - expect_out1 = extra2 * in1 - extra1 * in2 - expect_out2 = in2 - extra1 * in3 - - with pytest.warns(FutureWarning): - outdf = df.apply_rows( - kernel, - incols=["in1", "in2", "in3"], - outcols=dict(out1=np.float64, out2=np.float64), - kwargs=dict(extra1=extra1, extra2=extra2), - ) - - got_out1 = outdf["out1"].to_numpy() - got_out2 = outdf["out2"].to_numpy() - - np.testing.assert_array_almost_equal(got_out1, expect_out1) - np.testing.assert_array_almost_equal(got_out2, expect_out2) @pytest.mark.parametrize("chunksize", [1, 4, 23]) @@ -111,10 +17,17 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): out1[i] = extra2 * x - extra1 * y + z out2[i] = i - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) + in1 = np.arange(nelem) + in2 = np.arange(nelem) + in3 = np.arange(nelem) + + df = DataFrame( + { + "in1": in1, + "in2": in2, + "in3": in3, + } + ) extra1 = 2.3 extra2 = 3.4 @@ -228,37 +141,6 @@ def kernel(in1, in2, in3, out1, out2, extra1, extra2): np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) -def test_df_apply_rows_incols_mapping(): - nelem = 20 - - def kernel(x, y, z, out1, out2, extra1, extra2): - for i, (a, b, c) in enumerate(zip(x, y, z)): # noqa: B905 - out1[i] = extra2 * a - extra1 * b - out2[i] = b - extra1 * c - - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) - - extra1 = 2.3 - extra2 = 3.4 - - expected_out = DataFrame() - expected_out["out1"] = extra2 * in1 - extra1 * in2 - expected_out["out2"] = in2 - extra1 * in3 - - with pytest.warns(FutureWarning): - outdf = df.apply_rows( - kernel, - incols={"in1": "x", "in2": "y", "in3": "z"}, - outcols=dict(out1=np.float64, 
out2=np.float64), - kwargs=dict(extra1=extra1, extra2=extra2), - ) - - assert_eq(outdf[["out1", "out2"]], expected_out) - - @pytest.mark.parametrize("chunksize", [1, 4, 23]) def test_df_apply_chunks_incols_mapping(chunksize): nelem = 20 @@ -268,17 +150,27 @@ def kernel(q, p, r, out1, out2, extra1, extra2): out1[i] = extra2 * a - extra1 * b + c out2[i] = i - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) + in1 = np.arange(nelem) + in2 = np.arange(nelem) + in3 = np.arange(nelem) + + df = DataFrame( + { + "in1": in1, + "in2": in2, + "in3": in3, + } + ) extra1 = 2.3 extra2 = 3.4 - expected_out = DataFrame() - expected_out["out1"] = extra2 * in1 - extra1 * in2 + in3 - expected_out["out2"] = np.arange(len(df)) % chunksize + expected_out = DataFrame( + { + "out1": extra2 * in1 - extra1 * in2 + in3, + "out2": np.arange(len(df)) % chunksize, + } + ) outdf = df.apply_chunks( kernel, diff --git a/python/cudf/cudf/tests/dataframe/methods/test_apply_rows.py b/python/cudf/cudf/tests/dataframe/methods/test_apply_rows.py new file mode 100644 index 00000000000..dbdec9e5e46 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_apply_rows.py @@ -0,0 +1,135 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf +from cudf import DataFrame +from cudf.core.column import column +from cudf.testing import assert_eq +from cudf.testing._utils import gen_rand_series + + +def _kernel_multiply(a, b, out): + # numba doesn't support zip(..., strict=True), so we must tell ruff to ignore it. + for i, (x, y) in enumerate(zip(a, b)): # noqa: B905 + out[i] = x * y + + +@pytest.mark.parametrize("dtype", [np.dtype("float32"), np.dtype("float64")]) +@pytest.mark.parametrize("has_nulls", [False, True]) +@pytest.mark.parametrize("pessimistic", [False, True]) +def test_dataframe_apply_rows(dtype, has_nulls, pessimistic): + count = 1000 + gdf_series_a = gen_rand_series(dtype, count, has_nulls=has_nulls) + gdf_series_b = gen_rand_series(dtype, count, has_nulls=has_nulls) + gdf_series_c = gen_rand_series(dtype, count, has_nulls=has_nulls) + + if pessimistic: + # pessimistically combine the null masks + gdf_series_expected = gdf_series_a * gdf_series_b + else: + # optimistically ignore the null masks + a = cudf.Series._from_column( + column.build_column(gdf_series_a.data, dtype) + ) + b = cudf.Series._from_column( + column.build_column(gdf_series_b.data, dtype) + ) + gdf_series_expected = a * b + + df_expected = cudf.DataFrame( + { + "a": gdf_series_a, + "b": gdf_series_b, + "c": gdf_series_c, + "out": gdf_series_expected, + } + ) + + df_original = cudf.DataFrame( + {"a": gdf_series_a, "b": gdf_series_b, "c": gdf_series_c} + ) + + with pytest.warns(FutureWarning): + df_actual = df_original.apply_rows( + _kernel_multiply, + ["a", "b"], + {"out": dtype}, + {}, + pessimistic_nulls=pessimistic, + ) + + assert_eq(df_expected, df_actual) + + +def test_df_apply_rows(): + nelem = 20 + + def kernel(in1, in2, in3, out1, out2, extra1, extra2): + for i, (x, y, z) in enumerate(zip(in1, in2, in3)): # noqa: B905 + out1[i] = extra2 * x - extra1 * y + out2[i] = y - extra1 * z + + in1 = np.arange(nelem) + in2 = np.arange(nelem) + in3 = np.arange(nelem) + + df = DataFrame( + { + "in1": in1, + "in2": in2, + "in3": in3, + } + ) + + extra1 = 2.3 + extra2 = 3.4 + + expect_out1 = extra2 * in1 - extra1 * in2 + expect_out2 = in2 - extra1 * in3 + + with pytest.warns(FutureWarning): + outdf = df.apply_rows( + kernel, + 
incols=["in1", "in2", "in3"], + outcols=dict(out1=np.float64, out2=np.float64), + kwargs=dict(extra1=extra1, extra2=extra2), + ) + + got_out1 = outdf["out1"].to_numpy() + got_out2 = outdf["out2"].to_numpy() + + np.testing.assert_array_almost_equal(got_out1, expect_out1) + np.testing.assert_array_almost_equal(got_out2, expect_out2) + + +def test_df_apply_rows_incols_mapping(): + nelem = 20 + + def kernel(x, y, z, out1, out2, extra1, extra2): + for i, (a, b, c) in enumerate(zip(x, y, z)): # noqa: B905 + out1[i] = extra2 * a - extra1 * b + out2[i] = b - extra1 * c + + df = DataFrame() + df["in1"] = in1 = np.arange(nelem) + df["in2"] = in2 = np.arange(nelem) + df["in3"] = in3 = np.arange(nelem) + + extra1 = 2.3 + extra2 = 3.4 + + expected_out = DataFrame() + expected_out["out1"] = extra2 * in1 - extra1 * in2 + expected_out["out2"] = in2 - extra1 * in3 + + with pytest.warns(FutureWarning): + outdf = df.apply_rows( + kernel, + incols={"in1": "x", "in2": "y", "in3": "z"}, + outcols=dict(out1=np.float64, out2=np.float64), + kwargs=dict(extra1=extra1, extra2=extra2), + ) + + assert_eq(outdf[["out1", "out2"]], expected_out) diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/input_output/test_dlpack.py similarity index 100% rename from python/cudf/cudf/tests/test_dlpack.py rename to python/cudf/cudf/tests/input_output/test_dlpack.py diff --git a/python/cudf/cudf/tests/private_objects/__init__.py b/python/cudf/cudf/tests/private_objects/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/private_objects/test_buffer.py similarity index 100% rename from python/cudf/cudf/tests/test_buffer.py rename to python/cudf/cudf/tests/private_objects/test_buffer.py diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/private_objects/test_column.py similarity index 87% rename from python/cudf/cudf/tests/test_column.py rename to python/cudf/cudf/tests/private_objects/test_column.py index 15988673bcd..6ed3111b587 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/private_objects/test_column.py @@ -5,31 +5,17 @@ import pandas as pd import pyarrow as pa import pytest +from numba import cuda import cudf from cudf.core.column.column import _can_values_be_equal, as_column from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal -from cudf.utils import dtypes as dtypeutils - -dtypes = sorted( - list( - dtypeutils.ALL_TYPES - - { - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "timedelta64[s]", - "timedelta64[ms]", - "timedelta64[us]", - } - ) -) -@pytest.fixture(params=dtypes, ids=dtypes) -def pandas_input(request): - dtype = request.param +@pytest.fixture +def pandas_input(all_supported_types_as_str): + dtype = all_supported_types_as_str rng = np.random.default_rng(seed=0) size = 100 @@ -72,7 +58,7 @@ def str_host_view(list_of_str, to_dtype): @pytest.mark.parametrize("offset", [0, 1, 15]) @pytest.mark.parametrize("size", [50, 10, 0]) def test_column_offset_and_size(pandas_input, offset, size): - col = cudf.core.column.as_column(pandas_input) + col = as_column(pandas_input) col = cudf.core.column.build_column( data=col.base_data, dtype=col.dtype, @@ -141,7 +127,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False): @pytest.mark.parametrize("offset", [0, 1, 15]) @pytest.mark.parametrize("size", [50, 10, 0]) def test_column_slicing(pandas_input, offset, size): - col = 
cudf.core.column.as_column(pandas_input) + col = as_column(pandas_input) column_slicing_test(col, offset, size) @@ -154,9 +140,7 @@ def test_column_slicing(pandas_input, offset, size): [cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype], ) def test_decimal_column_slicing(offset, size, precision, scale, decimal_type): - col = cudf.core.column.as_column( - pd.Series(np.random.default_rng(seed=0).random(1000)) - ) + col = as_column(pd.Series(np.random.default_rng(seed=0).random(1000))) col = col.astype(decimal_type(precision, scale)) column_slicing_test(col, offset, size, True) @@ -173,7 +157,7 @@ def test_column_series_multi_dim(data): cudf.Series(data) with pytest.raises(ValueError): - cudf.core.column.as_column(data) + as_column(data) @pytest.mark.parametrize( @@ -195,14 +179,13 @@ def test_column_mixed_dtype(data, error): cudf.Series(data) -@pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize( "scalar", [np.nan, pd.Timedelta(days=1), pd.Timestamp(2020, 1, 1)], ids=repr, ) -@pytest.mark.parametrize("size", [1, 10]) -def test_as_column_scalar_with_nan(nan_as_null, scalar, size): +def test_as_column_scalar_with_nan(nan_as_null, scalar): + size = 5 expected = ( cudf.Series([scalar] * size, nan_as_null=nan_as_null) .dropna() @@ -221,23 +204,20 @@ def test_as_column_scalar_with_nan(nan_as_null, scalar, size): @pytest.mark.parametrize("data", [[1.1, 2.2, 3.3, 4.4], [1, 2, 3, 4]]) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -def test_column_series_cuda_array_dtype(data, dtype): - psr = pd.Series(np.asarray(data), dtype=dtype) - sr = cudf.Series(cp.asarray(data), dtype=dtype) +def test_column_series_cuda_array_dtype(data, float_types_as_str): + psr = pd.Series(np.asarray(data, dtype=float_types_as_str)) + sr = cudf.Series(cp.asarray(data, dtype=float_types_as_str)) assert_eq(psr, sr) - psr = pd.Series(data, dtype=dtype) - sr = cudf.Series(data, dtype=dtype) + psr = pd.Series(data, dtype=float_types_as_str) + sr = cudf.Series(data, dtype=float_types_as_str) assert_eq(psr, sr) def test_column_zero_length_slice(): # see https://github.com/rapidsai/cudf/pull/4777 - from numba import cuda - x = cudf.DataFrame({"a": [1]}) the_column = x[1:]["a"]._column @@ -251,20 +231,16 @@ def test_column_chunked_array_creation(): pyarrow_array = pa.array([1, 2, 3] * 1000) chunked_array = pa.chunked_array(pyarrow_array) - actual_column = cudf.core.column.as_column( - chunked_array, dtype=np.dtype(np.float64) - ) - expected_column = cudf.core.column.as_column( - pyarrow_array, dtype=np.dtype(np.float64) - ) + actual_column = as_column(chunked_array, dtype=np.dtype(np.float64)) + expected_column = as_column(pyarrow_array, dtype=np.dtype(np.float64)) assert_eq( cudf.Series._from_column(actual_column), cudf.Series._from_column(expected_column), ) - actual_column = cudf.core.column.as_column(chunked_array) - expected_column = cudf.core.column.as_column(pyarrow_array) + actual_column = as_column(chunked_array) + expected_column = as_column(pyarrow_array) assert_eq( cudf.Series._from_column(actual_column), @@ -418,8 +394,8 @@ def test_column_view_string_slice(slc): ], ) def test_as_column_buffer(box, data): - expected = cudf.core.column.as_column(data) - actual_column = cudf.core.column.as_column( + expected = as_column(data) + actual_column = as_column( cudf.core.buffer.as_buffer(box(data)), dtype=data.dtype ) assert_eq( @@ -530,29 +506,6 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): np.testing.assert_array_equal(expect_mask, got_mask) 
-@pytest.mark.parametrize( - "alias,expect_dtype", - [ - ("UInt8", "uint8"), - ("UInt16", "uint16"), - ("UInt32", "uint32"), - ("UInt64", "uint64"), - ("Int8", "int8"), - ("Int16", "int16"), - ("Int32", "int32"), - ("Int64", "int64"), - ("boolean", "bool"), - ("Float32", "float32"), - ("Float64", "float64"), - ], -) -def test_astype_with_aliases(alias, expect_dtype): - pd_data = pd.Series([1, 2, 0]) - gd_data = cudf.Series.from_pandas(pd_data) - - assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) - - @pytest.mark.parametrize( "left, right, expected", [ diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/private_objects/test_column_accessor.py similarity index 100% rename from python/cudf/cudf/tests/test_column_accessor.py rename to python/cudf/cudf/tests/private_objects/test_column_accessor.py diff --git a/python/cudf/cudf/tests/test_compile_udf.py b/python/cudf/cudf/tests/private_objects/test_compile_udf.py similarity index 96% rename from python/cudf/cudf/tests/test_compile_udf.py rename to python/cudf/cudf/tests/private_objects/test_compile_udf.py index b12200c92dd..8d87c323786 100644 --- a/python/cudf/cudf/tests/test_compile_udf.py +++ b/python/cudf/cudf/tests/private_objects/test_compile_udf.py @@ -1,12 +1,15 @@ # Copyright (c) 2021-2025, NVIDIA CORPORATION. +import pytest from numba import types from cudf.core.udf.utils import _udf_code_cache, compile_udf -def setup_function(): +@pytest.fixture(autouse=True) +def clear_udf_cache(): _udf_code_cache.clear() + return def assert_cache_size(size): diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/private_objects/test_extension_compilation.py similarity index 100% rename from python/cudf/cudf/tests/test_extension_compilation.py rename to python/cudf/cudf/tests/private_objects/test_extension_compilation.py diff --git a/python/cudf/cudf/tests/test_performance_tracking.py b/python/cudf/cudf/tests/private_objects/test_memory_records_report.py similarity index 100% rename from python/cudf/cudf/tests/test_performance_tracking.py rename to python/cudf/cudf/tests/private_objects/test_memory_records_report.py diff --git a/python/cudf/cudf/tests/test_nrt_stats.py b/python/cudf/cudf/tests/private_objects/test_nrt_stats.py similarity index 100% rename from python/cudf/cudf/tests/test_nrt_stats.py rename to python/cudf/cudf/tests/private_objects/test_nrt_stats.py diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py b/python/cudf/cudf/tests/series/methods/test_astype.py index 609ad8efedd..2a535478af4 100644 --- a/python/cudf/cudf/tests/series/methods/test_astype.py +++ b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -198,6 +198,29 @@ def test_numeric_to_timedelta( assert_eq(expected, actual) +@pytest.mark.parametrize( + "alias,expect_dtype", + [ + ("UInt8", "uint8"), + ("UInt16", "uint16"), + ("UInt32", "uint32"), + ("UInt64", "uint64"), + ("Int8", "int8"), + ("Int16", "int16"), + ("Int32", "int32"), + ("Int64", "int64"), + ("boolean", "bool"), + ("Float32", "float32"), + ("Float64", "float64"), + ], +) +def test_astype_with_aliases(alias, expect_dtype): + pd_data = pd.Series([1, 2, 0]) + gd_data = cudf.Series.from_pandas(pd_data) + + assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) + + def test_timedelta_datetime_cast_invalid(): sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") psr = sr.to_pandas() From 9c63bcd05902a3f5058712652b626e32e48389a8 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 25 Aug 2025 15:52:19 -0400 
Subject: [PATCH 206/366] When bundling `libnvcomp.so.X` only append the major version value (#19786) We previously extracted the whole X.Y.Z.P version string of nvcomp and would therefore fail to load the correct version in our wheel packages. Authors: - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/19786 --- cpp/cmake/thirdparty/get_jitify.cmake | 4 ++-- python/libcudf/CMakeLists.txt | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index b6f11e30d28..5db4e3e907f 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -19,8 +19,8 @@ function(find_and_configure_jitify) rapids_cpm_find( jitify 2.0.0 GIT_REPOSITORY https://github.com/NVIDIA/jitify.git - GIT_TAG 70783a3ad7b0cad2992a26a1ebf8fbe3d6b44e25 # jitify2 branch as of 5th Aug 2025 - GIT_SHALLOW TRUE + GIT_TAG 44e978b21fc8bdb6b2d7d8d179523c8350db72e5 # jitify2 branch as of 23rd Aug 2025 + GIT_SHALLOW FALSE DOWNLOAD_ONLY TRUE ) set(JITIFY_INCLUDE_DIR diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 6e4c525edbd..6722db592bb 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -54,11 +54,13 @@ if(TARGET nvcomp::nvcomp) # Compute the SOVERSION from the library path get_filename_component(nvcomp_lib_dir ${nvcomp_lib_path} DIRECTORY) get_filename_component(nvcomp_lib_name ${nvcomp_lib_path} NAME) - string(REGEX REPLACE "libnvcomp\\.so\\.([0-9]+)" "\\1" nvcomp_soversion ${nvcomp_lib_name}) + string(REPLACE [=[libnvcomp.so.]=] "" nvcomp_soversion ${nvcomp_lib_name}) + string(REPLACE [=[.]=] ";" nvcomp_soversion ${nvcomp_soversion}) + list(GET nvcomp_soversion 0 nvcomp_soversion_major) install( FILES ${nvcomp_lib_path} DESTINATION ${SKBUILD_PLATLIB_DIR}/libcudf/lib64/ - RENAME libnvcomp.so.${nvcomp_soversion} + RENAME libnvcomp.so.${nvcomp_soversion_major} ) else() message(FATAL_ERROR "nvcomp target must be imported") From 6e2aa185766353bb8517024e51eaf3f28e5a90a2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Aug 2025 15:02:03 -0700 Subject: [PATCH 207/366] Move test_numerical/{numpy|pandas}_interop/setitem.py to new cudf classic testing directory (#19725) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19725 --- .../cudf/tests/dataframe/indexing/test_loc.py | 76 +++ .../tests/dataframe/indexing/test_setitem.py | 110 +++- .../tests/dataframe/methods/test_to_pandas.py | 28 +- .../dataframe/methods/test_to_records.py | 39 ++ .../cudf/tests/dataframe/test_constructors.py | 90 ++++ .../test_to_numeric.py} | 174 +------ .../indexes/rangeindex/test_constructors.py | 22 + .../cudf/tests/private_objects/test_column.py | 116 +++++ .../tests/private_objects/test_compile_udf.py | 1 - .../cudf/tests/series/indexing/test_iloc.py | 11 + .../cudf/tests/series/indexing/test_loc.py | 36 ++ .../tests/series/indexing/test_setitem.py | 237 ++++++++- .../tests/series/methods/test_to_pandas.py | 22 + .../cudf/cudf/tests/series/test_attributes.py | 21 + .../cudf/tests/series/test_constructors.py | 23 +- 
python/cudf/cudf/tests/test_numpy_interop.py | 96 ---- python/cudf/cudf/tests/test_pandas_interop.py | 90 ---- python/cudf/cudf/tests/test_setitem.py | 477 ------------------ 18 files changed, 834 insertions(+), 835 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/indexing/test_loc.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_to_records.py rename python/cudf/cudf/tests/{test_numerical.py => general_functions/test_to_numeric.py} (56%) create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/test_constructors.py create mode 100644 python/cudf/cudf/tests/series/indexing/test_loc.py delete mode 100644 python/cudf/cudf/tests/test_numpy_interop.py delete mode 100644 python/cudf/cudf/tests/test_pandas_interop.py delete mode 100644 python/cudf/cudf/tests/test_setitem.py diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_loc.py b/python/cudf/cudf/tests/dataframe/indexing/test_loc.py new file mode 100644 index 00000000000..82277af12c5 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/indexing/test_loc.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "value", + [ + "7", + pytest.param( + ["7", "8"], + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/11298" + ), + ), + ], +) +def test_loc_setitem_string_11298(value): + df = pd.DataFrame({"a": ["a", "b", "c"]}) + cdf = cudf.from_pandas(df) + + df.loc[:1, "a"] = value + + cdf.loc[:1, "a"] = value + + assert_eq(df, cdf) + + +@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11944") +def test_loc_setitem_list_11944(): + df = pd.DataFrame( + data={"a": ["yes", "no"], "b": [["l1", "l2"], ["c", "d"]]} + ) + cdf = cudf.from_pandas(df) + df.loc[df.a == "yes", "b"] = [["hello"]] + cdf.loc[cdf.a == "yes", "b"] = [["hello"]] + assert_eq(df, cdf) + + +@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12504") +def test_loc_setitem_extend_empty_12504(): + df = pd.DataFrame(columns=["a"]) + cdf = cudf.from_pandas(df) + + df.loc[0] = [1] + + cdf.loc[0] = [1] + + assert_eq(df, cdf) + + +def test_loc_setitem_extend_existing_12505(): + df = pd.DataFrame({"a": [0]}) + cdf = cudf.from_pandas(df) + + df.loc[1] = 1 + + cdf.loc[1] = 1 + + assert_eq(df, cdf) + + +@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12801") +def test_loc_setitem_add_column_partial_12801(): + df = pd.DataFrame({"a": [0, 1, 2]}) + cdf = cudf.from_pandas(df) + + df.loc[df.a < 2, "b"] = 1 + + cdf.loc[cdf.a < 2, "b"] = 1 + + assert_eq(df, cdf) diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py index 7d2dab7caa6..0df3cdddb1f 100644 --- a/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py +++ b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py @@ -1,8 +1,113 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
+import numpy as np import pandas as pd +import pytest import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("arg", [[True, False, True], [True, True, True]]) +@pytest.mark.parametrize("value", [0, -1]) +def test_dataframe_setitem_bool_mask_scalar(arg, value): + df = pd.DataFrame({"a": [1, 2, 3]}) + gdf = cudf.from_pandas(df) + + df[arg] = value + gdf[arg] = value + assert_eq(df, gdf) + + +def test_dataframe_setitem_scalar_bool(): + df = pd.DataFrame({"a": [1, 2, 3]}) + df[[True, False, True]] = pd.DataFrame({"a": [-1, -2]}) + + gdf = cudf.DataFrame({"a": [1, 2, 3]}) + gdf[[True, False, True]] = cudf.DataFrame({"a": [-1, -2]}) + assert_eq(df, gdf) + + +@pytest.mark.parametrize( + "df", + [pd.DataFrame({"a": [1, 2, 3]}), pd.DataFrame({"a": ["x", "y", "z"]})], +) +@pytest.mark.parametrize("arg", [["a"], "a", "b"]) +@pytest.mark.parametrize( + "value", [-10, pd.DataFrame({"a": [-1, -2, -3]}), "abc"] +) +def test_dataframe_setitem_columns(df, arg, value): + gdf = cudf.from_pandas(df) + cudf_replace_value = value + + if isinstance(cudf_replace_value, pd.DataFrame): + cudf_replace_value = cudf.from_pandas(value) + + df[arg] = value + gdf[arg] = cudf_replace_value + assert_eq(df, gdf, check_dtype=False) + + +@pytest.mark.parametrize( + "value", + [ + pd.DataFrame({"0": [-1, -2, -3], "1": [-0, -10, -1]}), + 10, + "rapids", + 0.32234, + np.datetime64(1324232423423342, "ns"), + np.timedelta64(34234324234324234, "ns"), + ], +) +def test_dataframe_setitem_new_columns(value): + df = pd.DataFrame({"a": [1, 2, 3]}) + arg = ["b", "c"] + gdf = cudf.from_pandas(df) + cudf_replace_value = value + + if isinstance(cudf_replace_value, pd.DataFrame): + cudf_replace_value = cudf.from_pandas(value) + + df[arg] = value + gdf[arg] = cudf_replace_value + assert_eq(df, gdf, check_dtype=True) + + +def test_series_setitem_index(): + df = pd.DataFrame( + data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] + ) + + df["b"] = pd.Series(data=[12, 11, 10], index=[3, 2, 1]) + gdf = cudf.DataFrame( + data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] + ) + gdf["b"] = cudf.Series(data=[12, 11, 10], index=[3, 2, 1]) + assert_eq(df, gdf, check_dtype=False) + + +@pytest.mark.xfail(reason="Copy-on-Write should make a copy") +@pytest.mark.parametrize( + "index", + [ + pd.MultiIndex.from_frame( + pd.DataFrame({"b": [3, 2, 1], "c": ["a", "b", "c"]}) + ), + ["a", "b", "c"], + ], +) +def test_setitem_dataframe_series_inplace(index): + gdf = cudf.DataFrame({"a": [1, 2, 3]}, index=index) + expected = gdf.copy() + with cudf.option_context("copy_on_write", True): + gdf["a"].replace(1, 500, inplace=True) + + assert_eq(expected, gdf) + + +def test_setitem_datetime(): + df = cudf.DataFrame({"date": pd.date_range("20010101", "20010105").values}) + assert df.date.dtype.kind == "M" def test_listcol_setitem_retain_dtype(): @@ -18,8 +123,3 @@ def test_listcol_setitem_retain_dtype(): # prior to this fix: https://github.com/rapidsai/cudf/pull/10151/ df2 = df1.copy() assert df2["a"].dtype == df["a"].dtype - - -def test_setitem_datetime(): - df = cudf.DataFrame({"date": pd.date_range("20010101", "20010105").values}) - assert df.date.dtype.kind == "M" diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py b/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py index df549bbac5a..739d97c28a5 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py @@ -1,11 +1,37 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
-
+import numpy as np
 import pytest
 
 import cudf
 
 
+def test_to_pandas():
+    df = cudf.DataFrame(
+        {
+            "a": np.arange(5, dtype=np.int32),
+            "b": np.arange(10, 15, dtype=np.float64),
+            "c": np.array([True, False, None, True, True]),
+        }
+    )
+
+    pdf = df.to_pandas()
+
+    assert tuple(df.columns) == tuple(pdf.columns)
+
+    assert df["a"].dtype == pdf["a"].dtype
+    assert df["b"].dtype == pdf["b"].dtype
+
+    # Notice, the dtypes differ when Pandas and cudf boolean series
+    # contain None/NaN
+    assert df["c"].dtype == np.bool_
+    assert pdf["c"].dtype == np.object_
+
+    assert len(df["a"]) == len(pdf["a"])
+    assert len(df["b"]) == len(pdf["b"])
+    assert len(df["c"]) == len(pdf["c"])
+
+
 def test_list_to_pandas_nullable_true():
     df = cudf.DataFrame({"a": cudf.Series([[1, 2, 3]])})
     with pytest.raises(NotImplementedError):
diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_records.py b/python/cudf/cudf/tests/dataframe/methods/test_to_records.py
new file mode 100644
index 00000000000..eea58d2946d
--- /dev/null
+++ b/python/cudf/cudf/tests/dataframe/methods/test_to_records.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
+
+import numpy as np
+
+import cudf
+
+
+def test_to_records_noindex():
+    aa = np.arange(10, dtype=np.int32)
+    bb = np.arange(10, 20, dtype=np.float64)
+    df = cudf.DataFrame(
+        {
+            "a": aa,
+            "b": bb,
+        }
+    )
+
+    rec = df.to_records(index=False)
+    assert rec.dtype.names == ("a", "b")
+    np.testing.assert_array_equal(rec["a"], aa)
+    np.testing.assert_array_equal(rec["b"], bb)
+
+
+def test_to_records_withindex():
+    aa = np.arange(10, dtype=np.int32)
+    bb = np.arange(10, 20, dtype=np.float64)
+    df = cudf.DataFrame(
+        {
+            "a": aa,
+            "b": bb,
+        }
+    )
+
+    rec_indexed = df.to_records(index=True)
+    assert rec_indexed.size == len(aa)
+    assert rec_indexed.dtype.names == ("index", "a", "b")
+    np.testing.assert_array_equal(rec_indexed["a"], aa)
+    np.testing.assert_array_equal(rec_indexed["b"], bb)
+    np.testing.assert_array_equal(rec_indexed["index"], np.arange(10))
diff --git a/python/cudf/cudf/tests/dataframe/test_constructors.py b/python/cudf/cudf/tests/dataframe/test_constructors.py
index 15926f4faf0..28c515757f0 100644
--- a/python/cudf/cudf/tests/dataframe/test_constructors.py
+++ b/python/cudf/cudf/tests/dataframe/test_constructors.py
@@ -1354,3 +1354,93 @@ def test_create_interval_df(data1, data2, data3, data4, interval_closed):
         dtype="interval",
     )
     assert_eq(expect_three, got_three)
+
+
+def test_from_pandas():
+    pdf = pd.DataFrame(
+        {
+            "a": np.arange(10, dtype=np.int32),
+            "b": np.arange(10, 20, dtype=np.float64),
+        }
+    )
+
+    df = cudf.DataFrame.from_pandas(pdf)
+
+    assert tuple(df.columns) == tuple(pdf.columns)
+
+    assert df["a"].dtype == pdf["a"].dtype
+    assert df["b"].dtype == pdf["b"].dtype
+
+    assert len(df["a"]) == len(pdf["a"])
+    assert len(df["b"]) == len(pdf["b"])
+
+
+def test_from_pandas_ex1():
+    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
+    df = cudf.DataFrame.from_pandas(pdf)
+
+    assert tuple(df.columns) == tuple(pdf.columns)
+    assert np.all(df["a"].to_numpy() == pdf["a"])
+    matches = df["b"].to_numpy(na_value=np.nan) == pdf["b"]
+    # the 3rd element is False due to (nan == nan) == False
+    assert np.all(matches == [True, True, False, True])
+    assert np.isnan(df["b"].to_numpy(na_value=np.nan)[2])
+    assert np.isnan(pdf["b"][2])
+
+
+def test_from_pandas_with_index():
+    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
+    pdf = pdf.set_index(np.asarray([4, 3, 2, 1]))
+    df = cudf.DataFrame.from_pandas(pdf)
+
+    # 
Check columns + assert_eq(df.a, pdf.a) + assert_eq(df.b, pdf.b) + # Check index + assert_eq(df.index.values, pdf.index.values) + # Check again using pandas testing tool on frames + assert_eq(df, pdf) + + +@pytest.mark.parametrize("columns", [None, ("a", "b"), ("a",), ("b",)]) +def test_from_records_noindex(columns): + recdtype = np.dtype([("a", np.int32), ("b", np.float64)]) + rec = np.recarray(10, dtype=recdtype) + rec.a = aa = np.arange(10, dtype=np.int32) + rec.b = bb = np.arange(10, 20, dtype=np.float64) + df = cudf.DataFrame.from_records(rec, columns=columns) + + if columns and "a" in columns: + assert_eq(aa, df["a"].values) + if columns and "b" in columns: + assert_eq(bb, df["b"].values) + assert_eq(np.arange(10), df.index.values) + + +@pytest.mark.parametrize("columns", [None, ("a", "b"), ("a",), ("b",)]) +def test_from_records_withindex(columns): + recdtype = np.dtype( + [("index", np.int64), ("a", np.int32), ("b", np.float64)] + ) + rec = np.recarray(10, dtype=recdtype) + rec.index = ii = np.arange(30, 40) + rec.a = aa = np.arange(10, dtype=np.int32) + rec.b = bb = np.arange(10, 20, dtype=np.float64) + df = cudf.DataFrame.from_records(rec, index="index") + + if columns and "a" in columns: + assert_eq(aa, df["a"].values) + if columns and "b" in columns: + assert_eq(bb, df["b"].values) + assert_eq(ii, df.index.values) + + +def test_numpy_non_contiguous(): + recdtype = np.dtype([("index", np.int64), ("a", np.int32)]) + rec = np.recarray(10, dtype=recdtype) + rec.index = np.arange(30, 40) + rec.a = aa = np.arange(20, dtype=np.int32)[::2] + assert rec.a.flags["C_CONTIGUOUS"] is False + + gdf = cudf.DataFrame.from_records(rec, index="index") + assert_eq(aa, gdf["a"].values) diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/general_functions/test_to_numeric.py similarity index 56% rename from python/cudf/cudf/tests/test_numerical.py rename to python/cudf/cudf/tests/general_functions/test_to_numeric.py index b1a2f081cd2..bb4ca6b24bf 100644 --- a/python/cudf/cudf/tests/test_numerical.py +++ b/python/cudf/cudf/tests/general_functions/test_to_numeric.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -7,122 +7,12 @@ import cudf from cudf.core._compat import PANDAS_GE_220 from cudf.testing import assert_eq -from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if -from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes +from cudf.testing._utils import expect_warning_if -def test_can_cast_safely_same_kind(): - # 'i' -> 'i' - data = cudf.Series([1, 2, 3], dtype="int32")._column - to_dtype = np.dtype("int64") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 3], dtype="int64")._column - to_dtype = np.dtype("int32") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 2**31], dtype="int64")._column - assert not data.can_cast_safely(to_dtype) - - # 'u' -> 'u' - data = cudf.Series([1, 2, 3], dtype="uint32")._column - to_dtype = np.dtype("uint64") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 3], dtype="uint64")._column - to_dtype = np.dtype("uint32") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 2**33], dtype="uint64")._column - assert not data.can_cast_safely(to_dtype) - - # 'f' -> 'f' - data = cudf.Series([np.inf, 1.0], dtype="float64")._column - to_dtype = np.dtype("float32") - assert data.can_cast_safely(to_dtype) - - data = cudf.Series( - [float(np.finfo("float32").max) * 2, 1.0], dtype="float64" - )._column - to_dtype = np.dtype("float32") - assert not data.can_cast_safely(to_dtype) - - -def test_can_cast_safely_mixed_kind(): - data = cudf.Series([1, 2, 3], dtype="int32")._column - to_dtype = np.dtype("float32") - assert data.can_cast_safely(to_dtype) - - # too big to fit into f32 exactly - data = cudf.Series([1, 2, 2**24 + 1], dtype="int32")._column - assert not data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 3], dtype="uint32")._column - to_dtype = np.dtype("float32") - assert data.can_cast_safely(to_dtype) - - # too big to fit into f32 exactly - data = cudf.Series([1, 2, 2**24 + 1], dtype="uint32")._column - assert not data.can_cast_safely(to_dtype) - - to_dtype = np.dtype("float64") - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1.0, 2.0, 3.0], dtype="float32")._column - to_dtype = np.dtype("int32") - assert data.can_cast_safely(to_dtype) - - # not integer float - data = cudf.Series([1.0, 2.0, 3.5], dtype="float32")._column - assert not data.can_cast_safely(to_dtype) - - data = cudf.Series([10.0, 11.0, 2000.0], dtype="float64")._column - assert data.can_cast_safely(to_dtype) - - # float out of int range - data = cudf.Series([1.0, 2.0, 1.0 * (2**31)], dtype="float32")._column - assert not data.can_cast_safely(to_dtype) - - # negative signed integers casting to unsigned integers - data = cudf.Series([-1, 0, 1], dtype="int32")._column - to_dtype = np.dtype("uint32") - assert not data.can_cast_safely(to_dtype) - - -def test_to_pandas_nullable_integer(): - gsr_not_null = cudf.Series([1, 2, 3]) - gsr_has_null = cudf.Series([1, 2, None]) - - psr_not_null = pd.Series([1, 2, 3], dtype="int64") - psr_has_null = pd.Series([1, 2, None], dtype="Int64") - - assert_eq(gsr_not_null.to_pandas(), psr_not_null) - assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) - - -def test_to_pandas_nullable_bool(): - gsr_not_null = cudf.Series([True, False, True]) - gsr_has_null = cudf.Series([True, False, None]) - - psr_not_null = pd.Series([True, False, True], dtype="bool") - psr_has_null = pd.Series([True, False, None], dtype="boolean") - - assert_eq(gsr_not_null.to_pandas(), psr_not_null) - 
assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) - - -def test_can_cast_safely_has_nulls(): - data = cudf.Series([1, 2, 3, None], dtype="float32")._column - to_dtype = np.dtype("int64") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 3.1, None], dtype="float32")._column - assert not data.can_cast_safely(to_dtype) +@pytest.fixture(params=["integer", "signed", "unsigned", "float"]) +def downcast(request): + return request.param @pytest.mark.parametrize( @@ -182,9 +72,6 @@ def test_to_numeric_basic_1d(data): [np.iinfo(np.int64).max, np.iinfo(np.int64).min], ], ) -@pytest.mark.parametrize( - "downcast", ["integer", "signed", "unsigned", "float"] -) def test_to_numeric_downcast_int(data, downcast): ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -212,9 +99,6 @@ def test_to_numeric_downcast_int(data, downcast): [1.0, 1.5, 2.6, 3.4], ], ) -@pytest.mark.parametrize( - "downcast", ["signed", "integer", "unsigned", "float"] -) def test_to_numeric_downcast_float(data, downcast): ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -237,8 +121,10 @@ def test_to_numeric_downcast_float(data, downcast): [-1.0, -1.79e308], ], ) -@pytest.mark.parametrize("downcast", ["signed", "integer", "unsigned"]) def test_to_numeric_downcast_large_float(data, downcast): + if downcast == "float": + pytest.skip(f"{downcast=} not applicable for test") + ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -260,8 +146,8 @@ def test_to_numeric_downcast_large_float(data, downcast): [-1.0, -1.79e308], ], ) -@pytest.mark.parametrize("downcast", ["float"]) -def test_to_numeric_downcast_large_float_pd_bug(data, downcast): +def test_to_numeric_downcast_large_float_pd_bug(data): + downcast = "float" ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -278,9 +164,6 @@ def test_to_numeric_downcast_large_float_pd_bug(data, downcast): [str(np.iinfo(np.int64).max), str(np.iinfo(np.int64).min)], ], ) -@pytest.mark.parametrize( - "downcast", ["signed", "integer", "unsigned", "float"] -) def test_to_numeric_downcast_string_int(data, downcast): ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -302,9 +185,6 @@ def test_to_numeric_downcast_string_int(data, downcast): ["1", "10", "1.0", "2e3", "", ""], # mixed empty strings ], ) -@pytest.mark.parametrize( - "downcast", ["signed", "integer", "unsigned", "float"] -) def test_to_numeric_downcast_string_float(data, downcast): ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -335,9 +215,6 @@ def test_to_numeric_downcast_string_float(data, downcast): ], # 2 digits relaxed from np.finfo(np.float64).min/max ], ) -@pytest.mark.parametrize( - "downcast", ["signed", "integer", "unsigned", "float"] -) def test_to_numeric_downcast_string_large_float(data, downcast): ps = pd.Series(data) gs = cudf.from_pandas(ps) @@ -382,35 +259,8 @@ def test_to_numeric_error(data, errors): assert_eq(expect, got) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) -def test_series_construction_with_nulls(dtype, input_obj): - dtype = cudf.dtype(dtype) - # numpy case - - expect = pd.Series(input_obj, dtype=np_dtypes_to_pandas_dtypes[dtype]) - got = cudf.Series(input_obj, dtype=dtype).to_pandas(nullable=True) - - assert_eq(expect, got) - - # Test numpy array of objects case - np_data = [ - dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj - ] - - expect = pd.Series(np_data, dtype=np_dtypes_to_pandas_dtypes[dtype]) - got = cudf.Series(np_data, dtype=dtype).to_pandas(nullable=True) - assert_eq(expect, got) - - 
-@pytest.mark.parametrize( - "data", - [[True, False, True]], -) -@pytest.mark.parametrize( - "downcast", ["signed", "integer", "unsigned", "float"] -) -def test_series_to_numeric_bool(data, downcast): +def test_series_to_numeric_bool(downcast): + data = [True, False, True] ps = pd.Series(data) gs = cudf.from_pandas(ps) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/test_constructors.py b/python/cudf/cudf/tests/indexes/rangeindex/test_constructors.py new file mode 100644 index 00000000000..9bbe35ed33b --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/test_constructors.py @@ -0,0 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_from_pandas_rangeindex(): + idx1 = pd.RangeIndex(start=0, stop=4, step=1, name="myindex") + idx2 = cudf.from_pandas(idx1) + + # Check index + assert_eq(idx1.values, idx2.values) + assert idx1.name == idx2.name + + +def test_from_pandas_rangeindex_step(): + expected = pd.RangeIndex(start=0, stop=8, step=2, name="myindex") + actual = cudf.from_pandas(expected) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/private_objects/test_column.py b/python/cudf/cudf/tests/private_objects/test_column.py index 6ed3111b587..4d2a7eaea41 100644 --- a/python/cudf/cudf/tests/private_objects/test_column.py +++ b/python/cudf/cudf/tests/private_objects/test_column.py @@ -55,6 +55,30 @@ def str_host_view(list_of_str, to_dtype): ) +def test_column_set_equal_length_object_by_mask(): + # Series.__setitem__ might bypass some of the cases + # handled in column.__setitem__ so this test is needed + + data = cudf.Series([0, 0, 1, 1, 1])._column + replace_data = cudf.Series([100, 200, 300, 400, 500])._column + bool_col = cudf.Series([True, True, True, True, True])._column + + data[bool_col] = replace_data + assert_eq( + cudf.Series._from_column(data), + cudf.Series._from_column(replace_data), + ) + + data = cudf.Series([0, 0, 1, 1, 1])._column + bool_col = cudf.Series([True, False, True, False, True])._column + data[bool_col] = replace_data + + assert_eq( + cudf.Series._from_column(data), + cudf.Series([100, 0, 300, 1, 500]), + ) + + @pytest.mark.parametrize("offset", [0, 1, 15]) @pytest.mark.parametrize("size", [50, 10, 0]) def test_column_offset_and_size(pandas_input, offset, size): @@ -404,6 +428,98 @@ def test_as_column_buffer(box, data): ) +def test_can_cast_safely_same_kind(): + # 'i' -> 'i' + data = cudf.Series([1, 2, 3], dtype="int32")._column + to_dtype = np.dtype("int64") + + assert data.can_cast_safely(to_dtype) + + data = cudf.Series([1, 2, 3], dtype="int64")._column + to_dtype = np.dtype("int32") + + assert data.can_cast_safely(to_dtype) + + data = cudf.Series([1, 2, 2**31], dtype="int64")._column + assert not data.can_cast_safely(to_dtype) + + # 'u' -> 'u' + data = cudf.Series([1, 2, 3], dtype="uint32")._column + to_dtype = np.dtype("uint64") + + assert data.can_cast_safely(to_dtype) + + data = cudf.Series([1, 2, 3], dtype="uint64")._column + to_dtype = np.dtype("uint32") + + assert data.can_cast_safely(to_dtype) + + data = cudf.Series([1, 2, 2**33], dtype="uint64")._column + assert not data.can_cast_safely(to_dtype) + + # 'f' -> 'f' + data = cudf.Series([np.inf, 1.0], dtype="float64")._column + to_dtype = np.dtype("float32") + assert data.can_cast_safely(to_dtype) + + data = cudf.Series( + [float(np.finfo("float32").max) * 2, 1.0], dtype="float64" + )._column + to_dtype = np.dtype("float32") + assert not data.can_cast_safely(to_dtype) + + +def 
test_can_cast_safely_mixed_kind(): + data = cudf.Series([1, 2, 3], dtype="int32")._column + to_dtype = np.dtype("float32") + assert data.can_cast_safely(to_dtype) + + # too big to fit into f32 exactly + data = cudf.Series([1, 2, 2**24 + 1], dtype="int32")._column + assert not data.can_cast_safely(to_dtype) + + data = cudf.Series([1, 2, 3], dtype="uint32")._column + to_dtype = np.dtype("float32") + assert data.can_cast_safely(to_dtype) + + # too big to fit into f32 exactly + data = cudf.Series([1, 2, 2**24 + 1], dtype="uint32")._column + assert not data.can_cast_safely(to_dtype) + + to_dtype = np.dtype("float64") + assert data.can_cast_safely(to_dtype) + + data = cudf.Series([1.0, 2.0, 3.0], dtype="float32")._column + to_dtype = np.dtype("int32") + assert data.can_cast_safely(to_dtype) + + # not integer float + data = cudf.Series([1.0, 2.0, 3.5], dtype="float32")._column + assert not data.can_cast_safely(to_dtype) + + data = cudf.Series([10.0, 11.0, 2000.0], dtype="float64")._column + assert data.can_cast_safely(to_dtype) + + # float out of int range + data = cudf.Series([1.0, 2.0, 1.0 * (2**31)], dtype="float32")._column + assert not data.can_cast_safely(to_dtype) + + # negative signed integers casting to unsigned integers + data = cudf.Series([-1, 0, 1], dtype="int32")._column + to_dtype = np.dtype("uint32") + assert not data.can_cast_safely(to_dtype) + + +def test_can_cast_safely_has_nulls(): + data = cudf.Series([1, 2, 3, None], dtype="float32")._column + to_dtype = np.dtype("int64") + + assert data.can_cast_safely(to_dtype) + + data = cudf.Series([1, 2, 3.1, None], dtype="float32")._column + assert not data.can_cast_safely(to_dtype) + + @pytest.mark.parametrize( "data,pyarrow_kwargs,cudf_kwargs", [ diff --git a/python/cudf/cudf/tests/private_objects/test_compile_udf.py b/python/cudf/cudf/tests/private_objects/test_compile_udf.py index 8d87c323786..2eb12f1ae86 100644 --- a/python/cudf/cudf/tests/private_objects/test_compile_udf.py +++ b/python/cudf/cudf/tests/private_objects/test_compile_udf.py @@ -9,7 +9,6 @@ @pytest.fixture(autouse=True) def clear_udf_cache(): _udf_code_cache.clear() - return def assert_cache_size(size): diff --git a/python/cudf/cudf/tests/series/indexing/test_iloc.py b/python/cudf/cudf/tests/series/indexing/test_iloc.py index 0d460c11dc8..00bc0422c7d 100644 --- a/python/cudf/cudf/tests/series/indexing/test_iloc.py +++ b/python/cudf/cudf/tests/series/indexing/test_iloc.py @@ -1,11 +1,22 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import numpy as np import pytest import cudf from cudf.testing import assert_eq +def test_series_setitem_singleton_range(): + sr = cudf.Series([1, 2, 3], dtype=np.int64) + psr = sr.to_pandas() + value = np.asarray([7], dtype=np.int64) + sr.iloc[:1] = value + psr.iloc[:1] = value + assert_eq(sr, cudf.Series([7, 2, 3], dtype=np.int64)) + assert_eq(sr, psr, check_dtype=True) + + @pytest.mark.parametrize( "indices", [slice(0, 3), slice(1, 4), slice(None, None, 2), slice(1, None, 2)], diff --git a/python/cudf/cudf/tests/series/indexing/test_loc.py b/python/cudf/cudf/tests/series/indexing/test_loc.py new file mode 100644 index 00000000000..da67da0699e --- /dev/null +++ b/python/cudf/cudf/tests/series/indexing/test_loc.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13031") +@pytest.mark.parametrize("other_index", [["1", "3", "2"], [1, 2, 3]]) +def test_loc_setitem_series_index_alignment_13031(other_index): + s = pd.Series([1, 2, 3], index=["1", "2", "3"]) + other = pd.Series([5, 6, 7], index=other_index) + + cs = cudf.from_pandas(s) + cother = cudf.from_pandas(other) + + s.loc[["1", "3"]] = other + + cs.loc[["1", "3"]] = cother + + assert_eq(s, cs) + + +def test_series_set_item_index_reference(): + gs1 = cudf.Series([1], index=[7]) + gs2 = cudf.Series([2], index=gs1.index) + + gs1.loc[11] = 2 + ps1 = pd.Series([1], index=[7]) + ps2 = pd.Series([2], index=ps1.index) + ps1.loc[11] = 2 + + assert_eq(ps1, gs1) + assert_eq(ps2, gs2) diff --git a/python/cudf/cudf/tests/series/indexing/test_setitem.py b/python/cudf/cudf/tests/series/indexing/test_setitem.py index 4d78d3f4698..aba9edcc6c7 100644 --- a/python/cudf/cudf/tests/series/indexing/test_setitem.py +++ b/python/cudf/cudf/tests/series/indexing/test_setitem.py @@ -6,10 +6,243 @@ import pytest import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.testing import assert_eq -from cudf.testing._utils import ( - assert_exceptions_equal, +from cudf.testing._utils import assert_exceptions_equal, expect_warning_if + + +@pytest.mark.parametrize( + "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] ) +def test_series_set_item(arg): + psr = pd.Series([1, 2, 3], index=["a", "b", "c"]) + gsr = cudf.from_pandas(psr) + + psr[arg] = 11 + gsr[arg] = 11 + + assert_eq(psr, gsr) + + +def test_column_set_unequal_length_object_by_mask(): + data = [1, 2, 3, 4, 5] + replace_data_1 = [8, 9] + replace_data_2 = [8, 9, 10, 11] + mask = [True, True, False, True, False] + + psr = pd.Series(data) + gsr = cudf.Series(data) + assert_exceptions_equal( + psr.__setitem__, + gsr.__setitem__, + ([mask, replace_data_1], {}), + ([mask, replace_data_1], {}), + ) + + psr = pd.Series(data) + gsr = cudf.Series(data) + assert_exceptions_equal( + psr.__setitem__, + gsr.__setitem__, + ([mask, replace_data_2], {}), + ([mask, replace_data_2], {}), + ) + + +def test_categorical_setitem_invalid(): + ps = pd.Series([1, 2, 3], dtype="category") + gs = cudf.Series([1, 2, 3], dtype="category") + + assert_exceptions_equal( + lfunc=ps.__setitem__, + rfunc=gs.__setitem__, + lfunc_args_and_kwargs=([0, 5], {}), + rfunc_args_and_kwargs=([0, 5], {}), + ) + + +def test_series_slice_setitem_list(): + actual = cudf.Series([[[1, 2], [2, 3]], [[3, 4]], [[4, 5]], [[6, 7]]]) + actual[slice(0, 3, 1)] = [[10, 11], [12, 23]] + expected = cudf.Series( + [ + [[10, 11], [12, 23]], + [[10, 11], [12, 23]], + [[10, 11], [12, 23]], + [[6, 7]], + ] + ) + assert_eq(actual, expected) + + +def test_series_slice_setitem_struct(): + actual = cudf.Series( + [ + {"a": {"b": 10}, "b": 11}, + {"a": {"b": 100}, "b": 5}, + {"a": {"b": 50}, "b": 2}, + {"a": {"b": 1000}, "b": 67}, + {"a": {"b": 4000}, "b": 1090}, + ] + ) + actual[slice(0, 3, 1)] = {"a": {"b": 5050}, "b": 101} + expected = cudf.Series( + [ + {"a": {"b": 5050}, "b": 101}, + {"a": {"b": 5050}, "b": 101}, + {"a": {"b": 5050}, "b": 101}, + {"a": {"b": 1000}, "b": 67}, + {"a": {"b": 4000}, "b": 1090}, + ] + ) + assert_eq(actual, expected) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize("dtype", 
[np.int32, np.int64, np.float32, np.float64]) +@pytest.mark.parametrize("indices", [0, [1, 2]]) +def test_series_setitem_upcasting(dtype, indices): + sr = pd.Series([0, 0, 0], dtype=dtype) + cr = cudf.from_pandas(sr) + assert_eq(sr, cr) + # Must be a non-integral floating point value that can't be losslessly + # converted to float32, otherwise pandas will try and match the source + # column dtype. + new_value = np.float64(np.pi) + col_ref = cr._column + with expect_warning_if(dtype != np.float64): + sr[indices] = new_value + with expect_warning_if(dtype != np.float64): + cr[indices] = new_value + assert_eq(sr, cr) + + if dtype == np.float64: + # no-op type cast should not modify backing column + assert col_ref == cr._column + + +@pytest.mark.parametrize( + "klass", + [ + list, + cudf.Series, + lambda x: cudf.Series(x, index=[2, 3, 4, 5, 6]), + ], +) +def test_series_set_equal_length_object_by_mask(klass): + replace_data = klass([100, 200, 300, 400, 500]) + psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64") + gsr = cudf.from_pandas(psr) + + # Lengths match in trivial case + pd_bool_col = pd.Series([True] * len(psr), dtype="boolean") + gd_bool_col = cudf.from_pandas(pd_bool_col) + psr[pd_bool_col] = ( + replace_data.to_pandas(nullable=True) + if hasattr(replace_data, "to_pandas") + else pd.Series(replace_data) + ) + gsr[gd_bool_col] = replace_data + + assert_eq(psr.astype("float"), gsr.astype("float")) + + # Test partial masking + psr[psr > 1] = ( + replace_data.to_pandas() + if hasattr(replace_data, "to_pandas") + else pd.Series(replace_data) + ) + gsr[gsr > 1] = replace_data + + assert_eq(psr.astype("float"), gsr.astype("float")) + + +# TODO: these two tests could perhaps be changed once specifics of +# pandas compat wrt upcasting are decided on; this is just baking in +# status-quo. 
+def test_series_setitem_upcasting_string_column(): + sr = pd.Series([0, 0, 0], dtype=str) + cr = cudf.from_pandas(sr) + new_value = np.float64(10.5) + sr[0] = str(new_value) + cr[0] = str(new_value) + assert_eq(sr, cr) + + +def test_series_setitem_upcasting_string_value(): + sr = cudf.Series([0, 0, 0], dtype=int) + # This is a distinction with pandas, which lets you instead make an + # object column with ["10", 0, 0] + sr[0] = "10" + assert_eq(pd.Series([10, 0, 0], dtype=int), sr) + with pytest.raises(ValueError): + sr[0] = "non-integer" + + +def test_scatter_by_slice_with_start_and_step(): + source = pd.Series([1, 2, 3, 4, 5]) + csource = cudf.from_pandas(source) + target = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + ctarget = cudf.from_pandas(target) + target[1::2] = source + ctarget[1::2] = csource + assert_eq(target, ctarget) + + +@pytest.mark.parametrize("n", [1, 3]) +def test_setitem_str_trailing_null(n): + trailing_nulls = "\x00" * n + s = cudf.Series(["a", "b", "c" + trailing_nulls]) + assert s[2] == "c" + trailing_nulls + s[0] = "a" + trailing_nulls + assert s[0] == "a" + trailing_nulls + s[1] = trailing_nulls + assert s[1] == trailing_nulls + s[0] = "" + assert s[0] == "" + s[0] = "\x00" + assert s[0] == "\x00" + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series([1, 2, 3], index=pd.RangeIndex(0, 3)), + pd.Series([1, 2, 3], index=pd.RangeIndex(start=2, stop=-1, step=-1)), + pd.Series([1, 2, 3], index=pd.RangeIndex(start=1, stop=6, step=2)), + pd.Series( + [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-9, step=-2) + ), + pd.Series( + [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-12, step=-3) + ), + pd.Series([1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=14, step=4)), + pd.Series( + [1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=-14, step=-4) + ), + ], +) +@pytest.mark.parametrize("arg", [[1], 5.6, 3.1]) +def test_series_set_item_range_index(ps, arg): + gsr = cudf.from_pandas(ps) + psr = ps.copy(deep=True) + psr[arg] = 11 + gsr[arg] = 11 + + assert_eq(psr, gsr, check_index_type=True) + + +@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/7448") +def test_iloc_setitem_7448(): + index = pd.MultiIndex.from_product([(1, 2), (3, 4)]) + expect = cudf.Series([1, 2, 3, 4], index=index) + actual = cudf.from_pandas(expect) + expect[(1, 3)] = 101 + actual[(1, 3)] = 101 + assert_eq(expect, actual) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/series/methods/test_to_pandas.py b/python/cudf/cudf/tests/series/methods/test_to_pandas.py index c49a4bfc7f3..9013675c191 100644 --- a/python/cudf/cudf/tests/series/methods/test_to_pandas.py +++ b/python/cudf/cudf/tests/series/methods/test_to_pandas.py @@ -148,6 +148,28 @@ def test_series_to_pandas_arrow_type_nullable_raises(scalar): ser.to_pandas(nullable=True, arrow_type=True) +def test_to_pandas_nullable_integer(): + gsr_not_null = cudf.Series([1, 2, 3]) + gsr_has_null = cudf.Series([1, 2, None]) + + psr_not_null = pd.Series([1, 2, 3], dtype="int64") + psr_has_null = pd.Series([1, 2, None], dtype="Int64") + + assert_eq(gsr_not_null.to_pandas(), psr_not_null) + assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) + + +def test_to_pandas_nullable_bool(): + gsr_not_null = cudf.Series([True, False, True]) + gsr_has_null = cudf.Series([True, False, None]) + + psr_not_null = pd.Series([True, False, True], dtype="bool") + psr_has_null = pd.Series([True, False, None], dtype="boolean") + + assert_eq(gsr_not_null.to_pandas(), psr_not_null) + assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) 
+ + @pytest.mark.parametrize( "scalar", [ diff --git a/python/cudf/cudf/tests/series/test_attributes.py b/python/cudf/cudf/tests/series/test_attributes.py index 80475505bc4..65ba67dce90 100644 --- a/python/cudf/cudf/tests/series/test_attributes.py +++ b/python/cudf/cudf/tests/series/test_attributes.py @@ -149,6 +149,27 @@ def test_series_iter_error(): iter(gs._column) +@pytest.mark.parametrize( + "data", + [ + lambda: cudf.Series([1, 2, 3, -12, 12, 44]), + lambda: cudf.Series([1, 2, 3, -12, 12, 44], dtype="str"), + lambda: cudf.DataFrame( + {"a": [1, 2, 3, -1234], "b": [0.1, 0.2222, 0.4, -3.14]} + ), + ], +) +@pytest.mark.parametrize("dtype", [None, "float", "int", "str"]) +def test_series_dataframe__array__(data, dtype): + gs = data() + + with pytest.raises(TypeError): + gs.__array__(dtype=dtype) + + with pytest.raises(TypeError): + gs.index.__array__(dtype=dtype) + + @pytest.mark.parametrize("data", [[], [None, None], ["a", None]]) def test_series_size(data): psr = pd.Series(data) diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index b5b03f4955b..e9ccee3a3ae 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -22,6 +22,7 @@ from cudf.errors import MixedTypeError from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal +from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes @pytest.mark.parametrize( @@ -679,6 +680,26 @@ def test_construct_nonnative_array(arr): assert_eq(result, expected) +@pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) +def test_series_construction_with_nulls(numeric_types_as_str, input_obj): + dtype = np.dtype(numeric_types_as_str) + # numpy case + + expect = pd.Series(input_obj, dtype=np_dtypes_to_pandas_dtypes[dtype]) + got = cudf.Series(input_obj, dtype=dtype).to_pandas(nullable=True) + + assert_eq(expect, got) + + # Test numpy array of objects case + np_data = [ + dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj + ] + + expect = pd.Series(np_data, dtype=np_dtypes_to_pandas_dtypes[dtype]) + got = cudf.Series(np_data, dtype=dtype).to_pandas(nullable=True) + assert_eq(expect, got) + + @pytest.mark.parametrize("nan_as_null", [True, False]) def test_construct_all_pd_NA_with_dtype(nan_as_null): result = cudf.Series( @@ -1216,7 +1237,7 @@ def test_roundtrip_series_plc_column(ps): assert_eq(expect, actual) -def test_series_construction_with_nulls(): +def test_series_structarray_construction_with_nulls(): fields = [ pa.array([1], type=pa.int64()), pa.array([None], type=pa.int64()), diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py deleted file mode 100644 index 0bdb806732b..00000000000 --- a/python/cudf/cudf/tests/test_numpy_interop.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
- -import numpy as np -import pytest - -from cudf import DataFrame, Series -from cudf.testing import assert_eq - - -def test_to_records_noindex(): - df = DataFrame() - df["a"] = aa = np.arange(10, dtype=np.int32) - df["b"] = bb = np.arange(10, 20, dtype=np.float64) - - rec = df.to_records(index=False) - assert rec.dtype.names == ("a", "b") - np.testing.assert_array_equal(rec["a"], aa) - np.testing.assert_array_equal(rec["b"], bb) - - -def test_to_records_withindex(): - df = DataFrame() - df["a"] = aa = np.arange(10, dtype=np.int32) - df["b"] = bb = np.arange(10, 20, dtype=np.float64) - - rec_indexed = df.to_records(index=True) - assert rec_indexed.size == len(aa) - assert rec_indexed.dtype.names == ("index", "a", "b") - np.testing.assert_array_equal(rec_indexed["a"], aa) - np.testing.assert_array_equal(rec_indexed["b"], bb) - np.testing.assert_array_equal(rec_indexed["index"], np.arange(10)) - - -@pytest.mark.parametrize("columns", [None, ("a", "b"), ("a",), ("b",)]) -def test_from_records_noindex(columns): - recdtype = np.dtype([("a", np.int32), ("b", np.float64)]) - rec = np.recarray(10, dtype=recdtype) - rec.a = aa = np.arange(10, dtype=np.int32) - rec.b = bb = np.arange(10, 20, dtype=np.float64) - df = DataFrame.from_records(rec, columns=columns) - - if columns and "a" in columns: - assert_eq(aa, df["a"].values) - if columns and "b" in columns: - assert_eq(bb, df["b"].values) - assert_eq(np.arange(10), df.index.values) - - -@pytest.mark.parametrize("columns", [None, ("a", "b"), ("a",), ("b",)]) -def test_from_records_withindex(columns): - recdtype = np.dtype( - [("index", np.int64), ("a", np.int32), ("b", np.float64)] - ) - rec = np.recarray(10, dtype=recdtype) - rec.index = ii = np.arange(30, 40) - rec.a = aa = np.arange(10, dtype=np.int32) - rec.b = bb = np.arange(10, 20, dtype=np.float64) - df = DataFrame.from_records(rec, index="index") - - if columns and "a" in columns: - assert_eq(aa, df["a"].values) - if columns and "b" in columns: - assert_eq(bb, df["b"].values) - assert_eq(ii, df.index.values) - - -def test_numpy_non_contiguious(): - recdtype = np.dtype([("index", np.int64), ("a", np.int32)]) - rec = np.recarray(10, dtype=recdtype) - rec.index = np.arange(30, 40) - rec.a = aa = np.arange(20, dtype=np.int32)[::2] - assert rec.a.flags["C_CONTIGUOUS"] is False - - gdf = DataFrame.from_records(rec, index="index") - assert_eq(aa, gdf["a"].values) - - -@pytest.mark.parametrize( - "data", - [ - lambda: Series([1, 2, 3, -12, 12, 44]), - lambda: Series([1, 2, 3, -12, 12, 44], dtype="str"), - lambda: DataFrame( - {"a": [1, 2, 3, -1234], "b": [0.1, 0.2222, 0.4, -3.14]} - ), - ], -) -@pytest.mark.parametrize("dtype", [None, "float", "int", "str"]) -def test_series_dataframe__array__(data, dtype): - gs = data() - - with pytest.raises(TypeError): - gs.__array__(dtype=dtype) - - with pytest.raises(TypeError): - gs.index.__array__(dtype=dtype) diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py deleted file mode 100644 index 5782437e394..00000000000 --- a/python/cudf/cudf/tests/test_pandas_interop.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd - -import cudf -from cudf import DataFrame -from cudf.testing import assert_eq - - -def test_to_pandas(): - df = DataFrame() - df["a"] = np.arange(5, dtype=np.int32) - df["b"] = np.arange(10, 15, dtype=np.float64) - df["c"] = np.array([True, False, None, True, True]) - - pdf = df.to_pandas() - - assert tuple(df.columns) == tuple(pdf.columns) - - assert df["a"].dtype == pdf["a"].dtype - assert df["b"].dtype == pdf["b"].dtype - - # Notice, the dtype differ when Pandas and cudf boolean series - # contains None/NaN - assert df["c"].dtype == np.bool_ - assert pdf["c"].dtype == np.object_ - - assert len(df["a"]) == len(pdf["a"]) - assert len(df["b"]) == len(pdf["b"]) - assert len(df["c"]) == len(pdf["c"]) - - -def test_from_pandas(): - pdf = pd.DataFrame() - pdf["a"] = np.arange(10, dtype=np.int32) - pdf["b"] = np.arange(10, 20, dtype=np.float64) - - df = DataFrame.from_pandas(pdf) - - assert tuple(df.columns) == tuple(pdf.columns) - - assert df["a"].dtype == pdf["a"].dtype - assert df["b"].dtype == pdf["b"].dtype - - assert len(df["a"]) == len(pdf["a"]) - assert len(df["b"]) == len(pdf["b"]) - - -def test_from_pandas_ex1(): - pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - df = DataFrame.from_pandas(pdf) - - assert tuple(df.columns) == tuple(pdf.columns) - assert np.all(df["a"].to_numpy() == pdf["a"]) - matches = df["b"].to_numpy(na_value=np.nan) == pdf["b"] - # the 3d element is False due to (nan == nan) == False - assert np.all(matches == [True, True, False, True]) - assert np.isnan(df["b"].to_numpy(na_value=np.nan)[2]) - assert np.isnan(pdf["b"][2]) - - -def test_from_pandas_with_index(): - pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - pdf = pdf.set_index(np.asarray([4, 3, 2, 1])) - df = DataFrame.from_pandas(pdf) - - # Check columns - assert_eq(df.a, pdf.a) - assert_eq(df.b, pdf.b) - # Check index - assert_eq(df.index.values, pdf.index.values) - # Check again using pandas testing tool on frames - assert_eq(df, pdf) - - -def test_from_pandas_rangeindex(): - idx1 = pd.RangeIndex(start=0, stop=4, step=1, name="myindex") - idx2 = cudf.from_pandas(idx1) - - # Check index - assert_eq(idx1.values, idx2.values) - assert idx1.name == idx2.name - - -def test_from_pandas_rangeindex_step(): - expected = pd.RangeIndex(start=0, stop=8, step=2, name="myindex") - actual = cudf.from_pandas(expected) - - assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py deleted file mode 100644 index ffbf21b5548..00000000000 --- a/python/cudf/cudf/tests/test_setitem.py +++ /dev/null @@ -1,477 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -@pytest.mark.parametrize("arg", [[True, False, True], [True, True, True]]) -@pytest.mark.parametrize("value", [0, -1]) -def test_dataframe_setitem_bool_mask_scaler(arg, value): - df = pd.DataFrame({"a": [1, 2, 3]}) - gdf = cudf.from_pandas(df) - - df[arg] = value - gdf[arg] = value - assert_eq(df, gdf) - - -def test_dataframe_setitem_scaler_bool(): - df = pd.DataFrame({"a": [1, 2, 3]}) - df[[True, False, True]] = pd.DataFrame({"a": [-1, -2]}) - - gdf = cudf.DataFrame({"a": [1, 2, 3]}) - gdf[[True, False, True]] = cudf.DataFrame({"a": [-1, -2]}) - assert_eq(df, gdf) - - -@pytest.mark.parametrize( - "df", - [pd.DataFrame({"a": [1, 2, 3]}), pd.DataFrame({"a": ["x", "y", "z"]})], -) -@pytest.mark.parametrize("arg", [["a"], "a", "b"]) -@pytest.mark.parametrize( - "value", [-10, pd.DataFrame({"a": [-1, -2, -3]}), "abc"] -) -def test_dataframe_setitem_columns(df, arg, value): - gdf = cudf.from_pandas(df) - cudf_replace_value = value - - if isinstance(cudf_replace_value, pd.DataFrame): - cudf_replace_value = cudf.from_pandas(value) - - df[arg] = value - gdf[arg] = cudf_replace_value - assert_eq(df, gdf, check_dtype=False) - - -@pytest.mark.parametrize( - "value", - [ - pd.DataFrame({"0": [-1, -2, -3], "1": [-0, -10, -1]}), - 10, - 20, - 30, - "rapids", - "ai", - 0.32234, - np.datetime64(1324232423423342, "ns"), - np.timedelta64(34234324234324234, "ns"), - ], -) -def test_dataframe_setitem_new_columns(value): - df = pd.DataFrame({"a": [1, 2, 3]}) - arg = ["b", "c"] - gdf = cudf.from_pandas(df) - cudf_replace_value = value - - if isinstance(cudf_replace_value, pd.DataFrame): - cudf_replace_value = cudf.from_pandas(value) - - df[arg] = value - gdf[arg] = cudf_replace_value - assert_eq(df, gdf, check_dtype=True) - - -# set_item_series inconsistency -def test_series_setitem_index(): - df = pd.DataFrame( - data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] - ) - - df["b"] = pd.Series(data=[12, 11, 10], index=[3, 2, 1]) - gdf = cudf.DataFrame( - data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] - ) - gdf["b"] = cudf.Series(data=[12, 11, 10], index=[3, 2, 1]) - assert_eq(df, gdf, check_dtype=False) - - -@pytest.mark.parametrize( - "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] -) -def test_series_set_item(arg): - psr = pd.Series([1, 2, 3], index=["a", "b", "c"]) - gsr = cudf.from_pandas(psr) - - psr[arg] = 11 - gsr[arg] = 11 - - assert_eq(psr, gsr) - - -def test_series_setitem_singleton_range(): - sr = cudf.Series([1, 2, 3], dtype=np.int64) - psr = sr.to_pandas() - value = np.asarray([7], dtype=np.int64) - sr.iloc[:1] = value - psr.iloc[:1] = value - assert_eq(sr, cudf.Series([7, 2, 3], dtype=np.int64)) - assert_eq(sr, psr, check_dtype=True) - - -@pytest.mark.xfail(reason="Copy-on-Write should make a copy") -@pytest.mark.parametrize( - "index", - [ - pd.MultiIndex.from_frame( - pd.DataFrame({"b": [3, 2, 1], "c": ["a", "b", "c"]}) - ), - ["a", "b", "c"], - ], -) -def test_setitem_dataframe_series_inplace(index): - gdf = cudf.DataFrame({"a": [1, 2, 3]}, index=index) - expected = gdf.copy() - with cudf.option_context("copy_on_write", True): - gdf["a"].replace(1, 500, inplace=True) - - assert_eq(expected, gdf) - - -@pytest.mark.parametrize( - "klass", - [ - list, - cudf.Series, - lambda x: cudf.Series(x, index=[2, 
3, 4, 5, 6]), - ], -) -def test_series_set_equal_length_object_by_mask(klass): - replace_data = klass([100, 200, 300, 400, 500]) - psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64") - gsr = cudf.from_pandas(psr) - - # Lengths match in trivial case - pd_bool_col = pd.Series([True] * len(psr), dtype="boolean") - gd_bool_col = cudf.from_pandas(pd_bool_col) - psr[pd_bool_col] = ( - replace_data.to_pandas(nullable=True) - if hasattr(replace_data, "to_pandas") - else pd.Series(replace_data) - ) - gsr[gd_bool_col] = replace_data - - assert_eq(psr.astype("float"), gsr.astype("float")) - - # Test partial masking - psr[psr > 1] = ( - replace_data.to_pandas() - if hasattr(replace_data, "to_pandas") - else pd.Series(replace_data) - ) - gsr[gsr > 1] = replace_data - - assert_eq(psr.astype("float"), gsr.astype("float")) - - -def test_column_set_equal_length_object_by_mask(): - # Series.__setitem__ might bypass some of the cases - # handled in column.__setitem__ so this test is needed - - data = cudf.Series([0, 0, 1, 1, 1])._column - replace_data = cudf.Series([100, 200, 300, 400, 500])._column - bool_col = cudf.Series([True, True, True, True, True])._column - - data[bool_col] = replace_data - assert_eq( - cudf.Series._from_column(data), - cudf.Series._from_column(replace_data), - ) - - data = cudf.Series([0, 0, 1, 1, 1])._column - bool_col = cudf.Series([True, False, True, False, True])._column - data[bool_col] = replace_data - - assert_eq( - cudf.Series._from_column(data), - cudf.Series([100, 0, 300, 1, 500]), - ) - - -def test_column_set_unequal_length_object_by_mask(): - data = [1, 2, 3, 4, 5] - replace_data_1 = [8, 9] - replace_data_2 = [8, 9, 10, 11] - mask = [True, True, False, True, False] - - psr = pd.Series(data) - gsr = cudf.Series(data) - assert_exceptions_equal( - psr.__setitem__, - gsr.__setitem__, - ([mask, replace_data_1], {}), - ([mask, replace_data_1], {}), - ) - - psr = pd.Series(data) - gsr = cudf.Series(data) - assert_exceptions_equal( - psr.__setitem__, - gsr.__setitem__, - ([mask, replace_data_2], {}), - ([mask, replace_data_2], {}), - ) - - -def test_categorical_setitem_invalid(): - ps = pd.Series([1, 2, 3], dtype="category") - gs = cudf.Series([1, 2, 3], dtype="category") - - assert_exceptions_equal( - lfunc=ps.__setitem__, - rfunc=gs.__setitem__, - lfunc_args_and_kwargs=([0, 5], {}), - rfunc_args_and_kwargs=([0, 5], {}), - ) - - -def test_series_slice_setitem_list(): - actual = cudf.Series([[[1, 2], [2, 3]], [[3, 4]], [[4, 5]], [[6, 7]]]) - actual[slice(0, 3, 1)] = [[10, 11], [12, 23]] - expected = cudf.Series( - [ - [[10, 11], [12, 23]], - [[10, 11], [12, 23]], - [[10, 11], [12, 23]], - [[6, 7]], - ] - ) - assert_eq(actual, expected) - - -def test_series_slice_setitem_struct(): - actual = cudf.Series( - [ - {"a": {"b": 10}, "b": 11}, - {"a": {"b": 100}, "b": 5}, - {"a": {"b": 50}, "b": 2}, - {"a": {"b": 1000}, "b": 67}, - {"a": {"b": 4000}, "b": 1090}, - ] - ) - actual[slice(0, 3, 1)] = {"a": {"b": 5050}, "b": 101} - expected = cudf.Series( - [ - {"a": {"b": 5050}, "b": 101}, - {"a": {"b": 5050}, "b": 101}, - {"a": {"b": 5050}, "b": 101}, - {"a": {"b": 1000}, "b": 67}, - {"a": {"b": 4000}, "b": 1090}, - ] - ) - assert_eq(actual, expected) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) -@pytest.mark.parametrize("indices", [0, [1, 2]]) -def test_series_setitem_upcasting(dtype, indices): - sr = pd.Series([0, 0, 
0], dtype=dtype) - cr = cudf.from_pandas(sr) - assert_eq(sr, cr) - # Must be a non-integral floating point value that can't be losslessly - # converted to float32, otherwise pandas will try and match the source - # column dtype. - new_value = np.float64(np.pi) - col_ref = cr._column - with expect_warning_if(dtype != np.float64): - sr[indices] = new_value - with expect_warning_if(dtype != np.float64): - cr[indices] = new_value - assert_eq(sr, cr) - - if dtype == np.float64: - # no-op type cast should not modify backing column - assert col_ref == cr._column - - -# TODO: these two tests could perhaps be changed once specifics of -# pandas compat wrt upcasting are decided on; this is just baking in -# status-quo. -def test_series_setitem_upcasting_string_column(): - sr = pd.Series([0, 0, 0], dtype=str) - cr = cudf.from_pandas(sr) - new_value = np.float64(10.5) - sr[0] = str(new_value) - cr[0] = str(new_value) - assert_eq(sr, cr) - - -def test_series_setitem_upcasting_string_value(): - sr = cudf.Series([0, 0, 0], dtype=int) - # This is a distinction with pandas, which lets you instead make an - # object column with ["10", 0, 0] - sr[0] = "10" - assert_eq(pd.Series([10, 0, 0], dtype=int), sr) - with pytest.raises(ValueError): - sr[0] = "non-integer" - - -def test_scatter_by_slice_with_start_and_step(): - source = pd.Series([1, 2, 3, 4, 5]) - csource = cudf.from_pandas(source) - target = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) - ctarget = cudf.from_pandas(target) - target[1::2] = source - ctarget[1::2] = csource - assert_eq(target, ctarget) - - -@pytest.mark.parametrize("n", [1, 3]) -def test_setitem_str_trailing_null(n): - trailing_nulls = "\x00" * n - s = cudf.Series(["a", "b", "c" + trailing_nulls]) - assert s[2] == "c" + trailing_nulls - s[0] = "a" + trailing_nulls - assert s[0] == "a" + trailing_nulls - s[1] = trailing_nulls - assert s[1] == trailing_nulls - s[0] = "" - assert s[0] == "" - s[0] = "\x00" - assert s[0] == "\x00" - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/7448") -def test_iloc_setitem_7448(): - index = pd.MultiIndex.from_product([(1, 2), (3, 4)]) - expect = cudf.Series([1, 2, 3, 4], index=index) - actual = cudf.from_pandas(expect) - expect[(1, 3)] = 101 - actual[(1, 3)] = 101 - assert_eq(expect, actual) - - -@pytest.mark.parametrize( - "value", - [ - "7", - pytest.param( - ["7", "8"], - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/11298" - ), - ), - ], -) -def test_loc_setitem_string_11298(value): - df = pd.DataFrame({"a": ["a", "b", "c"]}) - cdf = cudf.from_pandas(df) - - df.loc[:1, "a"] = value - - cdf.loc[:1, "a"] = value - - assert_eq(df, cdf) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11944") -def test_loc_setitem_list_11944(): - df = pd.DataFrame( - data={"a": ["yes", "no"], "b": [["l1", "l2"], ["c", "d"]]} - ) - cdf = cudf.from_pandas(df) - df.loc[df.a == "yes", "b"] = [["hello"]] - cdf.loc[df.a == "yes", "b"] = [["hello"]] - assert_eq(df, cdf) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12504") -def test_loc_setitem_extend_empty_12504(): - df = pd.DataFrame(columns=["a"]) - cdf = cudf.from_pandas(df) - - df.loc[0] = [1] - - cdf.loc[0] = [1] - - assert_eq(df, cdf) - - -def test_loc_setitem_extend_existing_12505(): - df = pd.DataFrame({"a": [0]}) - cdf = cudf.from_pandas(df) - - df.loc[1] = 1 - - cdf.loc[1] = 1 - - assert_eq(df, cdf) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12801") -def 
test_loc_setitem_add_column_partial_12801(): - df = pd.DataFrame({"a": [0, 1, 2]}) - cdf = cudf.from_pandas(df) - - df.loc[df.a < 2, "b"] = 1 - - cdf.loc[cdf.a < 2, "b"] = 1 - - assert_eq(df, cdf) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13031") -@pytest.mark.parametrize("other_index", [["1", "3", "2"], [1, 2, 3]]) -def test_loc_setitem_series_index_alignment_13031(other_index): - s = pd.Series([1, 2, 3], index=["1", "2", "3"]) - other = pd.Series([5, 6, 7], index=other_index) - - cs = cudf.from_pandas(s) - cother = cudf.from_pandas(other) - - s.loc[["1", "3"]] = other - - cs.loc[["1", "3"]] = cother - - assert_eq(s, cs) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series([1, 2, 3], index=pd.RangeIndex(0, 3)), - pd.Series([1, 2, 3], index=pd.RangeIndex(start=2, stop=-1, step=-1)), - pd.Series([1, 2, 3], index=pd.RangeIndex(start=1, stop=6, step=2)), - pd.Series( - [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-9, step=-2) - ), - pd.Series( - [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-12, step=-3) - ), - pd.Series([1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=14, step=4)), - pd.Series( - [1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=-14, step=-4) - ), - ], -) -@pytest.mark.parametrize("arg", [*list(range(-20, 20)), 5.6, 3.1]) -def test_series_set_item_range_index(ps, arg): - gsr = cudf.from_pandas(ps) - psr = ps.copy(deep=True) - psr[arg] = 11 - gsr[arg] = 11 - - assert_eq(psr, gsr, check_index_type=True) - - -def test_series_set_item_index_reference(): - gs1 = cudf.Series([1], index=[7]) - gs2 = cudf.Series([2], index=gs1.index) - - gs1.loc[11] = 2 - ps1 = pd.Series([1], index=[7]) - ps2 = pd.Series([2], index=ps1.index) - ps1.loc[11] = 2 - - assert_eq(ps1, gs1) - assert_eq(ps2, gs2) From 5fc994093f3a1686c6a37d144251e9d2fa4e024c Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 25 Aug 2025 18:34:12 -0400 Subject: [PATCH 208/366] Fix how nvcomp major version is extracted (#19791) Resolves https://github.com/rapidsai/cudf/issues/19790 Follow-up to https://github.com/rapidsai/cudf/pull/19786 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19791 --- python/libcudf/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 6722db592bb..580078b1509 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -54,9 +54,9 @@ if(TARGET nvcomp::nvcomp) # Compute the SOVERSION from the library path get_filename_component(nvcomp_lib_dir ${nvcomp_lib_path} DIRECTORY) get_filename_component(nvcomp_lib_name ${nvcomp_lib_path} NAME) - string(REPLACE [=[libnvcomp.so.]=] "" nvcomp_soversion ${nvcomp_lib_name}) - string(REPLACE [=[.]=] ";" nvcomp_soversion ${nvcomp_soversion}) - list(GET nvcomp_soversion 0 nvcomp_soversion_major) + string(REGEX REPLACE "^libnvcomp\\.so\\.([0-9]+).*$" "\\1" nvcomp_soversion_major + ${nvcomp_lib_name} + ) install( FILES ${nvcomp_lib_path} DESTINATION ${SKBUILD_PLATLIB_DIR}/libcudf/lib64/ From 6bb386441a057d2ec89e73304338d82ca0e1c0c3 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 25 Aug 2025 19:38:58 -0400 Subject: [PATCH 209/366] Use KvikIO's implementation of file-backed memory mapping (#19164) ### Background Libcudf 25.04 and earlier use `memory_mapped_source` by default. 
For host reads it uses memory-mapped I/O, and for device reads it
uses standard I/O.

Libcudf 25.06 uses standard I/O by default. For host reads,
`memory_mapped_source` still uses memory-mapped I/O, while device reads are no longer allowed
(a runtime exception is thrown if used). The Parquet/ORC readers fall back to a host read + H2D copy to
emulate a device read for the mapped source.

### This PR

Now that the file-backed memory mapping (C++) is supported by KvikIO
(https://github.com/rapidsai/kvikio/pull/740), this PR updates libcudf to reinvigorate
`memory_mapped_source` using KvikIO's implementation. This re-enables device reads and brings
performance improvements (e.g., through parallel prefault).

### Notes

`memory_mapped_source` is an implementation detail in `datasource.cpp`. So far, testing has been
conducted on C2C (arm) and PCIe (x86) systems by manually setting `LIBCUDF_MMAP_ENABLED=ON` and
running the tests. Refer to https://github.com/rapidsai/kvikio/issues/530#issuecomment-2994871923
for benchmark results.

### Dependency

This PR depends on KvikIO PR https://github.com/rapidsai/kvikio/pull/781 to fix unit test errors.

Authors:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/19164
---
 cpp/src/io/utilities/datasource.cpp | 152 ++++++----------------------
 1 file changed, 31 insertions(+), 121 deletions(-)

diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 4b2ff497f33..1eeadb68c51 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -25,6 +25,8 @@
 #include
 #include
+#include
+#include
 
 #include
 
@@ -109,20 +111,8 @@ class kvikio_source : public datasource {
                                         rmm::cuda_stream_view stream) override
   {
     CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file.");
-
     auto const read_size = std::min(size, this->size() - offset);
-
-    if constexpr (std::is_same_v) {
-      return _kvikio_handle.pread(dst,
-                                  read_size,
-                                  offset,
-                                  kvikio::defaults::task_size(),
-                                  kvikio::defaults::gds_threshold(),
-                                  false /* not to sync_default_stream */);
-    } else {
-      // HandleT is kvikio::RemoteHandle
-      return _kvikio_handle.pread(dst, read_size, offset);
-    }
+    return _kvikio_handle.pread(dst, read_size, offset);
   }
 
   size_t device_read(size_t offset,
@@ -167,6 +157,21 @@ class file_source : public kvikio_source {
       "Reading a file using kvikIO, with compatibility mode %s.",
       _kvikio_handle.get_compat_mode_manager().is_compat_mode_preferred() ? "on" : "off");
   }
+
+  std::future device_read_async(size_t offset,
+                                size_t size,
+                                uint8_t* dst,
+                                rmm::cuda_stream_view stream) override
+  {
+    CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file.");
+    auto const read_size = std::min(size, this->size() - offset);
+    return _kvikio_handle.pread(dst,
+                                read_size,
+                                offset,
+                                kvikio::defaults::task_size(),
+                                kvikio::defaults::gds_threshold(),
+                                false /* not to sync_default_stream */);
+  }
 };
 
 /**
@@ -175,117 +180,22 @@
  * Unlike Arrow's memory mapped IO class, this implementation allows memory mapping a subset of the
  * file where the starting offset may not be zero.
*/ -class memory_mapped_source : public file_source { +class memory_mapped_source : public kvikio_source { public: - explicit memory_mapped_source(char const* filepath, size_t offset, size_t max_size_estimate) - : file_source(filepath) - { - if (this->size() != 0) { - // Memory mapping is not exclusive, so we can include the whole region we expect to read - map(_kvikio_handle.fd(), offset, max_size_estimate); - } - } - - ~memory_mapped_source() override - { - if (_map_addr != nullptr) { unmap(); } - } - - std::unique_ptr host_read(size_t offset, size_t size) override - { - // Clamp length to available data - auto const read_size = std::min(size, this->size() - offset); - - // If the requested range is outside of the mapped region, read from the file - if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { - return file_source::host_read(offset, read_size); - } - - // If the requested range is only partially within the registered region, copy to a new - // host buffer to make the data safe to copy to the device - if (_reg_addr != nullptr and - (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) { - auto const src = static_cast(_map_addr) + (offset - _map_offset); - - return std::make_unique>>( - std::vector(src, src + read_size)); - } - - return std::make_unique( - static_cast(_map_addr) + offset - _map_offset, read_size); - } - - std::future> host_read_async(size_t offset, - size_t size) override - { - // Use the default implementation instead of the file_source's implementation - return datasource::host_read_async(offset, size); - } - - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - // Clamp length to available data - auto const read_size = std::min(size, this->size() - offset); - - // If the requested range is outside of the mapped region, read from the file - if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { - return file_source::host_read(offset, read_size, dst); + explicit memory_mapped_source(char const* filepath, + size_t offset, + [[maybe_unused]] size_t max_size_estimate) + : kvikio_source{kvikio::MmapHandle()} + { + // Since the superclass kvikio_source is initialized with an empty mmap handle, `this->size()` + // returns 0 at this point. Use `kvikio::get_file_size()` instead. 
+ auto const file_size = kvikio::get_file_size(filepath); + if (file_size != 0) { + CUDF_EXPECTS(offset < file_size, "Offset is past end of file", std::overflow_error); + _kvikio_handle = + kvikio::MmapHandle(filepath, "r", std::nullopt, 0, kvikio::FileHandle::m644, MAP_SHARED); } - - auto const src = static_cast(_map_addr) + (offset - _map_offset); - std::memcpy(dst, src, read_size); - return read_size; } - - std::future host_read_async(size_t offset, size_t size, uint8_t* dst) override - { - // Use the default implementation instead of the file_source's implementation - return datasource::host_read_async(offset, size, dst); - } - - [[nodiscard]] bool supports_device_read() const override { return false; } - - [[nodiscard]] bool is_device_read_preferred(size_t size) const override - { - return supports_device_read(); - } - - private: - void map(int fd, size_t offset, size_t size) - { - CUDF_EXPECTS(offset < this->size(), "Offset is past end of file", std::overflow_error); - - // Offset for `mmap()` must be page aligned - _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); - - if (size == 0 || (offset + size) > this->size()) { size = this->size() - offset; } - - // Size for `mmap()` needs to include the page padding - _map_size = size + (offset - _map_offset); - if (_map_size == 0) { return; } - - // Check if accessing a region within already mapped area - _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); - CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); - } - - void unmap() - { - if (_map_addr != nullptr) { - auto const result = munmap(_map_addr, _map_size); - if (result != 0) { CUDF_LOG_WARN("munmap failed with %d", result); } - _map_addr = nullptr; - } - } - - private: - size_t _map_offset = 0; - size_t _map_size = 0; - void* _map_addr = nullptr; - - size_t _reg_offset = 0; - size_t _reg_size = 0; - void* _reg_addr = nullptr; }; /** From 6ddf5670bb681a8357c0708aa3cdd8b6c6bd5d11 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 26 Aug 2025 10:47:00 -0400 Subject: [PATCH 210/366] Add nvbench benchmark for cudf::encode API (#19777) Adds a benchmark for the libcudf `cudf::encode` API. Looking to improve the performance of this API in follow on PR(s). 
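For reference, a minimal sketch of the call being measured (illustrative only; the
header path and the usual `cudf::encode` contract of returning a table of unique keys
plus an index column mapping each input row back to its key row are assumed here,
not taken verbatim from this patch):

```cpp
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>

// `input` stands in for any table, e.g. the single random column the new
// benchmark builds below; the function name is a placeholder.
void encode_example(cudf::table_view const& input)
{
  // keys:    table holding the deduplicated rows of `input`
  // indices: for each input row, the position of its matching row in `keys`
  auto [keys, indices] = cudf::encode(input);
}
```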
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/19777 --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/transform/encode.cpp | 64 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 cpp/benchmarks/transform/encode.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4f07dd72ed4..a6d070ce2d3 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -368,7 +368,9 @@ ConfigureNVBench( # ################################################################################################## # * transform benchmark # --------------------------------------------------------------------------------- -ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp transform/transform.cpp) +ConfigureNVBench( + TRANSFORM_NVBENCH transform/encode.cpp transform/polynomials.cpp transform/transform.cpp +) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/transform/encode.cpp b/cpp/benchmarks/transform/encode.cpp new file mode 100644 index 00000000000..dea40a0865d --- /dev/null +++ b/cpp/benchmarks/transform/encode.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +template +static void bench_encode(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const width = static_cast(state.get_int64("width")); + auto const nulls = state.get_float64("nulls"); + auto const data_type = cudf::type_to_id(); + + auto range = data_type == cudf::type_id::STRING ? 
(width / 10) : width;
+  data_profile const profile =
+    data_profile_builder().cardinality(0).null_probability(nulls).distribution(
+      data_type, distribution_id::UNIFORM, 0, range);
+  auto input = create_random_column(data_type, row_count{num_rows}, profile);
+  auto tv    = cudf::table_view({input->view()});
+
+  auto alloc_size = input->alloc_size();
+  state.add_global_memory_reads(alloc_size);
+  state.add_global_memory_writes(num_rows);
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { auto result = cudf::encode(tv, stream); });
+}
+
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::string_view, "string_view", "string_view");
+
+using Types = nvbench::type_list;
+
+NVBENCH_BENCH_TYPES(bench_encode, NVBENCH_TYPE_AXES(Types))
+  .set_name("encode")
+  .add_int64_axis("width", {10, 100})
+  .add_int64_axis("num_rows", {262144, 2097152, 16777216, 67108864})
+  .add_float64_axis("nulls", {0, 0.1});

From a2ec340718db95c639a2692789e961193f6d2ffc Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 26 Aug 2025 10:47:17 -0400
Subject: [PATCH 211/366] Fix ndsh benchmarks nvtx range usage (#19753)

Updates the NVTX scoped-range usage in the ndsh benchmarks to use the
`benchmarks` prefix instead of `libcudf`. This makes the nsys tracing clearer
by identifying benchmark utilities separately from libcudf APIs.

Also introduces a new `CUDF_BENCHMARK_RANGE()` macro to be used in benchmarks
instead of the internal libcudf macro `CUDF_FUNC_RANGE()`.

Example nsys trace with this change:
```
 16.3  657,460,069  2  328,730,034.5  328,730,034.5  12,374,462   645,085,607  447,394,341.2  PushPop  libcudf:write_parquet
 16.0  646,282,446  1  646,282,446.0  646,282,446.0  646,282,446  646,282,446            0.0  PushPop  benchmarks:write_to_parquet_device_buffer
  4.0  162,550,464  1  162,550,464.0  162,550,464.0  162,550,464  162,550,464            0.0  PushPop  benchmarks:generate_orders_lineitem_part
  2.2   87,655,406  1   87,655,406.0   87,655,406.0   87,655,406   87,655,406            0.0  PushPop  benchmarks:generate_lineitem_partial
...
```
It is now easier to separate the functions by name.
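A sketch of the intended usage, mirroring the doc comment added in
`nvtx_ranges.hpp` (the helper function and include path below are hypothetical;
only the macro and the "benchmarks" domain come from this patch):

```cpp
#include "common/nvtx_ranges.hpp"  // assumed include path for the new header

// Hypothetical benchmark helper; any benchmark utility works the same way.
void generate_some_benchmark_table()
{
  CUDF_BENCHMARK_RANGE();  // opens an NVTX range in the "benchmarks" domain,
                           // named after the enclosing function via __func__
  // ... benchmark-utility work ...
}  // the range closes automatically when the function returns
```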
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/19753 --- .../ndsh_data_generator.cpp | 23 +++++----- .../random_column_generator.cu | 17 +++---- .../ndsh_data_generator/table_helpers.cpp | 21 +++++---- cpp/benchmarks/common/nvtx_ranges.hpp | 46 +++++++++++++++++++ cpp/benchmarks/ndsh/q09.cpp | 10 ++-- cpp/benchmarks/ndsh/utilities.cpp | 31 ++++++------- cpp/benchmarks/ndsh/utilities.hpp | 1 - 7 files changed, 99 insertions(+), 50 deletions(-) create mode 100644 cpp/benchmarks/common/nvtx_ranges.hpp diff --git a/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp index 8a1df9c97dd..f03cbc5d125 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp @@ -19,12 +19,13 @@ #include "random_column_generator.hpp" #include "table_helpers.hpp" +#include + #include #include #include #include -#include #include #include #include @@ -156,7 +157,7 @@ std::unique_ptr generate_orders_independent(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); cudf::size_type const o_num_rows = scale_factor * 1'500'000; // Generate the `o_orderkey` column @@ -280,7 +281,7 @@ std::unique_ptr generate_lineitem_partial(cudf::table_view const& o rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const o_num_rows = orders_independent.num_rows(); // Generate the `lineitem` table. For each row in the `orders` table, // we have a random number (between 1 and 7) of rows in the `lineitem` table @@ -450,7 +451,7 @@ std::unique_ptr generate_orders_dependent(cudf::table_view const& l rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const l_linestatus_mask = lineitem_partial.column(0); auto const l_orderkey = lineitem_partial.column(1); auto const l_extendedprice = lineitem_partial.column(6); @@ -543,7 +544,7 @@ std::unique_ptr generate_partsupp(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Define the number of rows in the `part` and `partsupp` tables cudf::size_type const p_num_rows = scale_factor * 200'000; cudf::size_type const ps_num_rows = scale_factor * 800'000; @@ -591,7 +592,7 @@ std::unique_ptr generate_part(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); cudf::size_type const num_rows = scale_factor * 200'000; // Generate the `p_partkey` column @@ -717,7 +718,7 @@ generate_orders_lineitem_part(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Generate a table with the independent columns of the `orders` table auto orders_independent = generate_orders_independent(scale_factor, stream, mr); @@ -784,7 +785,7 @@ std::unique_ptr generate_supplier(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Calculate the number of rows based on the scale factor cudf::size_type const num_rows = scale_factor * 10'000; @@ -845,7 +846,7 @@ std::unique_ptr 
generate_customer(double scale_factor, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Calculate the number of rows based on the scale factor cudf::size_type const num_rows = scale_factor * 150'000; @@ -912,7 +913,7 @@ std::unique_ptr generate_customer(double scale_factor, std::unique_ptr generate_nation(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Define the number of rows constexpr cudf::size_type num_rows = 25; @@ -952,7 +953,7 @@ std::unique_ptr generate_nation(rmm::cuda_stream_view stream, std::unique_ptr generate_region(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Define the number of rows constexpr cudf::size_type num_rows = 5; diff --git a/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu index 4246bd1a83b..aac50aeecb2 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu +++ b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,11 +16,12 @@ #include "random_column_generator.hpp" +#include + #include #include #include -#include #include #include @@ -91,7 +92,7 @@ std::unique_ptr generate_random_string_column(cudf::size_type lowe rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto offsets_begin = cudf::detail::make_counting_transform_iterator( 0, random_number_generator(lower, upper)); auto [offsets_column, computed_bytes] = cudf::strings::detail::make_offsets_child_column( @@ -119,7 +120,7 @@ std::unique_ptr generate_random_numeric_column(T lower, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto col = cudf::make_numeric_column( cudf::data_type{cudf::type_to_id()}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr); cudf::size_type begin = 0; @@ -165,7 +166,7 @@ std::unique_ptr generate_primary_key_column(cudf::scalar const& st rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); return cudf::sequence(num_rows, start, stream, mr); } @@ -174,7 +175,7 @@ std::unique_ptr generate_repeat_string_column(std::string const& v rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const scalar = cudf::string_scalar(value); return cudf::make_column_from_scalar(scalar, num_rows, stream, mr); } @@ -185,7 +186,7 @@ std::unique_ptr generate_random_string_column_from_set( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Build a gather map of random strings to choose from // The size of the string sets always fits within 16-bit integers auto const indices = @@ -211,7 +212,7 @@ std::unique_ptr generate_repeat_sequence_column(T seq_length, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto pkey = generate_primary_key_column(cudf::numeric_scalar(0), num_rows, stream, mr); auto repeat_seq_zero_indexed = 
cudf::binary_operation(pkey->view(), diff --git a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp index 7095c227649..4185bcf60e5 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp @@ -18,13 +18,14 @@ #include "random_column_generator.hpp" +#include + #include #include #include #include #include #include -#include #include #include #include @@ -55,7 +56,7 @@ std::unique_ptr add_calendrical_days(cudf::column_view const& time rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const days_duration_type = cudf::cast(days, cudf::data_type{cudf::type_id::DURATION_DAYS}); auto const data_type = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}; return cudf::binary_operation( @@ -80,7 +81,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); constexpr auto oob_policy = cudf::out_of_bounds_policy::NULLIFY; auto const left_selected = left_input.select(left_on); auto const right_selected = right_input.select(right_on); @@ -116,7 +117,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Expression: (90000 + ((p_partkey/10) modulo 20001) + 100 * (p_partkey modulo 1000)) / 100 auto table = cudf::table_view({p_partkey}); auto p_partkey_col_ref = cudf::ast::column_reference(0); @@ -160,7 +161,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Expression: (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s))) % s + 1 // Generate the `s` col @@ -232,7 +233,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Expression: ps_suppkey = (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s + 1 // Generate the `s` col @@ -299,7 +300,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const sum_agg = cudf::make_sum_aggregation(); auto const l_num_rows_scalar = cudf::reduce(o_rep_freqs, *sum_agg, cudf::data_type{cudf::type_id::INT32}, stream, mr); @@ -322,7 +323,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const one = cudf::numeric_scalar(1); auto const one_minus_discount = cudf::binary_operation( one, discount, cudf::binary_operator::SUB, cudf::data_type{cudf::type_id::FLOAT64}, stream, mr); @@ -352,7 +353,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu [[nodiscard]] std::unique_ptr generate_address_column( cudf::size_type num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); return generate_random_string_column(10, 40, num_rows, stream, mr); } @@ -367,7 +368,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu rmm::cuda_stream_view stream, 
rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const part_a = cudf::strings::from_integers( generate_random_numeric_column(10, 34, num_rows, stream, mr)->view()); auto const part_b = cudf::strings::from_integers( diff --git a/cpp/benchmarks/common/nvtx_ranges.hpp b/cpp/benchmarks/common/nvtx_ranges.hpp new file mode 100644 index 00000000000..edcb0ab358c --- /dev/null +++ b/cpp/benchmarks/common/nvtx_ranges.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf::benchmark { +/** + * @brief Tag type for the NVTX domain + */ +struct benchmark_domain { + static constexpr char const* name{"benchmarks"}; ///< Name of the domain +}; + +} // namespace cudf::benchmark + +/** + * @brief Convenience macro for generating an NVTX range in the `benchmarks` domain + * from the lifetime of a function. + * + * Uses the name of the immediately enclosing function returned by `__func__` to + * name the range. + * + * Example: + * ``` + * void some_function(){ + * CUDF_BENCHMARK_RANGE(); + * ... + * } + * ``` + */ +#define CUDF_BENCHMARK_RANGE() NVTX3_FUNC_RANGE_IN(cudf::benchmark::benchmark_domain) diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 28063533f16..7bb333a4956 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -16,6 +16,8 @@ #include "utilities.hpp" +#include + #include #include #include @@ -115,7 +117,7 @@ struct q9_data { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const one = cudf::numeric_scalar(1); auto const one_minus_discount = @@ -147,7 +149,7 @@ struct q9_data { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); std::string udf = R"***( @@ -174,7 +176,7 @@ struct q9_data { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); cudf::ast::tree tree; cudf::table_view table{std::vector{discount, extendedprice, supplycost, quantity}}; @@ -245,7 +247,7 @@ q9_data load_data(std::unordered_map& source std::unique_ptr join_data(q9_data const& data) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Generating the `profit` table // Filter the part table using `p_name like '%green%'` diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 766d30bdfeb..074bc2d75f8 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -16,13 +16,12 @@ #include "utilities.hpp" -#include "common/ndsh_data_generator/ndsh_data_generator.hpp" -#include "common/table_utilities.hpp" -#include "cudf/detail/utilities/integer_utils.hpp" 
+#include +#include +#include #include #include -#include #include #include #include @@ -137,7 +136,7 @@ table_with_names& table_with_names::append(std::unique_ptr& col, cudf::table_view table_with_names::select(std::vector const& col_names) const { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); std::vector col_indices; for (auto const& col_name : col_names) { col_indices.push_back(column_id(col_name)); @@ -147,7 +146,7 @@ cudf::table_view table_with_names::select(std::vector const& col_na void table_with_names::to_parquet(std::string const& filepath) const { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const sink_info = cudf::io::sink_info(filepath); cudf::io::table_metadata metadata; metadata.schema_info = @@ -165,7 +164,7 @@ std::unique_ptr join_and_gather(cudf::table_view const& left_input, std::vector const& right_on, cudf::null_equality compare_nulls) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; auto const left_selected = left_input.select(left_on); auto const right_selected = right_input.select(right_on); @@ -200,7 +199,7 @@ std::unique_ptr apply_inner_join( std::vector const& right_on, cudf::null_equality compare_nulls) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); std::vector left_on_indices; std::vector right_on_indices; std::transform( @@ -230,7 +229,7 @@ std::unique_ptr apply_inner_join( std::unique_ptr apply_filter(std::unique_ptr const& table, cudf::ast::operation const& predicate) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const boolean_mask = cudf::compute_column(table->table(), predicate); auto result_table = cudf::apply_boolean_mask(table->table(), boolean_mask->view()); return std::make_unique(std::move(result_table), table->column_names()); @@ -239,7 +238,7 @@ std::unique_ptr apply_filter(std::unique_ptr std::unique_ptr apply_mask(std::unique_ptr const& table, std::unique_ptr const& mask) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto result_table = cudf::apply_boolean_mask(table->table(), mask->view()); return std::make_unique(std::move(result_table), table->column_names()); } @@ -247,7 +246,7 @@ std::unique_ptr apply_mask(std::unique_ptr c std::unique_ptr apply_groupby(std::unique_ptr const& table, groupby_context_t const& ctx) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const keys = table->select(ctx.keys); cudf::groupby::groupby groupby_obj(keys); std::vector result_column_names; @@ -291,7 +290,7 @@ std::unique_ptr apply_orderby(std::unique_ptr const& sort_keys, std::vector const& sort_key_orders) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); std::vector column_views; for (auto& key : sort_keys) { column_views.push_back(table->column(key)); @@ -305,7 +304,7 @@ std::unique_ptr apply_reduction(cudf::column_view const& colum cudf::aggregation::Kind const& agg_kind, std::string const& col_name) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const agg = cudf::make_sum_aggregation(); auto const result = cudf::reduce(column, *agg, column.type()); cudf::size_type const len = 1; @@ -322,7 +321,7 @@ std::unique_ptr read_parquet( std::vector const& columns, std::unique_ptr const& predicate) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto builder = cudf::io::parquet_reader_options_builder(source_info); if (!columns.empty()) { builder.columns(columns); } if (predicate) { builder.filter(*predicate); } @@ -358,7 +357,7 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, cuio_source_sink_pair& source) { - 
CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); auto const stream = cudf::get_default_stream(); // Prepare the table metadata @@ -410,7 +409,7 @@ void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, std::unordered_map& sources) { - CUDF_FUNC_RANGE(); + CUDF_BENCHMARK_RANGE(); // Set the memory resource to the managed pool auto old_mr = cudf::get_current_device_resource(); diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp index 82c43d8a5ee..70288bb1d2f 100644 --- a/cpp/benchmarks/ndsh/utilities.hpp +++ b/cpp/benchmarks/ndsh/utilities.hpp @@ -18,7 +18,6 @@ #include "io/cuio_common.hpp" -#include #include #include From 0c7588751fd332b52ba7b3ded0fc3f7d0c4f145f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 26 Aug 2025 15:18:40 -0500 Subject: [PATCH 212/366] rearrange dependencies.yaml, other small changes (#19794) Splitting some changes off of the CUDA 13 support PR (#19768) ... that has gotten too large to review. Contributes to https://github.com/rapidsai/build-planning/issues/208 * uses the new `[cu12, cu13]` extras added to `dask-cuda` for wheels: https://github.com/rapidsai/dask-cuda/pull/1536 * replaces hard-coding of CUDA major version in `pandas` diff script * moves `numba-cuda` floor from `>=0.19.0` to `>=0.19.1` * consolidates some dependency lists with unnecessary `cuda: "12.*"` filters Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/19794 --- ci/cudf_pandas_scripts/pandas-tests/diff.sh | 3 +- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/cudf/recipe.yaml | 4 +- dependencies.yaml | 144 ++++++++---------- python/cudf/pyproject.toml | 4 +- python/pylibcudf/pyproject.toml | 2 +- 7 files changed, 73 insertions(+), 88 deletions(-) diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh index aec1acd1539..f84776ad173 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh @@ -13,7 +13,8 @@ rapids-logger "Github job name: ${GH_JOB_NAME}" rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}" PY_VER="313" -PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" +PR_ARTIFACT=$(rapids-s3-path)cuda${RAPIDS_CUDA_MAJOR}_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json rapids-logger "Fetching latest available results from nightly" aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/cudf/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::].[Key]" --output text | tee s3_output.txt diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index ccf44072677..0140f536cc1 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -55,7 +55,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.19.0,<0.20.0a0 +- numba-cuda>=0.19.1,<0.20.0a0 - numba>=0.61.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index af6ca634e15..36d858a4957 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -56,7 +56,7 @@ 
dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.19.0,<0.20.0a0 +- numba-cuda>=0.19.1,<0.20.0a0 - numba>=0.61.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index 4b787a8178a..b89151fb266 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -55,7 +55,7 @@ requirements: - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - - numba-cuda >=0.19.0,<0.20.0a0 + - numba-cuda >=0.19.1,<0.20.0a0 - libcudf =${{ version }} - pylibcudf =${{ version }} - rmm =${{ minor_version }} @@ -70,7 +70,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.4.0dev0 - cupy >=12.0.0 - - numba-cuda >=0.19.0,<0.20.0a0 + - numba-cuda >=0.19.1,<0.20.0a0 # TODO: Revert to numba>=0.60.0,<0.62.0a0 once https://github.com/NVIDIA/numba-cuda/pull/403 is released. - numba >=0.61.0,<0.62.0a0 - numpy >=1.23,<3.0a0 diff --git a/dependencies.yaml b/dependencies.yaml index 6398698546d..614c4fbc395 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -14,9 +14,11 @@ files: - cuda - cuda_version - depends_on_cupy + - depends_on_dask_cuda - depends_on_libkvikio - depends_on_librmm - depends_on_libnvcomp + - depends_on_numba_cuda - depends_on_rapids_logger - depends_on_rmm - develop @@ -38,7 +40,6 @@ files: - test_python_common - test_python_cudf - test_python_cudf_common - - test_python_dask_cudf - test_python_pylibcudf - test_python_cudf_pandas - test_python_cudf_polars @@ -75,6 +76,7 @@ files: - depends_on_cudf - depends_on_pylibcudf - depends_on_libcudf + - depends_on_numba_cuda test_python_pylibcudf: output: none includes: @@ -85,6 +87,7 @@ files: - test_python_pylibcudf - depends_on_pylibcudf - depends_on_libcudf + - depends_on_numba_cuda test_python_other: output: none includes: @@ -93,11 +96,12 @@ files: - py_version - test_python_common - test_python_cudf_common - - test_python_dask_cudf - test_python_pylibcudf - depends_on_cudf + - depends_on_dask_cuda - depends_on_pylibcudf - depends_on_libcudf + - depends_on_numba_cuda - depends_on_dask_cudf - depends_on_cudf_kafka - depends_on_custreamz @@ -140,6 +144,7 @@ files: - cuda - cuda_version - depends_on_cudf + - depends_on_dask_cuda - depends_on_dask_cudf - depends_on_pylibcudf - depends_on_libcudf @@ -162,7 +167,7 @@ files: includes: - build_base - build_python_common - - build_python_cudf + - depends_on_numba_cuda - depends_on_pylibcudf - depends_on_libcudf - depends_on_librmm @@ -179,6 +184,7 @@ files: - pyarrow_run - depends_on_cupy - depends_on_libcudf + - depends_on_numba_cuda - depends_on_pylibcudf_pyarrow - depends_on_rmm py_test_cudf: @@ -275,6 +281,7 @@ files: key: test includes: - depends_on_cupy + - depends_on_numba_cuda - pyarrow_run - test_python_common - test_python_cudf_common @@ -325,6 +332,7 @@ files: table: project.optional-dependencies key: test includes: + - depends_on_dask_cuda - numpy_run - test_python_common - test_python_cudf_polars @@ -353,9 +361,9 @@ files: table: project.optional-dependencies key: test includes: + - depends_on_dask_cuda - test_python_common - test_python_cudf_common - - test_python_dask_cudf py_build_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka @@ -441,6 +449,7 @@ dependencies: - output_types: conda packages: - c-compiler + - cuda-nvcc - cxx-compiler - dlpack>=0.8,<1.0 - zlib>=1.2.13 @@ -449,22 +458,14 @@ dependencies: matrices: - matrix: arch: x86_64 - cuda: "12.*" packages: - gcc_linux-64=14.* - sysroot_linux-64==2.28 - matrix: arch: aarch64 - 
cuda: "12.*" packages: - gcc_linux-aarch64=14.* - sysroot_linux-aarch64==2.28 - - output_types: conda - matrices: - - matrix: - cuda: "12.*" - packages: - - cuda-nvcc build_cpp: common: - output_types: conda @@ -500,21 +501,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cython>=3.0.3 - build_python_cudf: - common: - - output_types: [conda] - packages: - - &numba_cuda numba-cuda>=0.19.0,<0.20.0a0 - specific: - - output_types: [requirements, pyproject] - matrices: - - matrix: - cuda: "12.*" - packages: - - &numba_cuda_cu12 numba-cuda[cu12]>=0.19.0,<0.20.0a0 - - matrix: # Fallback for no matrix - packages: - - *numba_cuda_cu12 numpy_run: common: - output_types: [conda, requirements, pyproject] @@ -559,24 +545,21 @@ dependencies: packages: - cuda-version=12.9 cuda: + common: + - output_types: [conda] + packages: + - cuda-cudart-dev + - cuda-nvrtc-dev + - cuda-nvtx-dev + - libcurand-dev + - libnvjitlink-dev specific: - - output_types: conda - matrices: - - matrix: - cuda: "12.*" - packages: - - cuda-cudart-dev - - cuda-nvrtc-dev - - cuda-nvtx-dev - - libcurand-dev - - libnvjitlink-dev - output_types: conda matrices: - matrix: arch: aarch64 packages: - matrix: - cuda: "12.*" arch: x86_64 packages: - libcufile-dev @@ -607,7 +590,6 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==25.10.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -690,9 +672,6 @@ dependencies: - packaging - rich - typing_extensions>=4.0.0 - - output_types: [conda] - packages: - - *numba_cuda - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -705,14 +684,6 @@ dependencies: packages: &run_cudf_packages_all_cu12 - cuda-python>=12.9.1,<13.0a0 - {matrix: null, packages: *run_cudf_packages_all_cu12} - - output_types: [requirements, pyproject] - matrices: - - matrix: {cuda: "12.*"} - packages: - - *numba_cuda_cu12 - - matrix: # Fallback for no matrix - packages: - - *numba_cuda_cu12 - output_types: [requirements, pyproject] matrices: - matrix: @@ -763,15 +734,7 @@ dependencies: - output_types: conda packages: - *cmake_ver - specific: - - output_types: conda - matrices: - - matrix: - cuda: "12.*" - packages: - - cuda-sanitizer-api - - matrix: # Fallback for no matrix - packages: + - cuda-sanitizer-api # packages we want in the 'test_cpp' group in 'files', for CI, but which # shouldn't be added to 'all' for building a development environment test_cpp_cudf: @@ -807,7 +770,7 @@ dependencies: # TODO: Revert to numba==0.60.0 once https://github.com/NVIDIA/numba-cuda/pull/403 is released. - numba==0.61.0 - pandas==2.0.* - - numba-cuda==0.19.0 + - numba-cuda==0.19.1 - matrix: {dependencies: "latest"} packages: - pandas==2.3.1 @@ -844,11 +807,9 @@ dependencies: - output_types: conda packages: - python-xxhash - - *numba_cuda - output_types: [pyproject, requirements] packages: - xxhash - - *numba_cuda_cu12 test_python_cudf: common: - output_types: [conda, requirements, pyproject] @@ -895,28 +856,10 @@ dependencies: - pytorch>=2.4.0 - matrix: packages: - test_python_dask_cudf: - common: - - output_types: [conda, requirements, pyproject] - packages: - - dask-cuda==25.10.*,>=0.0.0a0 - specific: - - output_types: [conda, requirements] - matrices: - - matrix: {dependencies: "oldest"} - packages: - - numpy==1.24.* - # pyarrow 14 is fine in some circumstances but we require pyarrow - # 15 in our CI tests in order to get a lz4-c that is compatible - # with cudf_kafka's dependencies. 
- - pyarrow==15.* - - matrix: - packages: test_python_cudf_polars: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==25.10.*,>=0.0.0a0 - rich test_python_narwhals: common: @@ -945,6 +888,21 @@ dependencies: packages: - libcudf-cu12==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*libcudf_unsuffixed]} + depends_on_numba_cuda: + common: + - output_types: [conda] + packages: + - numba-cuda>=0.19.1,<0.20.0a0 + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + packages: + - &numba_cuda_cu12 numba-cuda[cu12]>=0.19.1,<0.20.0a0 + - matrix: + packages: + - *numba_cuda_cu12 depends_on_pylibcudf: common: - output_types: conda @@ -1128,6 +1086,32 @@ dependencies: - nbformat - openpyxl - pytest-rerunfailures + depends_on_dask_cuda: + common: + - output_types: conda + packages: + - &dask_cuda_unsuffixed dask-cuda==25.10.*,>=0.0.0a0 + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - dask-cuda[cu12]==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - dask-cuda[cu13]==25.10.*,>=0.0.0a0 + - matrix: + packages: + - *dask_cuda_unsuffixed depends_on_dask_cudf: common: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 56915721e37..ece2dd3f07a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cupy-cuda12x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.10.*,>=0.0.0a0", - "numba-cuda[cu12]>=0.19.0,<0.20.0a0", + "numba-cuda[cu12]>=0.19.1,<0.20.0a0", "numba>=0.61.0,<0.62.0a0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", @@ -125,7 +125,7 @@ requires = [ "libcudf==25.10.*,>=0.0.0a0", "librmm==25.10.*,>=0.0.0a0", "ninja", - "numba-cuda[cu12]>=0.19.0,<0.20.0a0", + "numba-cuda[cu12]>=0.19.1,<0.20.0a0", "pylibcudf==25.10.*,>=0.0.0a0", "rmm==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index dd871c4bbdb..85fabaa14bf 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -44,7 +44,7 @@ test = [ "hypothesis>=6.131.7", "mmh3", "nanoarrow", - "numba-cuda[cu12]>=0.19.0,<0.20.0a0", + "numba-cuda[cu12]>=0.19.1,<0.20.0a0", "numba>=0.61.0,<0.62.0a0", "pandas", "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", From b5c29e079542f972b066ce0c02825ad81aa65577 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 26 Aug 2025 16:49:39 -0400 Subject: [PATCH 213/366] Update rapids-dependency-file-generator (#19796) This PR updates the rapids-dependency-file-generator hook to get https://github.com/rapidsai/dependency-file-generator/pull/163. 
Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/19796
---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9850bce2c95..e66e866c4f4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -158,10 +158,10 @@ repos:
       - id: verify-codeowners
         args: [--fix, --project-prefix=cudf]
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.19.1
+    rev: v1.20.0
     hooks:
       - id: rapids-dependency-file-generator
-        args: ["--clean"]
+        args: ["--clean", "--warn-all", "--strict"]
   - repo: https://github.com/shellcheck-py/shellcheck-py
     rev: v0.10.0.1
     hooks:

From aecf57636299731f47b46a6148d469dcc4ce6af7 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 27 Aug 2025 10:47:20 -0400
Subject: [PATCH 214/366] Some clarifications, improvements to
 GroupedRollingWindows in cudf-polars (#19776)

Follows up #19684. Adds clarifying comments to the implementation and
replaces the unnecessary sorting with a scatter, because we're doing a
`left_join`.

- Contributes to #18633

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/19776
---
 .../cudf_polars/dsl/expressions/rolling.py    | 67 +++++++-------
 .../tests/expressions/test_rolling.py         | 26 +++++++
 2 files changed, 52 insertions(+), 41 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
index f89d0e4133f..c01043d2d18 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
@@ -223,12 +223,11 @@ def do_evaluate(  # noqa: D102
         )  # pragma: no cover; translation raises first
 
         by_exprs = self.children[: self.by_count]
-        by_cols = list(
-            broadcast(
-                *(b.evaluate(df, context=ExecutionContext.FRAME) for b in by_exprs),
-                target_length=df.num_rows,
-            )
+        by_cols = broadcast(
+            *(b.evaluate(df) for b in by_exprs),
+            target_length=df.num_rows,
         )
+
         by_tbl = plc.Table([c.obj for c in by_cols])
 
         sorted_flag = (
@@ -253,13 +252,9 @@ def do_evaluate(  # noqa: D102
                 out_dtypes.append(val.dtype)
 
                 if isinstance(val, expr.Len):
-                    # Count rows per group via sum(1).
-                    ones = plc.Column.from_scalar(
-                        plc.Scalar.from_py(1, plc.DataType(plc.TypeId.INT8)), df.num_rows
-                    )
-                    gb_requests.append(
-                        plc.groupby.GroupByRequest(ones, [plc.aggregation.sum()])
-                    )
+                    # A count aggregation, we need a column so use a key column
+                    col = by_cols[0].obj
+                    gb_requests.append(plc.groupby.GroupByRequest(col, [val.agg_request]))
                 elif isinstance(val, expr.Agg):
                     (child,) = (
                         val.children if val.name != "quantile" else (val.children[0],)
@@ -270,42 +265,32 @@ def do_evaluate(  # noqa: D102
         group_keys_tbl, value_tables = grouper.aggregate(gb_requests)
         out_cols = (t.columns()[0] for t in value_tables)
 
-        # Build gather maps to broadcast per-group results to all rows.
-        # Also left-join input keys to group-keys so every input row appears exactly once.
-        lg, rg = plc.join.left_join(
+        # We do a left-join from the input keys to the group keys
+        # so every input row appears exactly once. left_order is
+        # returned un-ordered by libcudf.
+        left_order, right_order = plc.join.left_join(
             by_tbl, group_keys_tbl, plc.types.NullEquality.EQUAL
         )
-        # Reorder the gather maps to preserve left/input order
-        left_rows, right_rows = by_tbl.num_rows(), group_keys_tbl.num_rows()
-        init = plc.Scalar.from_py(0, plc.types.SIZE_TYPE)
-        step = plc.Scalar.from_py(1, plc.types.SIZE_TYPE)
-        left_order = plc.copying.gather(
-            plc.Table([plc.filling.sequence(left_rows, init, step)]),
-            lg,
-            plc.copying.OutOfBoundsPolicy.DONT_CHECK,
-        )
-        right_order = plc.copying.gather(
-            plc.Table([plc.filling.sequence(right_rows, init, step)]),
-            rg,
-            plc.copying.OutOfBoundsPolicy.NULLIFY,
+        # Scatter the right order indices into an all-null table
+        # at the positions given by the left order indices. Now we
+        # have the map between rows and groups with the correct ordering
+        left_rows = left_order.size()
+        target = plc.Column.from_scalar(
+            plc.Scalar.from_py(None, plc.types.SIZE_TYPE), left_rows
         )
-        # Sort both maps by (left_order, right_order), then use the reordered right map
-        # to gather group aggregates in the original row order.
-        _, rg = plc.sorting.stable_sort_by_key(
-            plc.Table([lg, rg]),
-            plc.Table([*left_order.columns(), *right_order.columns()]),
-            [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING],
-            [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER],
-        ).columns()
-
-        # Broadcast each aggregated result back to row-shape using the right map.
+        aligned_map = plc.copying.scatter(
+            plc.Table([right_order]),
+            left_order,
+            plc.Table([target]),
+        ).columns()[0]
+
+        # Broadcast each aggregated result back to row-shape using
+        # the aligned mapping between row indices and group indices
        broadcasted_cols = [
             Column(
                 plc.copying.gather(
-                    plc.Table([col]),
-                    rg,
-                    plc.copying.OutOfBoundsPolicy.NULLIFY,
+                    plc.Table([col]), aligned_map, plc.copying.OutOfBoundsPolicy.NULLIFY
                 ).columns()[0],
                 name=named_expr.name,
                 dtype=dtype,
diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py
index 6cf3c956358..2ec3abefc59 100644
--- a/python/cudf_polars/tests/expressions/test_rolling.py
+++ b/python/cudf_polars/tests/expressions/test_rolling.py
@@ -217,3 +217,29 @@ def test_over_with_mapping_strategy_unsupported(df, strategy):
 def test_over_boolean_function_unsupported(df):
     q = df.select(pl.col("x").not_().over("g"))
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_over_ternary(df):
+    q = df.select(
+        pl.when(pl.col("g") == 1)
+        .then(pl.lit(None, dtype=pl.Int64))
+        .otherwise(pl.col("x"))
+        .sum()
+        .over("g")
+    )
+
+    assert_gpu_result_equal(q)
+
+
+def test_over_broadcast_input_row_group_indices_aligned():
+    num_rows, num_groups = 512, 64
+
+    df = pl.LazyFrame(
+        {
+            "g": [(i * 31) % num_groups for i in range(num_rows)],
+            "x": list(range(num_rows)),
+        }
+    )
+    q = df.select(pl.col("x").sum().over("g"))
+
+    assert_gpu_result_equal(q)

From 9fa50b9858748761763e29feacd312d85d869d2c Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 27 Aug 2025 12:58:31 -0400
Subject: [PATCH 215/366] Improve support for sliced input on from_arrow_host
 APIs (#19491)

Improves support for accepting sliced ArrowArray input where the range
is beyond the 2 billion row limit. The length (number of rows) of the
sliced input must still be less than 2 billion, but the offset/length
range may reference indices above 2 billion.
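As a rough illustration of that contract (a hedged sketch, not code from
this patch; `input` stands for any Arrow C data interface array):

```cpp
// An Arrow slice is just an (offset, length) window over shared buffers.
// After this change the window may start past 2^31 rows, as long as the
// row count itself still fits in cudf::size_type. Release-callback
// ownership is ignored here for brevity.
ArrowArray sliced = *input;        // shallow copy; buffers are shared
sliced.offset += 3'000'000'000LL;  // offset/length range above 2^31 is OK
sliced.length = 1'000'000;         // but the length must stay below 2^31
```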
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/19491 --- cpp/src/interop/from_arrow_host.cu | 480 +++++++++++------- cpp/src/interop/from_arrow_host.hpp | 25 +- cpp/src/interop/from_arrow_host_strings.cu | 106 +--- cpp/tests/interop/from_arrow_host_test.cpp | 15 +- cpp/tests/interop/from_arrow_test.cpp | 2 +- .../cudf_polars/cudf_polars/testing/plugin.py | 1 - 6 files changed, 338 insertions(+), 291 deletions(-) diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 78a7bf0e864..b5584b80854 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -20,17 +20,16 @@ #include #include #include -#include -#include #include #include #include -#include -#include +#include +#include #include #include #include #include +#include #include #include #include @@ -39,6 +38,7 @@ #include #include #include +#include #include #include @@ -53,67 +53,129 @@ namespace detail { namespace { +/** + * @brief Return bitmask word at the given index in the source + * + * This is a 64-bit version of cudf::detail::get_mask_offset_word + * since the source may have a range more than max(int) bits. + */ +__device__ inline bitmask_type get_mask_word(bitmask_type const* __restrict__ source, + int64_t destination_word_index, + int64_t source_begin_bit, + int64_t source_end_bit) +{ + constexpr auto bitmask_bits = size_in_bits(); + auto const word_index = destination_word_index + (source_begin_bit / bitmask_bits); + auto const curr_word = source[word_index]; + auto const end_index = (source_end_bit - 1) / bitmask_bits; + auto const next_word = (end_index > word_index) ? source[word_index + 1] : bitmask_type{0}; + auto const shift = static_cast(source_begin_bit % bitmask_bits); + return __funnelshift_r(curr_word, next_word, shift); +} + +/** + * @brief Copy a shifted bitmask in device memory + * + * Called by get_mask_buffer below when a bit-shift within a bitmask_type is required. + * + * @param destination The destination bitmask. + * @param source The source bitmask. + * @param source_begin_bit The beginning bit of the source bitmask. + * @param source_end_bit The end bit of the source bitmask. + * @param number_of_mask_words The number of mask words. 
+ */ +CUDF_KERNEL void copy_shifted_bitmask(bitmask_type* __restrict__ destination, + bitmask_type const* __restrict__ source, + int64_t source_begin_bit, + int64_t source_end_bit, + size_type number_of_mask_words) +{ + auto const stride = cudf::detail::grid_1d::grid_stride(); + for (thread_index_type destination_word_index = grid_1d::global_thread_id(); + destination_word_index < number_of_mask_words; + destination_word_index += stride) { + destination[destination_word_index] = + detail::get_mask_word(source, destination_word_index, source_begin_bit, source_end_bit); + } +} + +// copies the bitmask to device and automatically applies the offset +std::pair, size_type> get_mask_buffer( + ArrowArray const* input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + auto bitmap = static_cast(input->buffers[validity_buffer_idx]); + if (bitmap == nullptr || input->null_count == 0) { + return {std::make_unique(0, stream, mr), 0}; + } + + constexpr auto bits_in_byte = static_cast(size_in_bits()); + + auto const num_rows = static_cast(input->length); + auto const offset_index = input->offset / bits_in_byte; + auto const mask_words = num_bitmask_words(num_rows); + auto const padded_words = bitmask_allocation_size_bytes(num_rows) / sizeof(bitmask_type); + auto const bit_index = input->offset % bits_in_byte; + auto const copy_size = cudf::util::div_rounding_up_safe(num_rows + bit_index, bits_in_byte); + + auto mask = rmm::device_uvector(padded_words, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync( + mask.data(), bitmap + offset_index, copy_size, cudaMemcpyDefault, stream.value())); + + if (mask_words > 0 && bit_index > 0) { + auto dest_mask = rmm::device_uvector(padded_words, stream, mr); + cudf::detail::grid_1d config(mask_words, 256); + copy_shifted_bitmask<<>>( + dest_mask.data(), mask.data(), bit_index, bit_index + num_rows, mask_words); + CUDF_CHECK_CUDA(stream.value()); + mask = std::move(dest_mask); + } + + auto const null_count = + mask_words > 0 ? 
cudf::detail::count_unset_bits(mask.data(), 0, num_rows, stream) : 0; + + return {std::make_unique(std::move(mask.release())), null_count}; +} + +std::unique_ptr get_column_copy(ArrowSchemaView const* schema, + ArrowArray const* input, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + struct dispatch_copy_from_arrow_host { rmm::cuda_stream_view stream; rmm::device_async_resource_ref mr; - std::unique_ptr get_mask_buffer(ArrowArray const* array) - { - auto* bitmap = array->buffers[validity_buffer_idx]; - if (bitmap == nullptr) { return std::make_unique(0, stream, mr); } - - auto const bitmask_size = array->length + array->offset; - auto const allocation_size = - bitmask_allocation_size_bytes(static_cast(bitmask_size)); - auto mask = std::make_unique(allocation_size, stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(), - reinterpret_cast(bitmap), - allocation_size, - cudaMemcpyDefault, - stream.value())); - return mask; - } - template () && !is_fixed_point())> - std::unique_ptr operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool) + std::unique_ptr operator()(ArrowSchemaView const*, ArrowArray const*, data_type, bool) { CUDF_FAIL("Unsupported type in copy_from_arrow_host."); } template () || is_fixed_point())> - std::unique_ptr operator()(ArrowSchemaView* schema, + std::unique_ptr operator()(ArrowSchemaView const*, ArrowArray const* input, data_type type, bool skip_mask) { using DeviceType = device_storage_type_t; - size_type const num_rows = input->length; - size_type const offset = input->offset; - size_type const null_count = input->null_count; - auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; + auto const num_rows = static_cast(input->length); + auto const data_buffer = + static_cast(input->buffers[fixed_width_data_buffer_idx]); - auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); auto mutable_column_view = col->mutable_view(); - CUDF_CUDA_TRY( - cudaMemcpyAsync(mutable_column_view.data(), - reinterpret_cast(data_buffer) + offset * sizeof(DeviceType), - sizeof(DeviceType) * num_rows, - cudaMemcpyDefault, - stream.value())); - - if (has_nulls) { - auto tmp_mask = get_mask_buffer(input); - - // if array is sliced, we have to copy the whole mask and then take copy - auto out_mask = - (offset == 0) - ? 
std::move(*tmp_mask) - : cudf::detail::copy_bitmask( - static_cast(tmp_mask->data()), offset, offset + num_rows, stream, mr); - - col->set_null_mask(std::move(out_mask), null_count); + CUDF_CUDA_TRY(cudaMemcpyAsync(mutable_column_view.data(), + data_buffer + input->offset, + sizeof(DeviceType) * num_rows, + cudaMemcpyDefault, + stream.value())); + + if (!skip_mask) { + auto [mask, null_count] = get_mask_buffer(input, stream, mr); + col->set_null_mask(std::move(*mask), null_count); } return col; @@ -121,35 +183,39 @@ struct dispatch_copy_from_arrow_host { }; template <> -std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSchemaView* schema, +std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSchemaView const*, ArrowArray const* input, data_type type, bool skip_mask) { - auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; - auto const buffer_length = bitmask_allocation_size_bytes(input->length + input->offset); - - auto data = rmm::device_buffer(buffer_length, stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), - reinterpret_cast(data_buffer), - buffer_length, - cudaMemcpyDefault, - stream.value())); - auto out_col = mask_to_bools(static_cast(data.data()), - input->offset, - input->offset + input->length, - stream, - mr); - - auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; - if (has_nulls) { - auto out_mask = detail::copy_bitmask(static_cast(get_mask_buffer(input)->data()), - input->offset, - input->offset + input->length, - stream, - mr); - - out_col->set_null_mask(std::move(out_mask), input->null_count); + auto data_buffer = static_cast(input->buffers[fixed_width_data_buffer_idx]); + + constexpr auto bits_in_byte = static_cast(size_in_bits()); + + auto const num_rows = static_cast(input->length); + auto const offset_index = input->offset / bits_in_byte; + auto const data_words = num_bitmask_words(num_rows); + auto const bit_index = input->offset % bits_in_byte; + auto const copy_size = cudf::util::div_rounding_up_safe(num_rows + bit_index, bits_in_byte); + + auto data = rmm::device_uvector(data_words, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync( + data.data(), data_buffer + offset_index, copy_size, cudaMemcpyDefault, stream.value())); + + if (data_words > 0 && bit_index > 0) { + auto dest_data = rmm::device_uvector(data_words, stream, mr); + cudf::detail::grid_1d config(data_words, 256); + copy_shifted_bitmask<<>>( + dest_data.data(), data.data(), bit_index, bit_index + num_rows, data_words); + CUDF_CHECK_CUDA(stream.value()); + data = std::move(dest_data); + } + + auto out_col = mask_to_bools(static_cast(data.data()), 0, num_rows, stream, mr); + + if (!skip_mask) { + auto [out_mask, null_count] = get_mask_buffer(input, stream, mr); + out_col->set_null_mask(std::move(*out_mask), null_count); } return out_col; @@ -157,15 +223,23 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSch template <> std::unique_ptr dispatch_copy_from_arrow_host::operator()( - ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) + ArrowSchemaView const* schema, ArrowArray const* input, data_type type, bool skip_mask) { + CUDF_EXPECTS( + input->length + 1 <= static_cast(std::numeric_limits::max()), + "number of rows in Arrow column exceeds the column size limit", + std::overflow_error); + if (input->length == 0) { return make_empty_column(type_id::STRING); } - return string_column_from_arrow_host(schema, input, get_mask_buffer(input), stream, mr); + auto [mask, null_count] = !skip_mask 
+ ? get_mask_buffer(input, stream, mr) + : std::pair{std::make_unique(0, stream, mr), 0}; + return string_column_from_arrow_host(schema, input, std::move(mask), null_count, stream, mr); } template <> std::unique_ptr dispatch_copy_from_arrow_host::operator()( - ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) + ArrowSchemaView const* schema, ArrowArray const* input, data_type type, bool skip_mask) { ArrowSchemaView keys_schema_view; NANOARROW_THROW_NOT_OK( @@ -186,7 +260,7 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()release(); indices_column = std::make_unique(dict_indices_type, @@ -203,133 +277,94 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator() std::unique_ptr dispatch_copy_from_arrow_host::operator()( - ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) + ArrowSchemaView const* schema, ArrowArray const* input, data_type type, bool skip_mask) { std::vector> child_columns; - std::transform( - input->children, - input->children + input->n_children, - schema->schema->children, - std::back_inserter(child_columns), - [this, input](ArrowArray const* child, ArrowSchema const* child_schema) { - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); - auto type = arrow_to_cudf_type(&view); - - auto out = get_column_copy(&view, child, type, false, stream, mr); - return input->offset == 0 && input->length == out->size() - ? std::move(out) - : std::make_unique( - cudf::detail::slice(out->view(), - static_cast(input->offset), - static_cast(input->offset + input->length), - stream), - stream, - mr); - }); - - auto out_mask = std::move(*(get_mask_buffer(input))); - if (input->buffers[validity_buffer_idx] != nullptr) { - out_mask = detail::copy_bitmask(static_cast(out_mask.data()), - input->offset, - input->offset + input->length, - stream, - mr); - } + std::transform(input->children, + input->children + input->n_children, + schema->schema->children, + std::back_inserter(child_columns), + [this, input](ArrowArray const* child, ArrowSchema const* child_schema) { + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); + auto child_type = arrow_to_cudf_type(&view); + + ArrowArray child_array(*child); + child_array.offset += input->offset; + child_array.length = std::min(input->length, child_array.length); + + return get_column_copy(&view, &child_array, child_type, false, stream, mr); + }); + + auto [out_mask, null_count] = + !skip_mask ? 
get_mask_buffer(input, stream, mr) + : std::pair{std::make_unique(0, stream, mr), 0}; return make_structs_column( - input->length, std::move(child_columns), input->null_count, std::move(out_mask), stream, mr); + input->length, std::move(child_columns), null_count, std::move(*out_mask), stream, mr); } template <> std::unique_ptr dispatch_copy_from_arrow_host::operator()( - ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) + ArrowSchemaView const* schema, ArrowArray const* input, data_type type, bool skip_mask) { - // Initialize schema for 32-bit ints regardless of list type - nanoarrow::UniqueSchema offset_schema; - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); - - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); - - CUDF_EXPECTS(input->length + input->offset + 1 <= - static_cast(std::numeric_limits::max()), - "Total number of rows in Arrow column exceeds the column size limit.", - std::overflow_error); - size_type const physical_length = input->length + input->offset + 1; - - auto offsets_column = [&] { - void const* offsets_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; - ArrowArray offsets_array = { - .length = physical_length, - .null_count = 0, - .offset = 0, - .n_buffers = 2, - .n_children = 0, - .buffers = offsets_buffers, - }; - - if (schema->type != NANOARROW_TYPE_LARGE_LIST) { - return this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); - } + CUDF_EXPECTS( + input->length + 1 <= static_cast(std::numeric_limits::max()), + "number of rows in Arrow column exceeds the column size limit", + std::overflow_error); - // For large lists, convert 64-bit offsets to 32-bit on host with bounds checking - int64_t const* large_offsets = - reinterpret_cast(input->buffers[fixed_width_data_buffer_idx]); + auto [offsets_column, offset, length] = get_offsets_column(schema, input, stream, mr); - std::vector int32_offsets(physical_length); - constexpr auto max_offset = static_cast(std::numeric_limits::max()); - CUDF_EXPECTS(large_offsets[physical_length - 1] <= max_offset, - "Large list offsets exceed 32-bit integer bounds", - std::overflow_error); + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema->schema->children[0], nullptr)); + auto child_type = arrow_to_cudf_type(&view); - std::transform( - large_offsets, large_offsets + physical_length, int32_offsets.begin(), [](int64_t offset) { - return static_cast(offset); - }); + ArrowArray child_array(*input->children[0]); + child_array.offset += offset; + child_array.length = std::min(length, child_array.length); - offsets_buffers[1] = int32_offsets.data(); + auto child_column = get_column_copy(&view, &child_array, child_type, skip_mask, stream, mr); - return this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); - }(); + auto [out_mask, null_count] = + !skip_mask ? 
get_mask_buffer(input, stream, mr) + : std::pair{std::make_unique(0, stream, mr), 0}; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema->schema->children[0], nullptr)); - auto child_type = arrow_to_cudf_type(&view); - auto child_column = get_column_copy(&view, input->children[0], child_type, false, stream, mr); - - auto const num_rows = offsets_column->size() - 1; - auto out_col = make_lists_column(num_rows, - std::move(offsets_column), - std::move(child_column), - input->null_count, - std::move(*get_mask_buffer(input)), - stream, - mr); - - return num_rows == input->length - ? std::move(out_col) - : std::make_unique( - cudf::detail::slice(out_col->view(), - static_cast(input->offset), - static_cast(input->offset + input->length), - stream), - stream, - mr); + return make_lists_column(static_cast(input->length), + std::move(offsets_column), + std::move(child_column), + null_count, + std::move(*out_mask), + stream, + mr); } -} // namespace - -std::unique_ptr get_column_copy(ArrowSchemaView* schema, +/** + * @brief Convert ArrowArray to cudf column utility + * + * This function is simply a convenience wrapper around the dispatch functor with + * some extra handling to avoid having to reproduce it for all of the nested types. + * It also allows us to centralize the location where the recursive calls happen + * so that we only need to forward declare this one function, rather than multiple + * functions which handle the overloads for nested types (list, struct, etc.) + * + * @param schema Arrow schema includes the column type + * @param input Column data, nulls, offset + * @param type The cudf column type to map input to + * @param skip_mask True if the mask is handled by the caller + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource to use for all device memory allocations + */ +std::unique_ptr get_column_copy(ArrowSchemaView const* schema, ArrowArray const* input, data_type type, bool skip_mask, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS((input->length + input->offset) <= - static_cast(std::numeric_limits::max()), - "Total number of rows in Arrow column exceeds the column size limit.", - std::overflow_error); + CUDF_EXPECTS( + input->length <= static_cast(std::numeric_limits::max()), + "number of rows in Arrow column exceeds the column size limit", + std::overflow_error); return type.id() != type_id::EMPTY ? 
std::move(type_dispatcher( @@ -341,6 +376,91 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, input->length); } +/** + * @brief Utility to copy and normalize the offsets in the given array + */ +template +std::tuple, int64_t, int64_t> copy_offsets_column( + ArrowSchemaView const* schema, + ArrowArray const* offsets, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto offsets_buffer = + static_cast(offsets->buffers[fixed_width_data_buffer_idx]); + auto const offset = offsets_buffer[offsets->offset]; + auto const length = offsets_buffer[offsets->offset + offsets->length - 1] - offset; + + // dispatch directly since we know the type + auto result = dispatch_copy_from_arrow_host{stream, mr}.template operator()( + schema, offsets, data_type{type_to_id()}, true); + if (offset != 0) { + auto begin = result->mutable_view().template begin(); + auto end = begin + offsets->length; + thrust::transform( + rmm::exec_policy_nosync(stream), begin, end, begin, [offset] __device__(auto o) { + return o - offset; + }); + } + return std::tuple{std::move(result), offset, length}; +} + +} // namespace + +/** + * @brief Utility to copy the offsets from the given input (strings or list) to a + * cudf column + */ +std::tuple, int64_t, int64_t> get_offsets_column( + ArrowSchemaView const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + void const* offsets_buffer = input->buffers[fixed_width_data_buffer_idx]; + void const* offsets_buffers[2] = {nullptr, offsets_buffer}; + ArrowArray offsets_array = { + .length = input->length + 1, + .null_count = 0, + .offset = input->offset, + .n_buffers = 2, + .n_children = 0, + .buffers = offsets_buffers, + }; + + if (schema->type == NANOARROW_TYPE_STRING || schema->type == NANOARROW_TYPE_LIST) { + return copy_offsets_column(schema, &offsets_array, stream, mr); + } + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { + return copy_offsets_column(schema, &offsets_array, stream, mr); + } + + CUDF_EXPECTS(schema->type == NANOARROW_TYPE_LARGE_LIST, "Unknown offsets parent type"); + + // Large-lists must be copied to int32 column + auto int32_offsets = std::vector(); + int32_offsets.reserve(input->length + 1); + auto int64_offsets = static_cast(offsets_buffer); + auto const offset = int64_offsets[input->offset]; + auto const length = int64_offsets[input->offset + input->length] - offset; + + constexpr auto max_offset = static_cast(std::numeric_limits::max()); + CUDF_EXPECTS( + length <= max_offset, "large list offsets exceed 32-bit integer bounds", std::overflow_error); + + // normalize the offsets while copying from int64 to int32 + std::transform(int64_offsets + input->offset, + int64_offsets + input->offset + input->length + 1, + std::back_inserter(int32_offsets), + [offset](int64_t o) { return static_cast(o - offset); }); + + offsets_buffers[fixed_width_data_buffer_idx] = int32_offsets.data(); + offsets_array.offset = 0; // already accounted for by the above transform + auto result = dispatch_copy_from_arrow_host{stream, mr}.template operator()( + schema, &offsets_array, data_type(type_id::INT32), true); + return std::tuple{std::move(result), offset, length}; +} + std::unique_ptr
from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, @@ -396,7 +516,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, return get_column_copy(&view, &input->array, type, false, stream, mr); } -std::unique_ptr get_column_from_host_copy(ArrowSchemaView* schema, +std::unique_ptr get_column_from_host_copy(ArrowSchemaView const* schema, ArrowArray const* input, data_type type, bool skip_mask, diff --git a/cpp/src/interop/from_arrow_host.hpp b/cpp/src/interop/from_arrow_host.hpp index 9307bf8c58d..7b88ae3818a 100644 --- a/cpp/src/interop/from_arrow_host.hpp +++ b/cpp/src/interop/from_arrow_host.hpp @@ -32,37 +32,32 @@ namespace detail { * @param schema Arrow schema includes the column type * @param input Column data, nulls, offset * @param mask Mask to apply to the output column + * @param null_count Number of nulls in mask * @param stream CUDA stream used for device memory operations * @param mr Device memory resource to use for all device memory allocations */ -std::unique_ptr string_column_from_arrow_host(ArrowSchemaView* schema, +std::unique_ptr string_column_from_arrow_host(ArrowSchemaView const* schema, ArrowArray const* input, std::unique_ptr&& mask, + size_type null_count, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @brief Convert ArrowArray to cudf column utility + * @brief Create offsets column for list or strings column * - * This function is simply a convenience wrapper around the dispatch functor with - * some extra handling to avoid having to reproduce it for all of the nested types. - * It also allows us to centralize the location where the recursive calls happen - * so that we only need to forward declare this one function, rather than multiple - * functions which handle the overloads for nested types (list, struct, etc.) 
* * @param schema Arrow schema includes the column type * @param input Column data, nulls, offset - * @param type The cudf column type to map input to - * @param skip_mask True if the mask is handled by the caller * @param stream CUDA stream used for device memory operations * @param mr Device memory resource to use for all device memory allocations + * @return Column plus offset and size bounds for copying data column */ -std::unique_ptr get_column_copy(ArrowSchemaView* schema, - ArrowArray const* input, - data_type type, - bool skip_mask, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::tuple, int64_t, int64_t> get_offsets_column( + ArrowSchemaView const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/from_arrow_host_strings.cu b/cpp/src/interop/from_arrow_host_strings.cu index d11197f243a..03810c1398e 100644 --- a/cpp/src/interop/from_arrow_host_strings.cu +++ b/cpp/src/interop/from_arrow_host_strings.cu @@ -47,99 +47,34 @@ namespace cudf { namespace detail { - namespace { constexpr int chars_buffer_idx = 2; -std::unique_ptr from_arrow_string(ArrowSchemaView* schema, +std::unique_ptr from_arrow_string(ArrowSchemaView const* schema, ArrowArray const* input, std::unique_ptr&& mask, + size_type null_count, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - // offsets column should contain no nulls so we can put nullptr for the bitmask - // nulls are tracked in the parent string column itself, not in the offsets - void const* offset_buffers[] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; - ArrowArray offsets_array = { - .length = input->offset + input->length + 1, - .null_count = 0, - .offset = 0, - .n_buffers = 2, - .n_children = 0, - .buffers = offset_buffers, - }; - - // chars_column does not contain any nulls, they are tracked by the parent string column - // itself instead. So we pass nullptr for the validity bitmask. - int64_t const char_data_length = [&]() { - if (schema->type == NANOARROW_TYPE_LARGE_STRING) { - return reinterpret_cast(offset_buffers[1])[input->length + input->offset]; - } else if (schema->type == NANOARROW_TYPE_STRING) { - return static_cast( - reinterpret_cast(offset_buffers[1])[input->length + input->offset]); - } else { - CUDF_FAIL("Unsupported string type", cudf::data_type_error); - } - }(); - void const* char_buffers[] = {nullptr, input->buffers[chars_buffer_idx]}; - ArrowArray char_array = { - .length = char_data_length, - .null_count = 0, - .offset = 0, - .n_buffers = 2, - .n_children = 0, - .buffers = char_buffers, - }; - - nanoarrow::UniqueSchema offset_schema; - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); - - nanoarrow::UniqueSchema char_data_schema; - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(char_data_schema.get(), NANOARROW_TYPE_INT8)); - - // leverage the dispatch overloads for int32 and char(int8) to generate the child - // offset and char data columns for us. 
- ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); - auto offsets_column = [&]() { - if (schema->type == NANOARROW_TYPE_LARGE_STRING) { - return get_column_copy(&view, &offsets_array, data_type(type_id::INT64), true, stream, mr); - } else if (schema->type == NANOARROW_TYPE_STRING) { - return get_column_copy(&view, &offsets_array, data_type(type_id::INT32), true, stream, mr); - } else { - CUDF_FAIL("Unsupported string type", cudf::data_type_error); - } - }(); - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr)); + auto [offsets_column, offset, char_data_length] = get_offsets_column(schema, input, stream, mr); rmm::device_buffer chars(char_data_length, stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), - reinterpret_cast(char_array.buffers[1]), - chars.size(), - cudaMemcpyDefault, - stream.value())); - auto const num_rows = offsets_column->size() - 1; - auto out_col = make_strings_column(num_rows, - std::move(offsets_column), - std::move(chars), - input->null_count, - std::move(*mask.release())); - - return input->offset == 0 - ? std::move(out_col) - : std::make_unique( - cudf::detail::slice(out_col->view(), - static_cast(input->offset), - static_cast(input->offset + input->length), - stream), - stream, - mr); + auto const* chars_data = static_cast(input->buffers[chars_buffer_idx]) + offset; + CUDF_CUDA_TRY( + cudaMemcpyAsync(chars.data(), chars_data, chars.size(), cudaMemcpyDefault, stream.value())); + + return make_strings_column(static_cast(input->length), + std::move(offsets_column), + std::move(chars), + null_count, + std::move(*mask.release())); } constexpr int stringview_vector_idx = 1; -std::unique_ptr from_arrow_stringview(ArrowSchemaView* schema, +std::unique_ptr from_arrow_stringview(ArrowSchemaView const* schema, ArrowArray const* input, std::unique_ptr&& mask, rmm::cuda_stream_view stream, @@ -152,10 +87,8 @@ std::unique_ptr from_arrow_stringview(ArrowSchemaView* schema, // first copy stringview array to device auto items = view.buffer_views[stringview_vector_idx].data.as_binary_view; auto d_items = rmm::device_uvector(input->length, stream, mr); - // caller ensures that input->offset is < max size_type - auto const offset = static_cast(input->offset); CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items + offset, + items + input->offset, input->length * sizeof(ArrowBinaryView), cudaMemcpyDefault, stream.value())); @@ -183,10 +116,8 @@ std::unique_ptr from_arrow_stringview(ArrowSchemaView* schema, thrust::counting_iterator(0), thrust::counting_iterator(input->length), d_indices.begin(), - [d_items = d_items.data(), d_ptrs, d_mask, offset] __device__(auto idx) -> string_index_pair { - if (d_mask && !cudf::bit_is_set(d_mask, idx + offset)) { - return string_index_pair{nullptr, 0}; - } + [d_items = d_items.data(), d_ptrs, d_mask] __device__(auto idx) -> string_index_pair { + if (d_mask && !bit_is_set(d_mask, idx)) { return string_index_pair{nullptr, 0}; } auto const& item = d_items[idx]; auto const size = static_cast(item.inlined.size); auto const data = (size <= NANOARROW_BINARY_VIEW_INLINE_SIZE) @@ -200,15 +131,16 @@ std::unique_ptr from_arrow_stringview(ArrowSchemaView* schema, } // namespace -std::unique_ptr string_column_from_arrow_host(ArrowSchemaView* schema, +std::unique_ptr string_column_from_arrow_host(ArrowSchemaView const* schema, ArrowArray const* input, std::unique_ptr&& mask, + size_type null_count, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { 
return schema->type == NANOARROW_TYPE_STRING_VIEW ? from_arrow_stringview(schema, input, std::move(mask), stream, mr) - : from_arrow_string(schema, input, std::move(mask), stream, mr); + : from_arrow_string(schema, input, std::move(mask), null_count, stream, mr); } } // namespace detail diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index e4f168f9c82..f94006517cb 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -363,7 +363,7 @@ TYPED_TEST(FromArrowHostDeviceTestDecimalsTest, FixedPointTableNulls) // converting arrow host memory to cudf table gives us the expected table auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, got_cudf_table->view()); // converting to a cudf table with a single struct column gives us the expected // result column @@ -421,7 +421,7 @@ TYPED_TEST(FromArrowHostDeviceTestDecimalsTest, FixedPointTableLargeNulls) // converting arrow host memory to cudf table gives us the expected table auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, got_cudf_table->view()); // converting to a cudf table with a single struct column gives us the expected // result column @@ -430,7 +430,7 @@ TYPED_TEST(FromArrowHostDeviceTestDecimalsTest, FixedPointTableLargeNulls) auto got_cudf_col_view = got_cudf_col->view(); cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; - CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(got_cudf_table->view(), from_struct); } } @@ -499,7 +499,7 @@ TEST_F(FromArrowHostDeviceTest, NestedList) // converting from arrow host memory to cudf gives us the expected table auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_table_view, got_cudf_table->view()); // converting to a single column cudf table gives us the expected struct column auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); @@ -663,7 +663,7 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) // test we get the expected cudf::table from the arrow host memory data auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_table_view, got_cudf_table->view()); // test we get the expected cudf struct column auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); @@ -851,6 +851,7 @@ TEST_P(FromArrowHostDeviceTestSlice, SliceTest) input.device_type = ARROW_DEVICE_CPU; auto got_cudf_table = cudf::from_arrow_host(schema.get(), &input); + if (got_cudf_table->num_rows() == 0 and sliced_cudf_table.num_rows() == 0) { CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table->view()); @@ -861,14 +862,14 @@ TEST_P(FromArrowHostDeviceTestSlice, SliceTest) got_cudf_col_view.child_end())}; CUDF_TEST_EXPECT_TABLES_EQUIVALENT(got_cudf_table->view(), from_struct); } else { - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table->view()); + 
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table->view());
     auto got_cudf_col = cudf::from_arrow_host_column(schema.get(), &input);
     EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
     auto got_cudf_col_view = got_cudf_col->view();
     cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(),
                                              got_cudf_col_view.child_end())};
-    CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(got_cudf_table->view(), from_struct);
   }
 }
 
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index 400d3852d5a..0fcf4b53b8a 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -444,7 +444,7 @@ TEST_P(FromArrowTestSlice, SliceTest)
   if (got_cudf_table.value()->num_rows() == 0 and expected_cudf_table.num_rows() == 0) {
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table.value()->view());
   } else {
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table.value()->view());
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table.value()->view());
   }
 }
 
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index 84ef9d9234f..1815f9dffea 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -163,7 +163,6 @@ def pytest_configure(config: pytest.Config) -> None:
     "tests/unit/datatypes/test_struct.py::test_struct_agg_all": "Needs nested list[struct] support",
     "tests/unit/constructors/test_structs.py::test_constructor_non_strict_schema_17956": "Needs nested list[struct] support",
     "tests/unit/io/test_delta.py::test_read_delta_arrow_map_type": "Needs nested list[struct] support",
-    "tests/unit/lazyframe/test_collect_schema.py::test_collect_schema_parametric": "https://github.com/pola-rs/polars/issues/23214",
     "tests/unit/datatypes/test_struct.py::test_struct_null_cast": "pylibcudf.Scalar does not support struct scalars",
     "tests/unit/datatypes/test_struct.py::test_struct_outer_nullability_zip_18119": "pylibcudf.Scalar does not support struct scalars",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[True-columns]": "allow_missing_columns argument in read_parquet not translated in IR",

From 6dc52ce33384de79456744e189d68ef618b907ae Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Wed, 27 Aug 2025 14:32:45 -0400
Subject: [PATCH 216/366] Update identify_stream_usage CUDA runtime hooks to
 CUDA 13 (#19807)

The kernel-launch and memcpyAsync APIs have changed, and the hooks need
updates to work properly.
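For context, a hedged sketch (not part of this patch) of the signature
change the hooks have to accommodate, using cudaMemPrefetchAsync as the
example: CUDA 13 replaces the destination device ordinal with a
cudaMemLocation plus a flags argument.

```cpp
#include <cuda_runtime.h>

// Prefetch `bytes` of managed memory to `device`, handling both ABIs.
// The CUDA 13 branch mirrors the signature hooked by this patch.
void prefetch(void const* ptr, size_t bytes, int device, cudaStream_t stream)
{
#if CUDART_VERSION >= 13000
  cudaMemLocation loc{};
  loc.type = cudaMemLocationTypeDevice;
  loc.id   = device;
  cudaMemPrefetchAsync(ptr, bytes, loc, /*flags=*/0, stream);
#else
  cudaMemPrefetchAsync(ptr, bytes, device, stream);
#endif
}
```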
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19807 --- cpp/tests/utilities/identify_stream_usage.cpp | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/cpp/tests/utilities/identify_stream_usage.cpp b/cpp/tests/utilities/identify_stream_usage.cpp index 93f4ee3e4a4..69dc1c4430b 100644 --- a/cpp/tests/utilities/identify_stream_usage.cpp +++ b/cpp/tests/utilities/identify_stream_usage.cpp @@ -209,6 +209,40 @@ DEFINE_OVERLOAD(cudaLaunchKernel, size_t sharedMem, cudaStream_t stream), ARG(func, gridDim, blockDim, args, sharedMem, stream)); + +#if CUDART_VERSION >= 13000 +// We need to define the __cudaLaunchKernel ABI as +// it isn't part of cuda_runtime.h when compiling as a C++ source +extern "C" cudaError_t CUDARTAPI __cudaLaunchKernel(cudaKernel_t kernel, + dim3 gridDim, + dim3 blockDim, + void** args, + size_t sharedMem, + cudaStream_t stream); +extern "C" cudaError_t CUDARTAPI __cudaLaunchKernel_ptsz(cudaKernel_t kernel, + dim3 gridDim, + dim3 blockDim, + void** args, + size_t sharedMem, + cudaStream_t stream); +DEFINE_OVERLOAD(__cudaLaunchKernel, + ARG(cudaKernel_t kernel, + dim3 gridDim, + dim3 blockDim, + void** args, + size_t sharedMem, + cudaStream_t stream), + ARG(kernel, gridDim, blockDim, args, sharedMem, stream)); +DEFINE_OVERLOAD(__cudaLaunchKernel_ptsz, + ARG(cudaKernel_t kernel, + dim3 gridDim, + dim3 blockDim, + void** args, + size_t sharedMem, + cudaStream_t stream), + ARG(kernel, gridDim, blockDim, args, sharedMem, stream)); +#endif + DEFINE_OVERLOAD(cudaLaunchCooperativeKernel, ARG(void const* func, dim3 gridDim, @@ -223,9 +257,16 @@ DEFINE_OVERLOAD(cudaLaunchHostFunc, // Memory transfer APIS: // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY +#if CUDART_VERSION >= 13000 +DEFINE_OVERLOAD( + cudaMemPrefetchAsync, + ARG(void const* devPtr, size_t count, cudaMemLocation loc, int flags, cudaStream_t stream), + ARG(devPtr, count, loc, flags, stream)); +#else DEFINE_OVERLOAD(cudaMemPrefetchAsync, ARG(void const* devPtr, size_t count, int dstDevice, cudaStream_t stream), ARG(devPtr, count, dstDevice, stream)); +#endif DEFINE_OVERLOAD(cudaMemcpy2DAsync, ARG(void* dst, size_t dpitch, From f3632e0405e76ff852a55dc20ed91ab17f90bdcf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 27 Aug 2025 14:00:55 -0500 Subject: [PATCH 217/366] Filter pandas warning in dask_cudf test (#19808) This fixes a CI failure (observed at https://github.com/rapidsai/cudf/actions/runs/17267746615/job/49006444548?pr=19793#step:10:1636) coming from a pandas warning triggered by dask.dataframe's usage of pandas. We'll filter it here and fix it upstream. Authors: - Tom Augspurger (https://github.com/TomAugspurger) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19808 --- python/dask_cudf/dask_cudf/tests/test_groupby.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 6efb7cf1562..4408c0394d8 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,5 +1,7 @@ # Copyright (c) 2021-2025, NVIDIA CORPORATION. 
+import warnings + import numpy as np import pandas as pd import pytest @@ -623,9 +625,14 @@ def test_groupby_agg_params( 1 if split_out == "use_dask_default" else split_out ) - # Compute for easier multiindex handling - gf = gr.compute() - pf = pr.compute() + with warnings.catch_warnings(): + # dask<=2025.7.0 uses a deprecated "grouper" attribute + # in some of these computations. We'll silence the warning + # here and fix it upstream. + warnings.filterwarnings("ignore", category=FutureWarning) + # Compute for easier multiindex handling + gf = gr.compute() + pf = pr.compute() # Reset index and sort by groupby columns if as_index: From 477c6e27ade679283e603360bbfd42aac54c48f7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:37:56 -0700 Subject: [PATCH 218/366] Run cudf-polars-polars-tests on changes in test_python file group (#19819) Similar to the other Python based jobs, I believe the `cudf-polars-polars-tests` job only needs to run on relevant changes to Python files e.g. https://github.com/rapidsai/cudf/pull/19816 removes a Python file. None of the other Python test jobs ran except `cudf-polars-polars-tests` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/19819 --- .github/workflows/pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8249fea30a0..01da4160bad 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -284,6 +284,7 @@ jobs: needs: wheel-build-cudf-polars secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) From 3306234bc290a4204c870614201cd37b7046fc22 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 27 Aug 2025 18:18:04 -0400 Subject: [PATCH 219/366] xfail polars `decimal(precision=None)` test (#19821) We return decimals with `precision=38`, polars returns `precision=None`. This is an expected mismatch, so xfail the test. 
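A hedged sketch of the mismatch (illustrative only; the exact schema
output depends on the polars version):

```python
import polars as pl

# Cast an integer column to Decimal without specifying a precision.
lf = pl.LazyFrame({"d": [1]}).select(pl.col("d").cast(pl.Decimal(scale=2)))

print(lf.collect_schema())              # CPU plan: Decimal(precision=None, scale=2)
print(lf.collect(engine="gpu").schema)  # GPU result: precision materialized as 38
```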
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19821 --- python/cudf_polars/cudf_polars/testing/plugin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 1815f9dffea..f55f97de334 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -122,6 +122,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU", "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match", + "tests/unit/lazyframe/test_collect_schema.py::test_collect_schema_parametric": "polars returns decimal column with precision=None", "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", From f2c1d2e495371a3d2e50170799db0f626e58ba25 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 27 Aug 2025 16:47:22 -0700 Subject: [PATCH 220/366] Remove test_mvc.py (#19816) xref https://github.com/rapidsai/cudf/issues/19740#issuecomment-3229250933 cc @brandon-b-miller Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/19816 --- python/cudf/cudf/tests/test_mvc.py | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 python/cudf/cudf/tests/test_mvc.py diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py deleted file mode 100644 index 0ff9ddef0a0..00000000000 --- a/python/cudf/cudf/tests/test_mvc.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
-import subprocess -import sys - -TEST_SCRIPT = """ -import numba.cuda -import cudf -from cudf.utils._numba import _CUDFNumbaConfig - -@numba.cuda.jit -def test_kernel(x): - id = numba.cuda.grid(1) - if id < len(x): - x[id] += 1 - -s = cudf.Series([1, 2, 3]) -with _CUDFNumbaConfig(): - test_kernel.forall(len(s))(s) -""" - - -def test_numba_mvc(): - cp = subprocess.run( - [sys.executable, "-c", TEST_SCRIPT], - capture_output=True, - cwd="/", - ) - - assert cp.returncode == 0 From 70ff60d756879d2acc5ee054f79055f135c85a59 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 27 Aug 2025 17:44:12 -0700 Subject: [PATCH 221/366] Use more cached_property where possible for Index and subclasses (#19799) closes https://github.com/rapidsai/cudf/issues/13357 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19799 --- python/cudf/cudf/core/column/datetime.py | 4 +- python/cudf/cudf/core/column/interval.py | 11 +- python/cudf/cudf/core/column/timedelta.py | 25 +--- python/cudf/cudf/core/index.py | 113 +++++++++++------- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/core/tools/datetimes.py | 2 +- .../indexes/datetimeindex/test_attributes.py | 10 ++ 7 files changed, 91 insertions(+), 78 deletions(-) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 24a5968fa5a..8a0e5e10a15 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -283,11 +283,11 @@ def days_in_month(self) -> ColumnBase: plc.datetime.days_in_month(self.to_pylibcudf(mode="read")) ) - @property + @functools.cached_property def day_of_week(self) -> ColumnBase: raise NotImplementedError("day_of_week is currently not implemented.") - @property + @functools.cached_property def is_normalized(self) -> bool: raise NotImplementedError( "is_normalized is currently not implemented." diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index f589b852b91..d592afa3dda 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations +import functools from typing import TYPE_CHECKING, Literal import pandas as pd @@ -119,7 +120,7 @@ def copy(self, deep: bool = True) -> Self: children=struct_copy.base_children, # type: ignore[arg-type] ) - @property + @functools.cached_property def is_empty(self) -> ColumnBase: left_equals_right = (self.right == self.left).fillna(False) not_closed_both = as_column( @@ -127,19 +128,19 @@ def is_empty(self) -> ColumnBase: ) return left_equals_right & not_closed_both - @property + @functools.cached_property def is_non_overlapping_monotonic(self) -> bool: raise NotImplementedError( "is_overlapping is currently not implemented." ) - @property + @functools.cached_property def is_overlapping(self) -> bool: raise NotImplementedError( "is_overlapping is currently not implemented." 
) - @property + @functools.cached_property def length(self) -> ColumnBase: return self.right - self.left @@ -147,7 +148,7 @@ def length(self) -> ColumnBase: def left(self) -> ColumnBase: return self.children[0] - @property + @functools.cached_property def mid(self) -> ColumnBase: try: return 0.5 * (self.left + self.right) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index aa4bcef4454..90dbda3fcb3 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -278,33 +278,14 @@ def sum( unit=self.time_unit, ).as_unit(self.time_unit) + @functools.cached_property def components(self) -> dict[str, NumericalColumn]: """ - Return a Dataframe of the components of the Timedeltas. + Return a dict of the components of the Timedeltas. Returns ------- - DataFrame - - Examples - -------- - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s')) - >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, - ... 3244334234], dtype='timedelta64[ms]') - >>> s - 0 141 days 13:35:12.123 - 1 14 days 06:00:31.231 - 2 13000 days 10:12:48.712 - 3 0 days 00:35:35.656 - 4 37 days 13:12:14.234 - dtype: timedelta64[ms] - >>> s.dt.components - days hours minutes seconds milliseconds microseconds nanoseconds - 0 141 13 35 12 123 0 0 - 1 14 6 0 31 231 0 0 - 2 13000 10 12 48 712 0 0 - 3 0 0 35 35 656 0 0 - 4 37 13 12 14 234 0 0 + dict[str, NumericalColumn] """ date_meta = { "hours": ["D", "h"], diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cc795fad2b1..1770338ff0e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -479,6 +479,7 @@ def is_monotonic_decreasing(self) -> bool: @property def has_duplicates(self) -> bool: + # .is_unique is already a cached_property return not self.is_unique @property @@ -1838,11 +1839,6 @@ def _binaryop( return ret.values return ret - @property # type: ignore - @_performance_tracking - def _values(self) -> ColumnBase: - return self._column - @classmethod @_performance_tracking def _concat(cls, objs): @@ -2227,7 +2223,7 @@ def _is_categorical(self) -> bool: def _is_interval(self) -> bool: return False - @property # type: ignore + @cached_property @_performance_tracking def hasnans(self) -> bool: return self._column.has_nulls(include_nan=True) @@ -3643,7 +3639,7 @@ def strftime(self, date_format: str) -> Index: self._column.strftime(date_format), name=self.name ) - @property + @cached_property def asi8(self) -> cupy.ndarray: return self._column.astype(np.dtype(np.int64)).values @@ -3684,7 +3680,9 @@ def tz(self) -> tzinfo | None: datetime.tzinfo or None Returns None when the array is tz-naive. """ - return getattr(self.dtype, "tz", None) + if isinstance(self.dtype, pd.DatetimeTZDtype): + return self.dtype.tz + return None @property def tzinfo(self) -> tzinfo | None: @@ -3722,7 +3720,7 @@ def normalize(self) -> Self: self._column.normalize(), name=self.name ) - @property + @cached_property def time(self) -> np.ndarray: """ Returns numpy array of ``datetime.time`` objects. @@ -3731,7 +3729,7 @@ def time(self) -> np.ndarray: """ return self.to_pandas().time - @property + @cached_property def timetz(self) -> np.ndarray: """ Returns numpy array of ``datetime.time`` objects with timezones. @@ -3740,7 +3738,7 @@ def timetz(self) -> np.ndarray: """ return self.to_pandas().timetz - @property + @cached_property def date(self) -> np.ndarray: """ Returns numpy array of python ``datetime.date`` objects. 
@@ -3755,6 +3753,7 @@ def is_month_start(self) -> cupy.ndarray: """ Booleans indicating if dates are the first day of the month. """ + # .is_month_start is already a cached_property return self._column.is_month_start.values @property @@ -3762,6 +3761,7 @@ def is_month_end(self) -> cupy.ndarray: """ Booleans indicating if dates are the last day of the month. """ + # .is_month_end is already a cached_property return self._column.is_month_end.values @property @@ -3769,6 +3769,7 @@ def is_quarter_end(self) -> cupy.ndarray: """ Booleans indicating if dates are the last day of the quarter. """ + # .is_quarter_end is already a cached_property return self._column.is_quarter_end.values @property @@ -3776,6 +3777,7 @@ def is_quarter_start(self) -> cupy.ndarray: """ Booleans indicating if dates are the start day of the quarter. """ + # .is_quarter_start is already a cached_property return self._column.is_quarter_start.values @property @@ -3783,6 +3785,7 @@ def is_year_end(self) -> cupy.ndarray: """ Booleans indicating if dates are the last day of the year. """ + # .is_year_end is already a cached_property return self._column.is_year_end.values @property @@ -3790,6 +3793,7 @@ def is_year_start(self) -> cupy.ndarray: """ Booleans indicating if dates are the first day of the year. """ + # .is_year_start is already a cached_property return self._column.is_year_start.values @property @@ -3797,6 +3801,7 @@ def is_normalized(self) -> bool: """ Returns True if all of the dates are at midnight ("no time") """ + # .is_normalized is already a cached_property return self._column.is_normalized @property @@ -3804,6 +3809,7 @@ def days_in_month(self) -> Index: """ Get the total number of days in the month that the date falls on. """ + # .days_in_month is already a cached_property return Index._from_column(self._column.days_in_month, name=self.name) daysinmonth = days_in_month @@ -3813,9 +3819,10 @@ def day_of_week(self) -> Index: """ Get the day of week that the date falls on. 
""" + # .day_of_week is already a cached_property return Index._from_column(self._column.day_of_week, name=self.name) - @property # type: ignore + @property @_performance_tracking def year(self) -> Index: """ @@ -3832,9 +3839,10 @@ def year(self) -> Index: >>> datetime_index.year Index([2000, 2001, 2002], dtype='int16') """ + # .year is already a cached_property return Index._from_column(self._column.year, name=self.name) - @property # type: ignore + @property @_performance_tracking def month(self) -> Index: """ @@ -3851,9 +3859,10 @@ def month(self) -> Index: >>> datetime_index.month Index([1, 2, 3], dtype='int16') """ + # .month is already a cached_property return Index._from_column(self._column.month, name=self.name) - @property # type: ignore + @property @_performance_tracking def day(self) -> Index: """ @@ -3870,9 +3879,10 @@ def day(self) -> Index: >>> datetime_index.day Index([1, 2, 3], dtype='int16') """ + # .day is already a cached_property return Index._from_column(self._column.day, name=self.name) - @property # type: ignore + @property @_performance_tracking def hour(self) -> Index: """ @@ -3891,9 +3901,10 @@ def hour(self) -> Index: >>> datetime_index.hour Index([0, 1, 2], dtype='int16') """ + # .hour is already a cached_property return Index._from_column(self._column.hour, name=self.name) - @property # type: ignore + @property @_performance_tracking def minute(self) -> Index: """ @@ -3912,9 +3923,10 @@ def minute(self) -> Index: >>> datetime_index.minute Index([0, 1, 2], dtype='int16') """ + # .minute is already a cached_property return Index._from_column(self._column.minute, name=self.name) - @property # type: ignore + @property @_performance_tracking def second(self) -> Index: """ @@ -3933,9 +3945,10 @@ def second(self) -> Index: >>> datetime_index.second Index([0, 1, 2], dtype='int16') """ + # .second is already a cached_property return Index._from_column(self._column.second, name=self.name) - @property # type: ignore + @property @_performance_tracking def microsecond(self) -> Index: """ @@ -3952,21 +3965,12 @@ def microsecond(self) -> Index: '2000-01-01 00:00:00.000002'], dtype='datetime64[ns]') >>> datetime_index.microsecond - Index([0, 1, 2], dtype='int32') + Index([0, 1, 2], dtype='int16') """ - return Index._from_column( - ( - # Need to manually promote column to int32 because - # pandas-matching binop behaviour requires that this - # __mul__ returns an int16 column. 
- self._column.millisecond.astype(np.dtype(np.int32)) - * np.int32(1000) - ) - + self._column.microsecond, - name=self.name, - ) + # .microsecond is already a cached_property + return Index._from_column(self._column.microsecond, name=self.name) - @property # type: ignore + @property @_performance_tracking def nanosecond(self) -> Index: """ @@ -3986,9 +3990,10 @@ def nanosecond(self) -> Index: >>> datetime_index.nanosecond Index([0, 1, 2], dtype='int16') """ + # .nanosecond is already a cached_property return Index._from_column(self._column.nanosecond, name=self.name) - @property # type: ignore + @property @_performance_tracking def weekday(self) -> Index: """ @@ -4008,9 +4013,10 @@ def weekday(self) -> Index: >>> datetime_index.weekday Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') """ + # .weekday is already a cached_property return Index._from_column(self._column.weekday, name=self.name) - @property # type: ignore + @property @_performance_tracking def dayofweek(self) -> Index: """ @@ -4030,9 +4036,10 @@ def dayofweek(self) -> Index: >>> datetime_index.dayofweek Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') """ + # .weekday is already a cached_property return Index._from_column(self._column.weekday, name=self.name) - @property # type: ignore + @property @_performance_tracking def dayofyear(self) -> Index: """ @@ -4053,9 +4060,10 @@ def dayofyear(self) -> Index: >>> datetime_index.dayofyear Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ + # .day_of_year is already a cached_property return Index._from_column(self._column.day_of_year, name=self.name) - @property # type: ignore + @property @_performance_tracking def day_of_year(self) -> Index: """ @@ -4076,9 +4084,10 @@ def day_of_year(self) -> Index: >>> datetime_index.day_of_year Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') """ + # .day_of_year is already a cached_property return Index._from_column(self._column.day_of_year, name=self.name) - @property # type: ignore + @property @_performance_tracking def is_leap_year(self) -> cupy.ndarray: """ @@ -4094,10 +4103,11 @@ def is_leap_year(self) -> cupy.ndarray: ndarray Booleans indicating if dates belong to a leap year. """ + # .is_leap_year is already a cached_property res = self._column.is_leap_year.fillna(False) return cupy.asarray(res) - @property # type: ignore + @property @_performance_tracking def quarter(self) -> Index: """ @@ -4120,6 +4130,7 @@ def quarter(self) -> Index: >>> gIndex.quarter Index([2, 4], dtype='int8') """ + # .quarter is already a cached_property return Index._from_column( self._column.quarter.astype(np.dtype(np.int8)) ) @@ -4570,7 +4581,7 @@ def to_pytimedelta(self) -> np.ndarray: """ return self.to_pandas().to_pytimedelta() - @property + @cached_property def asi8(self) -> cupy.ndarray: return self._column.astype(np.dtype(np.int64)).values @@ -4622,57 +4633,62 @@ def round(self, freq: str) -> Self: self._column.round(freq), name=self.name ) - @property # type: ignore + @property @_performance_tracking def days(self) -> Index: """ Number of days for each element. """ + # .days is already a cached_property # Need to specifically return `int64` to avoid overflow. return Index._from_column( self._column.days.astype(np.dtype(np.int64)), name=self.name ) - @property # type: ignore + @property @_performance_tracking def seconds(self) -> Index: """ Number of seconds (>= 0 and less than 1 day) for each element. 
""" + # .seconds is already a cached_property return Index._from_column( self._column.seconds.astype(np.dtype(np.int32)), name=self.name ) - @property # type: ignore + @property @_performance_tracking def microseconds(self) -> Index: """ Number of microseconds (>= 0 and less than 1 second) for each element. """ + # .microseconds is already a cached_property return Index._from_column( self._column.microseconds.astype(np.dtype(np.int32)), name=self.name, ) - @property # type: ignore + @property @_performance_tracking def nanoseconds(self) -> Index: """ Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. """ + # .nanoseconds is already a cached_property return Index._from_column( self._column.nanoseconds.astype(np.dtype(np.int32)), name=self.name ) - @property # type: ignore + @property @_performance_tracking def components(self) -> DataFrame: """ Return a dataframe of the components (days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. """ - ca = ColumnAccessor(self._column.components(), verify=False) + # .components is already a cached_property + ca = ColumnAccessor(self._column.components, verify=False) return cudf.DataFrame._from_data(ca) @property @@ -4840,7 +4856,7 @@ def from_codes( def ordered(self) -> bool: return self._column.ordered - @property # type: ignore + @property @_performance_tracking def codes(self) -> Index: """ @@ -4848,7 +4864,7 @@ def codes(self) -> Index: """ return Index._from_column(self._column.codes) - @property # type: ignore + @property @_performance_tracking def categories(self) -> Index: """ @@ -5311,6 +5327,7 @@ def is_empty(self) -> cupy.ndarray: """ Indicates if an interval is empty, meaning it contains no points. """ + # .is_empty is already a cached_property return self._column.is_empty.values @property @@ -5318,6 +5335,7 @@ def is_non_overlapping_monotonic(self) -> bool: """ Return a True if the IntervalIndex is non-overlapping and monotonic. """ + # .is_non_overlapping_monotonic is already a cached_property return self._column.is_non_overlapping_monotonic @property @@ -5327,6 +5345,7 @@ def is_overlapping(self) -> bool: Currently not implemented """ + # .is_overlapping is already a cached_property return self._column.is_overlapping @property @@ -5334,6 +5353,7 @@ def length(self) -> Index: """ Return an Index with entries denoting the length of each Interval. """ + # .length is already a cached_property return _index_from_data({None: self._column.length}) @property @@ -5355,6 +5375,7 @@ def mid(self) -> Index: Each midpoint is calculated as the average of the left and right bounds of each interval. 
""" + # .mid is already a cached_property return _index_from_data({None: self._column.mid}) @property diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 01cf0905723..eaffa06d96d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -5081,7 +5081,7 @@ def nanoseconds(self) -> Series: """ return self._return_result_like_self(self.series._column.nanoseconds) - @property # type: ignore + @property @_performance_tracking def components(self) -> DataFrame: """ @@ -5109,7 +5109,7 @@ def components(self) -> DataFrame: 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 """ - ca = ColumnAccessor(self.series._column.components(), verify=False) + ca = ColumnAccessor(self.series._column.components, verify=False) return self.series._constructor_expanddim._from_data( ca, index=self.series.index ) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f2094d31267..d4b60c7fc4b 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -775,7 +775,7 @@ def date_range( ``U``, ``us``, ``N``, ``ns``. tz : str or tzinfo, optional - Not Supported + Time zone name for returning localized DatetimeIndex. normalize : bool, default False Not Supported diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py b/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py index d5b24a6a88e..527f0d07811 100644 --- a/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/datetimeindex/test_attributes.py @@ -24,6 +24,16 @@ def test_contains_tz_aware(item, expected): assert result == expected +@pytest.mark.parametrize("tz", ["UTC", None]) +def test_tz_attribute(tz): + dti = cudf.date_range("2020", periods=2, freq="D", tz=tz) + if tz is None: + assert dti.tz is None + else: + # TODO(pandas3.0-min): Assert zoneinfo.ZoneInfo(tz) == dti.tz + assert str(dti.tz) == tz + + def test_tz_aware_attributes_local(): data = [ "2008-05-12 13:50:00", From 2da8211744b70ba8b0e108b53ac0eb74d2497f5e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 27 Aug 2025 21:49:54 -0400 Subject: [PATCH 222/366] Warn on fallback in the streaming tests in cudf-polars (#19721) If we warn and catch it, the test will fail once we support the feature in the streaming engine. Therefore we'll keep the streaming tests up-to-date. 
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19721 --- .../tests/experimental/test_join.py | 29 +++++++++++++++---- .../tests/experimental/test_unique.py | 26 ++++++++++++++--- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/python/cudf_polars/tests/experimental/test_join.py b/python/cudf_polars/tests/experimental/test_join.py index 71c5d0e246e..3d6e373290c 100644 --- a/python/cudf_polars/tests/experimental/test_join.py +++ b/python/cudf_polars/tests/experimental/test_join.py @@ -149,7 +149,7 @@ def test_join_conditional(reverse, max_rows_per_partition): executor_options={ "max_rows_per_partition": max_rows_per_partition, "scheduler": DEFAULT_SCHEDULER, - "fallback_mode": "silent", + "fallback_mode": "warn", }, ) left = pl.LazyFrame({"x": range(15), "y": [1, 2, 3] * 5}) @@ -157,7 +157,13 @@ def test_join_conditional(reverse, max_rows_per_partition): if reverse: left, right = right, left q = left.join_where(right, pl.col("y") < pl.col("yy")) - assert_gpu_result_equal(q, engine=engine, check_row_order=False) + if max_rows_per_partition == 3: + with pytest.warns( + UserWarning, match="ConditionalJoin not supported for multiple partitions." + ): + assert_gpu_result_equal(q, engine=engine, check_row_order=False) + else: + assert_gpu_result_equal(q, engine=engine, check_row_order=False) @pytest.mark.parametrize("zlice", [(0, 2), (2, 2), (-2, None)]) @@ -170,7 +176,7 @@ def test_join_and_slice(zlice): "broadcast_join_limit": 100, "scheduler": DEFAULT_SCHEDULER, "shuffle_method": "tasks", - "fallback_mode": "warn" if zlice[0] == 0 else "silent", + "fallback_mode": "warn", }, ) left = pl.LazyFrame( @@ -190,8 +196,19 @@ def test_join_and_slice(zlice): q = left.join(right, on="a", how="inner").slice(*zlice) # Check that we get the correct row count # See: https://github.com/rapidsai/cudf/issues/19153 - assert q.collect(engine=engine).height == q.collect().height - + if zlice in {(2, 2), (-2, None)}: + with pytest.warns( + UserWarning, match="This slice not supported for multiple partitions." + ): + assert q.collect(engine=engine).height == q.collect().height + else: + assert q.collect(engine=engine).height == q.collect().height # Need sort to match order after a join q = left.join(right, on="a", how="inner").sort(pl.col("a")).slice(*zlice) - assert_gpu_result_equal(q, engine=engine) + if zlice == (2, 2): + with pytest.warns( + UserWarning, match="Sort does not support multiple partitions." 
+ ): + assert_gpu_result_equal(q, engine=engine) + else: + assert_gpu_result_equal(q, engine=engine) diff --git a/python/cudf_polars/tests/experimental/test_unique.py b/python/cudf_polars/tests/experimental/test_unique.py index a699a073b59..b402ab3a933 100644 --- a/python/cudf_polars/tests/experimental/test_unique.py +++ b/python/cudf_polars/tests/experimental/test_unique.py @@ -35,7 +35,7 @@ def test_unique(df, keep, subset, maintain_order, cardinality): "max_rows_per_partition": 50, "scheduler": DEFAULT_SCHEDULER, "unique_fraction": cardinality, - "fallback_mode": "silent", + "fallback_mode": "warn", }, ) @@ -45,7 +45,19 @@ def test_unique(df, keep, subset, maintain_order, cardinality): q = q.select(*(pl.col(col) for col in subset)) check_row_order = False - assert_gpu_result_equal(q, engine=engine, check_row_order=check_row_order) + is_cardinality0 = cardinality == {} + + should_warn = (maintain_order and (not is_cardinality0 or keep == "none")) or ( + not maintain_order and (not is_cardinality0) and keep in {"first", "last"} + ) + + if should_warn: + with pytest.warns( + UserWarning, match="Unsupported unique options for multiple partitions" + ): + assert_gpu_result_equal(q, engine=engine, check_row_order=check_row_order) + else: + assert_gpu_result_equal(q, engine=engine, check_row_order=check_row_order) def test_unique_fallback(df): @@ -77,12 +89,18 @@ def test_unique_select(df, maintain_order, cardinality): "max_rows_per_partition": 4, "scheduler": DEFAULT_SCHEDULER, "unique_fraction": cardinality, - "fallback_mode": "silent", + "fallback_mode": "warn", }, ) q = df.select(pl.col("y").unique(maintain_order=maintain_order)) - assert_gpu_result_equal(q, engine=engine, check_row_order=False) + if cardinality == {"y": 0.5} and maintain_order: + with pytest.warns( + UserWarning, match="Unsupported unique options for multiple partitions." + ): + assert_gpu_result_equal(q, engine=engine, check_row_order=False) + else: + assert_gpu_result_equal(q, engine=engine, check_row_order=False) @pytest.mark.parametrize("keep", ["first", "last", "any"]) From 7326449185f5a5afb05c5f1ec39ceaf3f59b5189 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 27 Aug 2025 21:54:51 -0400 Subject: [PATCH 223/366] Improvements to `pylibcudf.from_iterable_of_py` (#19781) - Fix a bug when getting the offsets column - Add validation checks and fast paths to `from_iterable_of_py` Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19781 --- python/pylibcudf/pylibcudf/column.pyx | 35 +++++++++++++++++-- .../tests/test_column_from_iterable.py | 15 ++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index 8773c123d75..cc0ea75c158 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -66,6 +66,14 @@ from typing import Iterable __all__ = ["Column", "ListColumnView", "is_c_contiguous"] +cdef is_iterable(obj): + try: + iter(obj) + except Exception: + return False + return True + + cdef class _ArrowColumnHolder: """A holder for an Arrow column for gpumemoryview lifetime management.""" cdef unique_ptr[arrow_column] col @@ -1000,8 +1008,31 @@ cdef class Column: (in bytes) can exceed the maximum 32-bit integer value. In that case, the offsets column is automatically promoted to use 64-bit integers. 
""" + if not is_iterable(obj): + raise ValueError(f"{obj=} is not iterable") + + if ( + hasattr(obj, "__cuda_array_interface__") + or hasattr(obj, "__array_interface__") + ): + raise TypeError( + "Object has __cuda_array_interface__ or __array_interface__. " + "Please call Column.from_array(obj)." + ) - obj = list(obj) + if ( + hasattr(obj, "__arrow_c_array__") + or hasattr(obj, "__arrow_c_device_array__") + or hasattr(obj, "__arrow_c_stream__") + or hasattr(obj, "__arrow_c_device_stream__") + ): + raise TypeError( + "Object implements the Arrow C data interface protocol. " + "Please call Column.from_arrow(obj)." + ) + + if not isinstance(obj, (list, tuple)): + obj = list(obj) if not obj: if dtype is None: @@ -1276,7 +1307,7 @@ cdef class ListColumnView: cpdef offsets(self): """The offsets column of the underlying list column.""" - return self._column.child(1) + return self._column.child(0) cdef lists_column_view view(self) nogil: """Generate a libcudf lists_column_view to pass to libcudf algorithms. diff --git a/python/pylibcudf/tests/test_column_from_iterable.py b/python/pylibcudf/tests/test_column_from_iterable.py index 9768b0d3960..0cb6515704b 100644 --- a/python/pylibcudf/tests/test_column_from_iterable.py +++ b/python/pylibcudf/tests/test_column_from_iterable.py @@ -4,6 +4,7 @@ import operator from functools import reduce +import numpy as np import pyarrow as pa import pytest from utils import assert_column_eq @@ -216,3 +217,17 @@ def test_from_nested_list_of_large_strings(dummy_large_string_type): assert col.type().id() == plc.TypeId.LIST assert col.children()[1].type().id() == plc.TypeId.STRING assert col.children()[1].children()[0].type().id() == plc.TypeId.INT64 + + +@pytest.mark.parametrize( + "arr", + [pa.array([1, 2], type=pa.int32()), np.array([1, 2], dtype=np.int32)], +) +def test_from_iterable_pyarrow_or_numpy_array(arr): + with pytest.raises(TypeError, match="Please call"): + plc.Column.from_iterable_of_py(arr) + + +def test_from_iterable_plc_column(): + with pytest.raises(ValueError, match="is not iterable"): + plc.Column.from_iterable_of_py(plc.Column.from_iterable_of_py([1])) From a449958a6a5f07ab085daf4a57f1a0f027b3389c Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 28 Aug 2025 08:31:29 -0500 Subject: [PATCH 224/366] pin oldest numpy in dask-cudf tests, update dependency floors (cuda-python 12.9.2, cupy 13.6.0, numba 0.60.0) (#19806) Contributes to https://github.com/rapidsai/build-planning/issues/208 * updates dependency pins: - `cuda-python`: >=12.9.2 (CUDA 12) - `cupy`: >=13.6.0 - `numba`: >=0.60.0 (now that https://github.com/NVIDIA/numba-cuda/pull/403 is done) * ensures that "oldest" `numpy` is pinned in `dask-cudf` tests - _the "oldest" pin for `numpy` was previously not used in `dask-cudf` wheel tests, allowing an incompatible mix of packages (`pandas 2.0.3, numpy 2.0.2`) to be installed together_ ## Notes for Reviewers ### Why a separate PR? In https://github.com/rapidsai/cudf/pull/19768#issuecomment-3225933614, we saw this set of dependency changes caused failures like this in CUDA 12 and CUDA 13 environments: ```text ... ERROR io/tests/test_csv.py - ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject ERROR io/tests/test_json.py - ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject ERROR io/tests/test_orc.py - ValueError: numpy.dtype size changed, may indicate binary incompatibility. 
Expected 96 from C header, got 88 from PyObject ERROR io/tests/test_parquet.py - ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject ERROR io/tests/test_s3.py - ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject ... many more ... ``` ([wheel-test-dask-cudf link](https://github.com/rapidsai/cudf/actions/runs/17249655997/job/48950898976?pr=19768#step:11:11795)) Opening this more narrowly-scoped PR to investigate that. ### How I tested this First commit here contained some of the dependency changes from #19768 , and those were enough to reproduce the test failures! https://github.com/rapidsai/cudf/actions/runs/17271893124/job/49021534507?pr=19806#step:11:11928 # Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Matthew Murray (https://github.com/Matt711) - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/cudf/pull/19806 --- .../all_cuda-129_arch-aarch64.yaml | 6 +-- .../all_cuda-129_arch-x86_64.yaml | 6 +-- conda/recipes/cudf/recipe.yaml | 7 ++- conda/recipes/pylibcudf/recipe.yaml | 2 +- dependencies.yaml | 54 +++++++++++-------- python/cudf/pyproject.toml | 6 +-- python/dask_cudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 6 +-- 8 files changed, 49 insertions(+), 40 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 0140f536cc1..466aaf5d7fa 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -20,10 +20,10 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.9.1,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 - dask-cuda==25.10.*,>=0.0.0a0 @@ -56,7 +56,7 @@ dependencies: - ninja - notebook - numba-cuda>=0.19.1,<0.20.0a0 -- numba>=0.61.0,<0.62.0a0 +- numba>=0.60.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 36d858a4957..1679dfd5d83 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -20,10 +20,10 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.9.1,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.3 - dask-cuda==25.10.*,>=0.0.0a0 @@ -57,7 +57,7 @@ dependencies: - ninja - notebook - numba-cuda>=0.19.1,<0.20.0a0 -- numba>=0.61.0,<0.62.0a0 +- numba>=0.60.0,<0.62.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index b89151fb266..cf85a1c8117 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -69,10 +69,9 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.4.0dev0 - - cupy >=12.0.0 + - cupy >=13.6.0 - numba-cuda >=0.19.1,<0.20.0a0 - # TODO: Revert to numba>=0.60.0,<0.62.0a0 once https://github.com/NVIDIA/numba-cuda/pull/403 is released. 
- - numba >=0.61.0,<0.62.0a0 + - numba >=0.60.0,<0.62.0a0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<20.0.0a0 - libcudf =${{ version }} @@ -80,7 +79,7 @@ requirements: - ${{ pin_compatible("rmm", upper_bound="x.x") }} - fsspec >=0.6.0 - cuda-cudart - - cuda-python >=12.9.1,<13.0a0 + - cuda-python >=12.9.2,<13.0a0 - if: linux and x86_64 then: - libcufile diff --git a/conda/recipes/pylibcudf/recipe.yaml b/conda/recipes/pylibcudf/recipe.yaml index 548a35da119..8bc859fd633 100644 --- a/conda/recipes/pylibcudf/recipe.yaml +++ b/conda/recipes/pylibcudf/recipe.yaml @@ -70,7 +70,7 @@ requirements: - libcudf =${{ version }} - ${{ pin_compatible("rmm", upper_bound="x.x") }} - fsspec >=0.6.0 - - cuda-python >=12.9.1,<13.0a0 + - cuda-python >=12.9.2,<13.0a0 - nvtx >=0.2.1 - packaging run_constraints: diff --git a/dependencies.yaml b/dependencies.yaml index 614c4fbc395..574940c8121 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -364,6 +364,7 @@ files: - depends_on_dask_cuda - test_python_common - test_python_cudf_common + - test_python_dask_cudf py_build_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka @@ -657,17 +658,19 @@ dependencies: specific: - output_types: [conda, requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} - packages: &run_pylibcudf_packages_all_cu12 - - cuda-python>=12.9.1,<13.0a0 - - {matrix: null, packages: *run_pylibcudf_packages_all_cu12} + - matrix: &run_pylibcudf_packages_all_cu12 + cuda: "12.*" + packages: + - &cuda_python_cu12 cuda-python>=12.9.2,<13.0a0 + - matrix: + packages: + - *cuda_python_cu12 run_cudf: common: - output_types: [conda, requirements, pyproject] packages: - cachetools - # TODO: Revert to numba>=0.60.0,<0.62.0a0 once https://github.com/NVIDIA/numba-cuda/pull/403 is released. - - &numba numba>=0.61.0,<0.62.0a0 + - &numba numba>=0.60.0,<0.62.0a0 - nvtx>=0.2.1 - packaging - rich @@ -682,7 +685,7 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: &run_cudf_packages_all_cu12 - - cuda-python>=12.9.1,<13.0a0 + - cuda-python>=12.9.2,<13.0a0 - {matrix: null, packages: *run_cudf_packages_all_cu12} - output_types: [requirements, pyproject] matrices: @@ -767,8 +770,7 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - # TODO: Revert to numba==0.60.0 once https://github.com/NVIDIA/numba-cuda/pull/403 is released. - - numba==0.61.0 + - numba==0.60.0 - pandas==2.0.* - numba-cuda==0.19.1 - matrix: {dependencies: "latest"} @@ -778,20 +780,30 @@ dependencies: packages: - output_types: conda matrices: - - matrix: {dependencies: "oldest", arch: "aarch64", cuda: "12.*"} - packages: - - cupy==12.2.0 # cupy 12.2.0 is the earliest with CUDA 12 ARM packages. - matrix: {dependencies: "oldest"} packages: - - cupy==12.0.0 + - cupy==13.6.0 - matrix: packages: - output_types: requirements # Using --constraints for pip install, so we list cupy multiple times matrices: - - matrix: {dependencies: "oldest"} + - matrix: + dependencies: "oldest" + packages: + - cupy-cuda12x==13.6.0 + - matrix: packages: - - cupy-cuda12x==12.0.0 + test_python_dask_cudf: + specific: + - output_types: [conda, requirements, pyproject] + matrices: + - matrix: + dependencies: "oldest" + packages: + # this is sometimes different from the floor in cudf because + # dask imposes its own constraints on 'numpy' (e.g. 
for 'dask.array') + - numpy==1.24.* - matrix: packages: test_python_pylibcudf: @@ -838,10 +850,7 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - # TODO: Revert to numpy==1.23.* once - # https://github.com/NVIDIA/numba-cuda/pull/403 is released and - # we revert to an oldest pinning of numba==0.60.0. - - numpy==1.24.* + - numpy==1.23.* # pyarrow 14 is fine in some circumstances but we require pyarrow # 15 in our CI tests in order to get a lz4-c that is compatible # with cudf_kafka's dependencies. @@ -983,13 +992,14 @@ dependencies: common: - output_types: conda packages: - - cupy>=12.0.0 + - cupy>=13.6.0 specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" packages: &cupy_packages_cu12 - - cupy-cuda12x>=12.0.0 + - cupy-cuda12x>=13.6.0 - {matrix: null, packages: *cupy_packages_cu12} depends_on_libkvikio: common: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index ece2dd3f07a..039e81742f8 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -19,12 +19,12 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cachetools", - "cuda-python>=12.9.1,<13.0a0", - "cupy-cuda12x>=12.0.0", + "cuda-python>=12.9.2,<13.0a0", + "cupy-cuda12x>=13.6.0", "fsspec>=0.6.0", "libcudf==25.10.*,>=0.0.0a0", "numba-cuda[cu12]>=0.19.1,<0.20.0a0", - "numba>=0.61.0,<0.62.0a0", + "numba>=0.60.0,<0.62.0a0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index b8603849892..55f319bc268 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cudf==25.10.*,>=0.0.0a0", - "cupy-cuda12x>=12.0.0", + "cupy-cuda12x>=13.6.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", "pandas>=2.0,<2.4.0dev0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 85fabaa14bf..fb49c45a4a2 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=12.9.1,<13.0a0", + "cuda-python>=12.9.2,<13.0a0", "libcudf==25.10.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", @@ -39,13 +39,13 @@ classifiers = [ [project.optional-dependencies] test = [ - "cupy-cuda12x>=12.0.0", + "cupy-cuda12x>=13.6.0", "fastavro>=0.22.9", "hypothesis>=6.131.7", "mmh3", "nanoarrow", "numba-cuda[cu12]>=0.19.1,<0.20.0a0", - "numba>=0.61.0,<0.62.0a0", + "numba>=0.60.0,<0.62.0a0", "pandas", "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", From 6b12079b44f42421c5fa55e68662ac920642e7c5 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 28 Aug 2025 09:43:34 -0400 Subject: [PATCH 225/366] Add changed-files to the needs of every job that requires it (#19830) Follow up to #19819, we're currently getting `fromJSON: empty` input because these jobs reference `needs.changed-files.outputs.changed_file_groups` without listing `changed-files` in their requirements. 
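As a sketch of the fix (an abbreviated excerpt of the jobs changed below --
only the `needs`/`if` lines matter), any job whose `if` condition reads the
`changed-files` outputs must also list that job in `needs`:

```yaml
cudf-polars-polars-tests:
  # Without `changed-files` in `needs`, the `needs.changed-files` context
  # below is empty, so `fromJSON` receives empty input and the job errors.
  needs: [wheel-build-cudf-polars, changed-files]
  secrets: inherit
  uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10
  if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
```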
See the errors: https://github.com/rapidsai/cudf/actions/runs/17276870544/job/49037559718 and https://github.com/rapidsai/cudf/actions/runs/17276870544/job/49037755672

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Gil Forsyth (https://github.com/gforsyth)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/19830
---
 .github/workflows/pr.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 01da4160bad..ae22eaf04c3 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -281,7 +281,7 @@ jobs:
       build_type: pull-request
       script: "ci/test_wheel_cudf_polars.sh"
   cudf-polars-polars-tests:
-    needs: wheel-build-cudf-polars
+    needs: [wheel-build-cudf-polars, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
@@ -341,7 +341,7 @@ jobs:
       build_type: pull-request
       script: ci/cudf_pandas_scripts/run_tests.sh
   third-party-integration-tests-cudf-pandas:
-    needs: conda-python-build
+    needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas

From 82f6d2b91601cce7566659af50de2a3b92d906fb Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Thu, 28 Aug 2025 08:50:23 -0500
Subject: [PATCH 226/366] Update exception handling in pdsh benchmarks (#19793)

This updates how we handle exceptions in the execution of a cudf-polars query. Previously, an exception raised during a query (e.g., an out-of-memory exception) would cause the process to exit, and no results would be written.

Now, we'll

1. Catch the exception, print it to stdout, and record the fact that the query-iteration errored
2. Continue running the remaining queries / iterations
3. Write out the results of any successful execution
4.
Exit with a non-zero error code if any exceptions occurred Authors: - Tom Augspurger (https://github.com/TomAugspurger) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19793 --- .../cudf_polars/experimental/benchmarks/utils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py index 53b492e2e82..45142c31173 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py @@ -14,6 +14,7 @@ import sys import textwrap import time +import traceback from collections import defaultdict from datetime import datetime, timezone from typing import TYPE_CHECKING, Any, Literal, assert_never @@ -716,6 +717,7 @@ def run_polars( vars(args).update({"query_set": benchmark.name}) run_config = RunConfig.from_args(args) validation_failures: list[int] = [] + query_failures: list[tuple[int, int]] = [] client = initialize_dask_cluster(run_config, args) # type: ignore @@ -743,8 +745,13 @@ def run_polars( for i in range(args.iterations): t0 = time.monotonic() - result = execute_query(q_id, i, q, run_config, args, engine) - + try: + result = execute_query(q_id, i, q, run_config, args, engine) + except Exception: + print(f"❌ query={q_id} iteration={i} failed!") + print(traceback.format_exc()) + query_failures.append((q_id, i)) + continue if run_config.shuffle == "rapidsmpf" and run_config.gather_shuffle_stats: from rapidsmpf.integrations.dask.shuffler import ( clear_shuffle_statistics, @@ -794,5 +801,9 @@ def run_polars( ) else: print("All validated queries passed.") + args.output.write(json.dumps(run_config.serialize(engine=engine))) args.output.write("\n") + + if query_failures or validation_failures: + sys.exit(1) From 94e0f922471ebdf4abdd42d7c23644ab33201391 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 28 Aug 2025 11:34:26 -0500 Subject: [PATCH 227/366] Build and test with CUDA 13.0.0 (#19768) Contributes to https://github.com/rapidsai/build-planning/issues/208 * uses CUDA 13.0.0 to build and test * adds CUDA 13 devcontainers * adds `cuda-nvvm-tools` as a runtime dependency of `cudf` conda packages - temporary workaround for https://github.com/NVIDIA/numba-cuda/issues/430, from @brandon-b-miller Contributes to https://github.com/rapidsai/build-planning/issues/68 * updates to CUDA 13 dependencies in fallback entries in `dependencies.yaml` matrices (i.e., the ones that get written to `pyproject.toml` in source control) ## Notes for Reviewers This switches GitHub Actions workflows to the `cuda13.0` branch from here: https://github.com/rapidsai/shared-workflows/pull/413 A future round of PRs will revert that back to `branch-25.10`, once all of RAPIDS supports CUDA 13. 
### This has dependencies Need these to be merged first: * [x] #19821 * [x] #19806 Authors: - James Lamb (https://github.com/jameslamb) - David Wendt (https://github.com/davidwendt) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/cudf/pull/19768 --- .../cuda13.0-conda/devcontainer.json | 76 +++++++++++++ .devcontainer/cuda13.0-pip/devcontainer.json | 53 +++++++++ .github/workflows/build.yaml | 28 ++--- .github/workflows/pandas-tests.yaml | 4 +- .github/workflows/pr.yaml | 62 +++++------ .../workflows/pr_issue_status_automation.yml | 8 +- .github/workflows/test.yaml | 32 +++--- .../trigger-breaking-change-alert.yaml | 2 +- CONTRIBUTING.md | 2 +- README.md | 10 +- .../all_cuda-130_arch-aarch64.yaml | 103 +++++++++++++++++ .../all_cuda-130_arch-x86_64.yaml | 104 ++++++++++++++++++ conda/recipes/cudf/recipe.yaml | 8 +- conda/recipes/pylibcudf/recipe.yaml | 4 +- dependencies.yaml | 101 +++++++++++++++-- .../dependencies.yaml | 4 + python/cudf/pyproject.toml | 8 +- python/dask_cudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 6 +- 19 files changed, 526 insertions(+), 91 deletions(-) create mode 100644 .devcontainer/cuda13.0-conda/devcontainer.json create mode 100644 .devcontainer/cuda13.0-pip/devcontainer.json create mode 100644 conda/environments/all_cuda-130_arch-aarch64.yaml create mode 100644 conda/environments/all_cuda-130_arch-x86_64.yaml diff --git a/.devcontainer/cuda13.0-conda/devcontainer.json b/.devcontainer/cuda13.0-conda/devcontainer.json new file mode 100644 index 00000000000..f236ef00da3 --- /dev/null +++ b/.devcontainer/cuda13.0-conda/devcontainer.json @@ -0,0 +1,76 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-conda", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": { + "gpu": "optional" + }, + "features": { + "ghcr.io/rapidsai/devcontainers/features/cuda:25.10": { + "version": "13.0", + "installCompilers": false, + "installProfilers": true, + "installDevPackages": false, + "installcuDNN": false, + "installcuTensor": false, + "installNCCL": false, + "installCUDARuntime": false, + "installNVRTC": false, + "installOpenCL": false, + "installcuBLAS": false, + "installcuSPARSE": false, + "installcuFFT": false, + "installcuFile": false, + "installcuRAND": false, + "installcuSOLVER": false, + "installNPP": false, + "installnvJPEG": false, + "pruneStaticLibs": true + }, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/cuda", + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda13.0-envs}" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi" + ], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda13.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda13.0-pip/devcontainer.json b/.devcontainer/cuda13.0-pip/devcontainer.json new file mode 100644 index 00000000000..c6c0f0c2230 --- /dev/null +++ b/.devcontainer/cuda13.0-pip/devcontainer.json @@ -0,0 +1,53 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda13.0" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-pip", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": { + "gpu": "optional" + }, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs}" + ], + "postAttachCommand": [ + "/bin/bash", + "-c", + "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi" + ], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d9bb501c968..87cc789ac78 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -46,7 +46,7 @@ jobs: cpp-build: needs: [telemetry-setup] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: python-build: needs: [telemetry-setup, cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,7 +77,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -90,7 +90,7 @@ jobs: wheel-build-libcudf: needs: [telemetry-setup] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -105,7 +105,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -116,7 +116,7 @@ jobs: wheel-build-pylibcudf: needs: [telemetry-setup, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -128,7 +128,7 @@ jobs: wheel-publish-pylibcudf: needs: 
wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -139,7 +139,7 @@ jobs: wheel-build-cudf: needs: [telemetry-setup, wheel-build-pylibcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -151,7 +151,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -162,7 +162,7 @@ jobs: wheel-build-dask-cudf: needs: [telemetry-setup, wheel-build-cudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -177,7 +177,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -188,7 +188,7 @@ jobs: wheel-build-cudf-polars: needs: [telemetry-setup, wheel-build-pylibcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -203,7 +203,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 56fba1f6d8f..085c17bd9f2 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -22,9 +22,9 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 with: - matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LINUX_VER": "ubuntu24.04", "GPU": "l4", "DRIVER": "latest", "DEPENDENCIES": "newest"}]' + matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.0", "LINUX_VER": "ubuntu24.04", "GPU": "l4", "DRIVER": "latest", "DEPENDENCIES": "newest"}]' build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index ae22eaf04c3..10904fb1425 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -43,7 +43,7 @@ jobs: - telemetry-setup - third-party-integration-tests-cudf-pandas secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda13.0 if: always() with: needs: ${{ toJSON(needs) }} @@ -68,7 +68,7 @@ jobs: changed-files: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda13.0 with: files_yaml: | test_cpp: @@ -130,14 +130,14 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda13.0 with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize spark-rapids-jni" conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 with: build_type: pull-request node_type: "cpu16" @@ -145,7 +145,7 @@ jobs: cpp-linters: secrets: inherit needs: checks - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: pull-request script: "ci/cpp_linters.sh" @@ -153,13 +153,13 @@ jobs: conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda13.0 with: build_type: pull-request conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -167,14 +167,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 with: build_type: pull-request script: ci/build_python.sh conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -183,7 +183,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -191,7 +191,7 @@ jobs: conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java with: build_type: pull-request @@ -202,7 +202,7 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_notebooks with: build_type: pull-request @@ -213,7 +213,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -223,7 +223,7 @@ jobs: wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -235,7 +235,7 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" @@ -244,7 +244,7 @@ jobs: wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" @@ -253,7 +253,7 @@ jobs: wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -261,7 +261,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -273,7 +273,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -283,7 +283,7 @@ jobs: cudf-polars-polars-tests: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -293,7 +293,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -305,7 +305,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -315,10 +315,10 @@ jobs: devcontainer: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda13.0 with: arch: '["amd64", "arm64"]' - cuda: '["12.9"]' + cuda: '["13.0"]' node_type: "cpu8" rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN env: | @@ -333,7 +333,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -343,7 +343,7 @@ jobs: third-party-integration-tests-cudf-pandas: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: build_type: pull-request @@ -352,17 +352,17 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" continue-on-error: true - container_image: "rapidsai/ci-conda:cuda12.9.0-ubuntu24.04-py3.12" + container_image: "rapidsai/ci-conda:25.10-latest" script: | ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml pandas-tests: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: - matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "12.9.1", "LINUX_VER": "ubuntu24.04", "GPU": "l4", "DRIVER": "latest", "DEPENDENCIES": "newest"}]' + matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.0", "LINUX_VER": "ubuntu24.04", "GPU": "l4", "DRIVER": "latest", "DEPENDENCIES": "newest"}]' build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. @@ -370,7 +370,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: node_type: "cpu4" build_type: pull-request @@ -378,7 +378,7 @@ jobs: narwhals-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 148d83e73d6..e7e48eac44c 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda13.0 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda13.0 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its 
linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda13.0 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -79,7 +79,7 @@ jobs: update-release: # This job sets the PR and its linked issues to the release they are targeting - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda13.0 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: [get-project-id, process-branch-name] with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b8af253bc56..52e381dd7d8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,7 +24,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -53,7 +53,7 @@ jobs: script: "ci/test_cpp_memcheck.sh" cpp-linters: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -63,7 +63,7 @@ jobs: file_to_upload: iwyu_results.txt conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -82,7 +82,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -94,7 +94,7 @@ jobs: script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -115,7 +115,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -124,7 +124,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -133,19 +133,19 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" - container_image: "rapidsai/ci-conda:cuda12.9.0-ubuntu24.04-py3.12" + container_image: "rapidsai/ci-conda:25.10-latest" script: | ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml wheel-tests-cudf-polars: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -154,7 +154,7 @@ jobs: script: "ci/test_wheel_cudf_polars.sh" cudf-polars-polars-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -163,7 +163,7 @@ jobs: script: "ci/test_cudf_polars_polars_tests.sh" narwhals-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 48bf37afc40..72751d071bb 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda13.0 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff 
--git a/CONTRIBUTING.md b/CONTRIBUTING.md index ab7a5731b69..35a896559cd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -121,7 +121,7 @@ Instructions for a minimal build environment without conda are included below. # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-129_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-130_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` diff --git a/README.md b/README.md index 04980005846..538a1b6d344 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,10 @@ Be sure to select the appropriate cuDF package depending on the major version of CUDA available in your environment: ```bash +# CUDA 13 +pip install cudf-cu13 + +# CUDA 12 pip install cudf-cu12 ``` @@ -73,7 +77,11 @@ pip install cudf-cu12 cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: ```bash -conda install -c rapidsai -c conda-forge cudf=25.10 +# CUDA 13 +conda install -c rapidsai -c conda-forge cudf=25.10 cuda-version=13.0 + +# CUDA 12 +conda install -c rapidsai -c conda-forge cudf=25.10 cuda-version=12.9 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml new file mode 100644 index 00000000000..bd29b08c758 --- /dev/null +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -0,0 +1,103 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- aiobotocore>=2.2.0 +- boto3>=1.21.21 +- botocore>=1.24.21 +- breathe>=4.35.0 +- c-compiler +- cachetools +- certifi +- clang-tools==20.1.4 +- clang==20.1.4 +- cmake>=3.30.4 +- cramjam +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvrtc-dev +- cuda-nvtx-dev +- cuda-nvvm-tools +- cuda-python>=13.0.1,<14.0a0 +- cuda-sanitizer-api +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.3 +- dask-cuda==25.10.*,>=0.0.0a0 +- dlpack>=0.8,<1.0 +- doxygen=1.9.1 +- fastavro>=0.22.9 +- flatbuffers==24.3.25 +- fsspec>=0.6.0 +- gcc_linux-aarch64=14.* +- hypothesis>=6.131.7 +- identify>=2.5.20 +- include-what-you-use==0.24.0 +- ipython +- jupyter_client +- libcurand-dev +- libkvikio==25.10.*,>=0.0.0a0 +- libnvcomp-dev==5.0.0.6 +- libnvjitlink-dev +- librdkafka>=2.8.0,<2.9.0a0 +- librmm==25.10.*,>=0.0.0a0 +- make +- mmh3 +- moto>=4.0.8 +- msgpack-python +- myst-nb +- nanoarrow +- nbconvert +- nbformat +- nbsphinx +- ninja +- notebook +- numba-cuda>=0.19.1,<0.20.0a0 +- numba>=0.60.0,<0.62.0a0 +- numpy>=1.23,<3.0a0 +- numpydoc +- nvidia-ml-py +- nvtx>=0.2.1 +- openpyxl +- packaging +- pandas +- pandas>=2.0,<2.4.0dev0 +- pandoc +- polars>=1.28,<1.33 +- pre-commit +- pyarrow>=14.0.0,<20.0.0a0 +- pydata-sphinx-theme>=0.15.4 +- pynvml>=12.0.0,<13.0.0a0 +- pytest +- pytest-benchmark +- pytest-cases>=3.8.2 +- pytest-cov +- pytest-rerunfailures +- pytest-xdist +- python-confluent-kafka>=2.8.0,<2.9.0a0 +- python-xxhash +- python>=3.10,<3.14 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- rapids-logger==0.1.*,>=0.0.0a0 +- rich +- rmm==25.10.*,>=0.0.0a0 +- s3fs>=2022.3.0 +- scikit-build-core>=0.10.0 +- scipy +- sphinx-autobuild +- sphinx-copybutton +- sphinx-markdown-tables +- sphinx-remove-toctrees +- sphinx>=8.1.0 +- sphinxcontrib-websupport +- streamz +- sysroot_linux-aarch64==2.28 +- typing_extensions>=4.0.0 +- zlib>=1.2.13 +- zstandard +name: all_cuda-130_arch-aarch64 diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml new file mode 100644 index 00000000000..4f06590e585 --- /dev/null +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -0,0 +1,104 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- aiobotocore>=2.2.0 +- boto3>=1.21.21 +- botocore>=1.24.21 +- breathe>=4.35.0 +- c-compiler +- cachetools +- certifi +- clang-tools==20.1.4 +- clang==20.1.4 +- cmake>=3.30.4 +- cramjam +- cuda-cudart-dev +- cuda-nvcc +- cuda-nvrtc-dev +- cuda-nvtx-dev +- cuda-nvvm-tools +- cuda-python>=13.0.1,<14.0a0 +- cuda-sanitizer-api +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.3 +- dask-cuda==25.10.*,>=0.0.0a0 +- dlpack>=0.8,<1.0 +- doxygen=1.9.1 +- fastavro>=0.22.9 +- flatbuffers==24.3.25 +- fsspec>=0.6.0 +- gcc_linux-64=14.* +- hypothesis>=6.131.7 +- identify>=2.5.20 +- include-what-you-use==0.24.0 +- ipython +- jupyter_client +- libcufile-dev +- libcurand-dev +- libkvikio==25.10.*,>=0.0.0a0 +- libnvcomp-dev==5.0.0.6 +- libnvjitlink-dev +- librdkafka>=2.8.0,<2.9.0a0 +- librmm==25.10.*,>=0.0.0a0 +- make +- mmh3 +- moto>=4.0.8 +- msgpack-python +- myst-nb +- nanoarrow +- nbconvert +- nbformat +- nbsphinx +- ninja +- notebook +- numba-cuda>=0.19.1,<0.20.0a0 +- numba>=0.60.0,<0.62.0a0 +- numpy>=1.23,<3.0a0 +- numpydoc +- nvidia-ml-py +- nvtx>=0.2.1 +- openpyxl +- packaging +- pandas +- pandas>=2.0,<2.4.0dev0 +- pandoc +- polars>=1.28,<1.33 +- pre-commit +- pyarrow>=14.0.0,<20.0.0a0 +- pydata-sphinx-theme>=0.15.4 +- pynvml>=12.0.0,<13.0.0a0 +- pytest +- pytest-benchmark +- pytest-cases>=3.8.2 +- pytest-cov +- pytest-rerunfailures +- pytest-xdist +- python-confluent-kafka>=2.8.0,<2.9.0a0 +- python-xxhash +- python>=3.10,<3.14 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- rapids-logger==0.1.*,>=0.0.0a0 +- rich +- rmm==25.10.*,>=0.0.0a0 +- s3fs>=2022.3.0 +- scikit-build-core>=0.10.0 +- scipy +- sphinx-autobuild +- sphinx-copybutton +- sphinx-markdown-tables +- sphinx-remove-toctrees +- sphinx>=8.1.0 +- sphinxcontrib-websupport +- streamz +- sysroot_linux-64==2.28 +- typing_extensions>=4.0.0 +- zlib>=1.2.13 +- zstandard +name: all_cuda-130_arch-x86_64 diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index cf85a1c8117..05b8f6f7b89 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -71,6 +71,10 @@ requirements: - pandas >=2.0,<2.4.0dev0 - cupy >=13.6.0 - numba-cuda >=0.19.1,<0.20.0a0 + # TODO: remove cuda-nvvm-tools once https://github.com/NVIDIA/numba-cuda/issues/430 is resolved + # and we move the numba-cuda floor up to a version containing a fix for it + - if: cuda_major == "13" + then: cuda-nvvm-tools - numba >=0.60.0,<0.62.0a0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<20.0.0a0 @@ -79,7 +83,9 @@ requirements: - ${{ pin_compatible("rmm", upper_bound="x.x") }} - fsspec >=0.6.0 - cuda-cudart - - cuda-python >=12.9.2,<13.0a0 + - if: cuda_major == "12" + then: cuda-python >=12.9.2,<13.0a0 + else: cuda-python >=13.0.1,<14.0a0 - if: linux and x86_64 then: - libcufile diff --git a/conda/recipes/pylibcudf/recipe.yaml b/conda/recipes/pylibcudf/recipe.yaml index 8bc859fd633..4273baf5fd3 100644 --- a/conda/recipes/pylibcudf/recipe.yaml +++ b/conda/recipes/pylibcudf/recipe.yaml @@ -70,7 +70,9 @@ requirements: - libcudf =${{ version }} - ${{ pin_compatible("rmm", upper_bound="x.x") }} - fsspec >=0.6.0 - - cuda-python >=12.9.2,<13.0a0 + - if: cuda_major == "12" + then: cuda-python >=12.9.2,<13.0a0 + else: cuda-python >=13.0.1,<14.0a0 - nvtx >=0.2.1 - packaging run_constraints: diff --git a/dependencies.yaml b/dependencies.yaml index 574940c8121..fe6d8dcf2bb 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 
+3,7 @@ files: all: output: conda matrix: - cuda: ["12.9"] + cuda: ["12.9", "13.0"] arch: [x86_64, aarch64] includes: - build_base @@ -545,6 +545,10 @@ dependencies: cuda: "12.9" packages: - cuda-version=12.9 + - matrix: + cuda: "13.0" + packages: + - cuda-version=13.0 cuda: common: - output_types: [conda] @@ -658,13 +662,14 @@ dependencies: specific: - output_types: [conda, requirements, pyproject] matrices: - - matrix: &run_pylibcudf_packages_all_cu12 + - matrix: cuda: "12.*" packages: - - &cuda_python_cu12 cuda-python>=12.9.2,<13.0a0 + - cuda-python>=12.9.2,<13.0a0 + # fallback to CUDA 13 versions if 'cuda' is '13.*' or not provided - matrix: packages: - - *cuda_python_cu12 + - cuda-python>=13.0.1,<14.0a0 run_cudf: common: - output_types: [conda, requirements, pyproject] @@ -683,10 +688,14 @@ dependencies: specific: - output_types: [conda, requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} - packages: &run_cudf_packages_all_cu12 + - matrix: + cuda: "12.*" + packages: - cuda-python>=12.9.2,<13.0a0 - - {matrix: null, packages: *run_cudf_packages_all_cu12} + # fallback to CUDA 13 versions if 'cuda' is '13.*' or not provided + - matrix: + packages: + - cuda-python>=13.0.1,<14.0a0 - output_types: [requirements, pyproject] matrices: - matrix: @@ -695,6 +704,12 @@ dependencies: packages: - nvidia-cuda-nvcc-cu12 - nvidia-cuda-nvrtc-cu12 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - nvidia-cuda-nvcc-cu13 + - nvidia-cuda-nvrtc-cu13 - {matrix: null, packages: []} run_cudf_polars: common: @@ -789,9 +804,15 @@ dependencies: # Using --constraints for pip install, so we list cupy multiple times matrices: - matrix: + cuda: "12.*" dependencies: "oldest" packages: - cupy-cuda12x==13.6.0 + - matrix: + cuda: "13.*" + dependencies: "oldest" + packages: + - cupy-cuda13x==13.6.0 - matrix: packages: test_python_dask_cudf: @@ -863,6 +884,7 @@ dependencies: cuda: "12.*" packages: - pytorch>=2.4.0 + # TODO: add a 13.x entry here when pytorch has CUDA 13 packages (https://github.com/pytorch/pytorch/issues/159779) - matrix: packages: test_python_cudf_polars: @@ -896,6 +918,11 @@ dependencies: cuda_suffixed: "true" packages: - libcudf-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - libcudf-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*libcudf_unsuffixed]} depends_on_numba_cuda: common: @@ -903,15 +930,26 @@ dependencies: packages: - numba-cuda>=0.19.1,<0.20.0a0 specific: + # TODO: remove cuda-nvvm-tools once https://github.com/NVIDIA/numba-cuda/issues/430 is resolved + # and we move the numba-cuda floor up to a version containing a fix for it + - output_types: [conda] + matrices: + - matrix: + cuda: "13.*" + packages: + - cuda-nvvm-tools + - matrix: + packages: - output_types: [requirements, pyproject] matrices: - matrix: cuda: "12.*" packages: - - &numba_cuda_cu12 numba-cuda[cu12]>=0.19.1,<0.20.0a0 + - numba-cuda[cu12]>=0.19.1,<0.20.0a0 + # fallback to CUDA 13 versions if 'cuda' is '13.*' or not provided - matrix: packages: - - *numba_cuda_cu12 + - numba-cuda[cu13]>=0.19.1,<0.20.0a0 depends_on_pylibcudf: common: - output_types: conda @@ -930,6 +968,11 @@ dependencies: cuda_suffixed: "true" packages: - pylibcudf-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - pylibcudf-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*pylibcudf_unsuffixed]} depends_on_pylibcudf_pyarrow: common: @@ -949,6 +992,11 @@ dependencies: cuda_suffixed: "true" packages: - pylibcudf-cu12[pyarrow]==25.10.*,>=0.0.0a0 + 
- matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - pylibcudf-cu13[pyarrow]==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*plc_unsuffixed]} depends_on_cudf: common: @@ -968,6 +1016,11 @@ dependencies: cuda_suffixed: "true" packages: - cudf-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - cudf-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_unsuffixed]} depends_on_cudf_kafka: common: @@ -987,20 +1040,31 @@ dependencies: cuda_suffixed: "true" packages: - cudf_kafka-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - cudf_kafka-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*cudf_kafka_unsuffixed]} depends_on_cupy: common: - output_types: conda packages: - cupy>=13.6.0 + # NOTE: This is intentionally not broken into groups by a 'cuda_suffixed' selector like + # other packages with -cu{nn}x suffixes in this file. + # All RAPIDS wheel builds (including in devcontainers) expect cupy to be suffixed. specific: - output_types: [requirements, pyproject] matrices: - matrix: cuda: "12.*" - packages: &cupy_packages_cu12 + packages: - cupy-cuda12x>=13.6.0 - - {matrix: null, packages: *cupy_packages_cu12} + # fallback to CUDA 13 versions if 'cuda' is '13.*' or not provided + - matrix: + packages: + - cupy-cuda13x>=13.6.0 depends_on_libkvikio: common: - output_types: conda @@ -1018,6 +1082,11 @@ dependencies: cuda_suffixed: "true" packages: - libkvikio-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - libkvikio-cu13==25.10.*,>=0.0.0a0 - matrix: packages: - *libkvikio_unsuffixed @@ -1039,6 +1108,11 @@ dependencies: cuda_suffixed: "true" packages: - librmm-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - librmm-cu13==25.10.*,>=0.0.0a0 - matrix: packages: - *librmm_unsuffixed @@ -1060,6 +1134,11 @@ dependencies: cuda_suffixed: "true" packages: - rmm-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - rmm-cu13==25.10.*,>=0.0.0a0 - matrix: packages: - *rmm_unsuffixed diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index 98b4a31a391..6e01e034cf4 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -161,6 +161,10 @@ dependencies: cuda: "12.9" packages: - cuda-version=12.9 + - matrix: + cuda: "13.0" + packages: + - cuda-version=13.0 py_version: specific: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 039e81742f8..0c6c84bdca1 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -19,11 +19,11 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cachetools", - "cuda-python>=12.9.2,<13.0a0", - "cupy-cuda12x>=13.6.0", + "cuda-python>=13.0.1,<14.0a0", + "cupy-cuda13x>=13.6.0", "fsspec>=0.6.0", "libcudf==25.10.*,>=0.0.0a0", - "numba-cuda[cu12]>=0.19.1,<0.20.0a0", + "numba-cuda[cu13]>=0.19.1,<0.20.0a0", "numba>=0.60.0,<0.62.0a0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", @@ -125,7 +125,7 @@ requires = [ "libcudf==25.10.*,>=0.0.0a0", "librmm==25.10.*,>=0.0.0a0", "ninja", - "numba-cuda[cu12]>=0.19.1,<0.20.0a0", + "numba-cuda[cu13]>=0.19.1,<0.20.0a0", "pylibcudf==25.10.*,>=0.0.0a0", "rmm==25.10.*,>=0.0.0a0", ] # This list was generated by 
`rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 55f319bc268..3b5f7a1d2e7 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cudf==25.10.*,>=0.0.0a0", - "cupy-cuda12x>=13.6.0", + "cupy-cuda13x>=13.6.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", "pandas>=2.0,<2.4.0dev0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index fb49c45a4a2..a76bbc36bae 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=12.9.2,<13.0a0", + "cuda-python>=13.0.1,<14.0a0", "libcudf==25.10.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", @@ -39,12 +39,12 @@ classifiers = [ [project.optional-dependencies] test = [ - "cupy-cuda12x>=13.6.0", + "cupy-cuda13x>=13.6.0", "fastavro>=0.22.9", "hypothesis>=6.131.7", "mmh3", "nanoarrow", - "numba-cuda[cu12]>=0.19.1,<0.20.0a0", + "numba-cuda[cu13]>=0.19.1,<0.20.0a0", "numba>=0.60.0,<0.62.0a0", "pandas", "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", From 47e9f2a6c1a436b2852bd83b48269c389f3ba490 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 28 Aug 2025 13:29:33 -0700 Subject: [PATCH 228/366] Construct cuDF classic Decimal32/64Columns from RMM buffers (#19834) Precursor of https://github.com/rapidsai/cudf/issues/18726 towards consistently constructing a cuDF column with a pylibcudf Column Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19834 --- python/cudf/cudf/core/column/column.py | 12 ---- python/cudf/cudf/core/column/decimal.py | 83 ++++++++++++++---------- python/cudf/cudf/core/column/interval.py | 22 +------ 3 files changed, 51 insertions(+), 66 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c462278d2d4..162c62fe2f5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -122,18 +122,6 @@ def _can_values_be_equal(left: DtypeObj, right: DtypeObj) -> bool: return False -def pa_mask_buffer_to_mask(mask_buf: pa.Buffer, size: int) -> Buffer: - """ - Convert PyArrow mask buffer to cuDF mask buffer - """ - mask_size = plc.null_mask.bitmask_allocation_size_bytes(size) - if mask_buf.size < mask_size: - dbuf = rmm.DeviceBuffer(size=mask_size) - dbuf.copy_from_host(np.asarray(mask_buf).view("u1")) - return as_buffer(dbuf) - return as_buffer(mask_buf) - - class ColumnBase(Serializable, BinaryOperand, Reducible): """ A ColumnBase stores columnar data in device memory. 
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index af1419fceb7..f036cd6c084 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -4,20 +4,20 @@ import warnings from decimal import Decimal -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any, Literal, cast -import cupy as cp import numpy as np import pandas as pd import pyarrow as pa import pylibcudf as plc +import rmm import cudf from cudf.api.types import is_scalar from cudf.core._internals import binaryop -from cudf.core.buffer import acquire_spill_lock, as_buffer -from cudf.core.column.column import ColumnBase, pa_mask_buffer_to_mask +from cudf.core.buffer import acquire_spill_lock +from cudf.core.column.column import ColumnBase from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import ( Decimal32Dtype, @@ -96,6 +96,47 @@ def __cuda_array_interface__(self): "Decimals are not yet supported via `__cuda_array_interface__`" ) + @classmethod + def _from_32_64_arrow( + cls, + data: pa.Array, + *, + view_type: Literal["int32", "int64"], + plc_type: plc.TypeId, + step: int, + ) -> Self: + # Can remove when pyarrow 19 is the minimum version + mask_buf, data_buf = data.buffers() + rmm_data_buffer = rmm.DeviceBuffer.to_device( + np.frombuffer(data_buf) + .view(view_type)[::step] + .copy() + .view("uint8") + ) + plc_column = plc.Column.from_rmm_buffer( + rmm_data_buffer, + plc.DataType(plc_type, -data.type.scale), + len(data), + [], + ) + if mask_buf is not None: + mask_size = plc.null_mask.bitmask_allocation_size_bytes(len(data)) + if mask_buf.size < mask_size: + rmm_mask_buffer = rmm.DeviceBuffer(size=mask_size) + rmm_mask_buffer.copy_from_host( + np.asarray(mask_buf).view("uint8") + ) + else: + rmm_mask_buffer = rmm.DeviceBuffer.to_device( + np.frombuffer(mask_buf).view("uint8") + ) + plc_column = plc_column.with_mask( + plc.gpumemoryview(rmm_mask_buffer), data.null_count + ) + column = cls.from_pylibcudf(plc_column) + column.dtype.precision = data.type.precision + return column + def element_indexing(self, index: int): result = super().element_indexing(index) if isinstance(result, pa.Scalar): @@ -323,21 +364,8 @@ def __init__( @classmethod def from_arrow(cls, data: pa.Array) -> Self: - dtype = Decimal32Dtype.from_arrow(data.type) - mask_buf = data.buffers()[0] - mask = ( - mask_buf - if mask_buf is None - else pa_mask_buffer_to_mask(mask_buf, len(data)) - ) - data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int32")) - data_32 = data_128[::4].copy() - return cls( - data=as_buffer(data_32.view("uint8")), - size=len(data), - dtype=dtype, - offset=data.offset, - mask=mask, + return cls._from_32_64_arrow( + data, view_type="int32", plc_type=plc.TypeId.DECIMAL32, step=4 ) def to_arrow(self) -> pa.Array: @@ -460,21 +488,8 @@ def __init__( @classmethod def from_arrow(cls, data: pa.Array) -> Self: - dtype = Decimal64Dtype.from_arrow(data.type) - mask_buf = data.buffers()[0] - mask = ( - mask_buf - if mask_buf is None - else pa_mask_buffer_to_mask(mask_buf, len(data)) - ) - data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int64")) - data_64 = data_128[::2].copy() - return cls( - data=as_buffer(data_64.view("uint8")), - size=len(data), - dtype=dtype, - offset=data.offset, - mask=mask, + return cls._from_32_64_arrow( + data, view_type="int64", plc_type=plc.TypeId.DECIMAL64, step=2 ) def to_arrow(self) -> pa.Array: diff --git a/python/cudf/cudf/core/column/interval.py 
b/python/cudf/cudf/core/column/interval.py index d592afa3dda..1aab299e7ad 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.core.column.column import as_column, pa_mask_buffer_to_mask +from cudf.core.column.column import as_column from cudf.core.column.struct import StructColumn from cudf.core.dtypes import IntervalDtype from cudf.utils.dtypes import is_dtype_obj_interval @@ -60,25 +60,7 @@ def _validate_dtype_instance(dtype: IntervalDtype) -> IntervalDtype: @classmethod def from_arrow(cls, data: pa.Array) -> Self: new_col = super().from_arrow(data.storage) - size = len(data) - dtype = IntervalDtype.from_arrow(data.type) - mask = data.buffers()[0] - if mask is not None: - mask = pa_mask_buffer_to_mask(mask, len(data)) - - offset = data.offset - null_count = data.null_count - children = new_col.children - - return cls( - data=None, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, # type: ignore[arg-type] - ) + return new_col._with_type_metadata(IntervalDtype.from_arrow(data.type)) # type: ignore[return-value] def to_arrow(self) -> pa.Array: typ = self.dtype.to_arrow() From 2bfd896b4e0c1f0b66402c1e067b4904dbd15c5e Mon Sep 17 00:00:00 2001 From: Tanmay Gujar Date: Thu, 28 Aug 2025 13:43:27 -0700 Subject: [PATCH 229/366] Add multi-column support for primitive row operator dispatch (#18940) This PR extends the primitive row operator to support multi-column tables, addressing the current limitation to single-column input. The primary goal is to improve performance in operations that rely on row-wise comparisons by enabling more efficient handling of multi-column data. Authors: - Tanmay Gujar (https://github.com/tgujar) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/18940 --- .../cudf/table/primitive_row_operators.cuh | 61 +++++-- cpp/src/table/primitive_row_operators.cu | 5 +- cpp/tests/join/distinct_join_tests.cpp | 164 ++++++++++++++++++ 3 files changed, 210 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/table/primitive_row_operators.cuh b/cpp/include/cudf/table/primitive_row_operators.cuh index 3016422938e..cd96d5768a5 100644 --- a/cpp/include/cudf/table/primitive_row_operators.cuh +++ b/cpp/include/cudf/table/primitive_row_operators.cuh @@ -21,13 +21,16 @@ #include #include #include -#include #include +#include #include #include #include #include +#include + +#include namespace CUDF_EXPORT cudf { @@ -126,21 +129,26 @@ class row_equality_comparator { */ __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const { - if (_has_nulls) { - bool const lhs_is_null{_lhs.column(0).is_null(lhs_row_index)}; - bool const rhs_is_null{_rhs.column(0).is_null(rhs_row_index)}; - if (lhs_is_null and rhs_is_null) { - return _nulls_are_equal == null_equality::EQUAL; - } else if (lhs_is_null != rhs_is_null) { - return false; + auto equal_elements = [this, lhs_row_index, rhs_row_index](column_device_view const& l, + column_device_view const& r) { + // Handle null comparison for each element + if (_has_nulls) { + bool const lhs_is_null{l.is_null(lhs_row_index)}; + bool const rhs_is_null{r.is_null(rhs_row_index)}; + if (lhs_is_null and rhs_is_null) { + return _nulls_are_equal == null_equality::EQUAL; + } else if (lhs_is_null != rhs_is_null) { + return false; + } } - } - 
return cudf::type_dispatcher(_lhs.begin()->type(), - element_equality_comparator{}, - _lhs.column(0), - _rhs.column(0), - lhs_row_index, - rhs_row_index); + + // Both elements are non-null, compare their values + element_equality_comparator comparator; + return cudf::type_dispatcher( + l.type(), comparator, l, r, lhs_row_index, rhs_row_index); + }; + + return thrust::equal(thrust::seq, _lhs.begin(), _lhs.end(), _rhs.begin(), equal_elements); } /** @@ -243,11 +251,26 @@ class row_hasher { */ __device__ auto operator()(size_type row_index) const { - if (_has_nulls && _table.column(0).is_null(row_index)) { - return cuda::std::numeric_limits::max(); + element_hasher hasher; + // avoid hash combine call if there is only one column + auto hash = cuda::std::numeric_limits::max(); + if (!_has_nulls || !_table.column(0).is_null(row_index)) { + hash = cudf::type_dispatcher( + _table.column(0).type(), hasher, _seed, _table.column(0), row_index); + } + + for (size_type i = 1; i < _table.num_columns(); ++i) { + if (!(_has_nulls && _table.column(i).is_null(row_index))) { + hash = cudf::hashing::detail::hash_combine( + hash, + cudf::type_dispatcher( + _table.column(i).type(), hasher, _seed, _table.column(i), row_index)); + } else { + hash = cudf::hashing::detail::hash_combine( + hash, cuda::std::numeric_limits::max()); + } } - return cudf::type_dispatcher( - _table.column(0).type(), element_hasher{}, _seed, _table.column(0), row_index); + return hash; } private: diff --git a/cpp/src/table/primitive_row_operators.cu b/cpp/src/table/primitive_row_operators.cu index e95f9855790..d879b6d50f7 100644 --- a/cpp/src/table/primitive_row_operators.cu +++ b/cpp/src/table/primitive_row_operators.cu @@ -18,9 +18,12 @@ #include #include +#include + namespace cudf { bool is_primitive_row_op_compatible(cudf::table_view const& table) { - return table.num_columns() == 1 and cudf::is_numeric(table.column(0).type()); + return std::all_of( + table.begin(), table.end(), [](auto const& col) { return cudf::is_numeric(col.type()); }); } } // namespace cudf diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index ce27a3fbc0a..34067ac9b6c 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -143,6 +143,48 @@ TEST_F(DistinctJoinTest, InnerJoinNoNulls) this->compare_to_reference(build.view(), probe.view(), result, gold.view()); } +TEST_F(DistinctJoinTest, PrimitiveInnerJoinNoNulls) +{ + column_wrapper col0_0{{1, 2, 3, 4, 5}}; + column_wrapper col0_1({0, 0, 3, 4, 5}); + column_wrapper col0_2({9, 9, 9, 9, 9}); + + column_wrapper col1_0{{1, 2, 3, 4, 9}}; + column_wrapper col1_1({0, 0, 0, 4, 4}); + column_wrapper col1_2({9, 9, 9, 0, 9}); + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table build(std::move(cols0)); + Table probe(std::move(cols1)); + + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); + + column_wrapper col_gold_0{{1, 2}}; + column_wrapper col_gold_1({0, 0}); + column_wrapper col_gold_2{{9, 9}}; + column_wrapper col_gold_3{{1, 2}}; + column_wrapper col_gold_4({0, 0}); + column_wrapper col_gold_5{{9, 9}}; + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + 
cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); + + this->compare_to_reference(build.view(), probe.view(), result, gold.view()); +} + TEST_F(DistinctJoinTest, InnerJoinWithNulls) { column_wrapper col0_0{{3, 1, 2, 0, 2}}; @@ -192,6 +234,55 @@ TEST_F(DistinctJoinTest, InnerJoinWithNulls) } } +TEST_F(DistinctJoinTest, PrimitiveInnerJoinWithNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + column_wrapper col0_1({1, 1, 0, 4, 0}, {true, true, false, true, true}); + column_wrapper col0_2{{1, 1, 2, 4, 1}}; + + column_wrapper col1_0{{1, 2, 0, 2, 3}}; + column_wrapper col1_1({1, 0, 1, 0, 1}); + column_wrapper col1_2{{1, 1, 1, 1, 1}, {false, true, true, false, true}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + + Table build(std::move(cols0)); + Table probe(std::move(cols1)); + + // Create gold table once + column_wrapper col_gold_0{{3, 2}}; + column_wrapper col_gold_1({1, 0}, {true, true}); + column_wrapper col_gold_2{{1, 1}}; + column_wrapper col_gold_3{{3, 2}}; + column_wrapper col_gold_4({1, 0}, {true, true}); + column_wrapper col_gold_5{{1, 1}}; + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + Table gold(std::move(cols_gold)); + + // Test with different load factors + std::vector load_factors = {0.5, 1.0}; + + for (auto load_factor : load_factors) { + auto distinct_join = + cudf::distinct_hash_join{build.view(), cudf::null_equality::EQUAL, load_factor}; + auto result = distinct_join.inner_join(probe.view()); + + this->compare_to_reference(build.view(), probe.view(), result, gold.view()); + } +} + TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls) { column_wrapper col0_0{{3, 1, 2, 0, 2}}; @@ -408,6 +499,42 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); } +TEST_F(DistinctJoinTest, PrimitiveLeftJoinNoNulls) +{ + column_wrapper col0_0({3, 1, 2, 0, 3}); + column_wrapper col0_1({0, 1, 2, 4, 1}); + + column_wrapper col1_0({2, 2, 0, 4, 3}); + column_wrapper col1_1({1, 0, 1, 2, 1}); + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table probe(std::move(cols0)); + Table build(std::move(cols1)); + + column_wrapper col_gold_0({3, 1, 2, 0, 3}); + column_wrapper col_gold_1({0, 1, 2, 4, 1}); + column_wrapper col_gold_2{{-1, -1, -1, -1, 3}, {false, false, false, false, true}}; + column_wrapper col_gold_3{{-1, -1, -1, -1, 1}, {false, false, false, false, true}}; + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); + + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; + + this->compare_to_reference( + 
build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); +} + TEST_F(DistinctJoinTest, LeftJoinWithNulls) { column_wrapper col0_0{{3, 1, 2, 0, 2}}; @@ -445,6 +572,43 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); } +TEST_F(DistinctJoinTest, PrimitiveLeftJoinWithNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + column_wrapper col0_1({1, 1, -1, 4, 0}, {true, true, false, true, true}); + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + column_wrapper col1_1({1, 0, 1, 2, 1}); + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + + Table probe(std::move(cols0)); + Table build(std::move(cols1)); + + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; + + column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; + column_wrapper col_gold_1({1, 1, -1, 4, 0}, {true, true, false, true, true}); + column_wrapper col_gold_2{{3, -1, -1, -1, 2}, {true, false, false, false, true}}; + column_wrapper col_gold_3({1, -1, -1, -1, 0}, {true, false, false, false, true}); + + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + Table gold(std::move(cols_gold)); + + this->compare_to_reference( + build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); +} + TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) { auto col0_names_col = strcol_wrapper{ From edbedc1ff3e5cf06b08cb05ccb19d24ad24b0e24 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 29 Aug 2025 08:33:10 -0500 Subject: [PATCH 230/366] revert numba CUDA 13 workaround (#19842) CUDA 13 support was initially added here in #19768 During that work, we faced some runtime issues with conda packages that @brandon-b-miller diagnosed as a missing dependency in `numba-cuda` (https://github.com/NVIDIA/numba-cuda/issues/430). To get past that, we temporarily introduced a runtime dependency on `cuda-nvvm-tools` in this project. That's no longer necessary, thanks to these: * https://github.com/conda-forge/numba-cuda-feedstock/pull/47 * https://github.com/conda-forge/numba-cuda-feedstock/pull/46 This removes that workaround. ## Notes for Reviewers ### Don't we need to change the `numba-cuda` pin? No, the fixes are just in new builds of 0.19.1. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - https://github.com/brandon-b-miller - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/19842 --- conda/environments/all_cuda-130_arch-aarch64.yaml | 1 - conda/environments/all_cuda-130_arch-x86_64.yaml | 1 - conda/recipes/cudf/recipe.yaml | 4 ---- dependencies.yaml | 10 ---------- 4 files changed, 16 deletions(-) diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index bd29b08c758..e18d021ee4e 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -20,7 +20,6 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-nvvm-tools - cuda-python>=13.0.1,<14.0a0 - cuda-sanitizer-api - cuda-version=13.0 diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index 4f06590e585..359ed939f77 100644 --- a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -20,7 +20,6 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-nvvm-tools - cuda-python>=13.0.1,<14.0a0 - cuda-sanitizer-api - cuda-version=13.0 diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index 05b8f6f7b89..2389ed684c1 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -71,10 +71,6 @@ requirements: - pandas >=2.0,<2.4.0dev0 - cupy >=13.6.0 - numba-cuda >=0.19.1,<0.20.0a0 - # TODO: remove cuda-nvvm-tools once https://github.com/NVIDIA/numba-cuda/issues/430 is resolved - # and we move the numba-cuda floor up to a version containing a fix for it - - if: cuda_major == "13" - then: cuda-nvvm-tools - numba >=0.60.0,<0.62.0a0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<20.0.0a0 diff --git a/dependencies.yaml b/dependencies.yaml index fe6d8dcf2bb..3808369ace3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -930,16 +930,6 @@ dependencies: packages: - numba-cuda>=0.19.1,<0.20.0a0 specific: - # TODO: remove cuda-nvvm-tools once https://github.com/NVIDIA/numba-cuda/issues/430 is resolved - # and we move the numba-cuda floor up to a version containing a fix for it - - output_types: [conda] - matrices: - - matrix: - cuda: "13.*" - packages: - - cuda-nvvm-tools - - matrix: - packages: - output_types: [requirements, pyproject] matrices: - matrix: From f0bdf9acace00a841ce9421417312a05a311e724 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 29 Aug 2025 09:21:39 -0700 Subject: [PATCH 231/366] Add join `*_match_context` APIs to hash join (#19835) Related to #18677 This PR moves the strong types `join_match_context` and `join_partition_context` out of the `sort_merge_join` class into the public interface, allowing them to be reused in `hash_join`. It also introduces `*_join_match_context` APIs to `hash_join`, enabling per-row match data retrieval to better distribute cuDF workloads for downstream libraries like Spark. Follow-up work, such as adding partition context retrieval, will be handled in a separate PR to keep the review process simpler.
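## Usage sketch

A minimal sketch of how the new per-row match counts could be consumed (not part of this patch; the helper name `total_inner_join_matches` and the reduction step are illustrative only, built on the `join_match_context` members introduced here):

```cpp
#include <cudf/join/hash_join.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/reduce.h>

#include <cstddef>

// Hypothetical helper: sums the per-probe-row counts carried by
// cudf::join_match_context to size an inner join before materializing
// any gather maps.
std::size_t total_inner_join_matches(cudf::table_view const& build,
                                     cudf::table_view const& probe,
                                     rmm::cuda_stream_view stream)
{
  // Build the hash table once; it can be reused across multiple probes.
  cudf::hash_join joiner{build, cudf::null_equality::EQUAL, stream};

  // One count per probe row: how many build rows it matches.
  auto context = joiner.inner_join_match_context(probe, stream);

  // The total equals the gather-map size a full inner_join() would produce.
  return thrust::reduce(rmm::exec_policy(stream),
                        context._match_counts->begin(),
                        context._match_counts->end(),
                        std::size_t{0});
}
```

For the total alone, `inner_join_size()` is already sufficient; the value of the match context is that the per-row counts also let callers choose probe-table split points so that each partition of the join output fits a memory budget, mirroring the existing `sort_merge_join::inner_join_match_context` / `partitioned_inner_join` workflow.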
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/19835 --- cpp/include/cudf/detail/join/hash_join.cuh | 30 ++ cpp/include/cudf/join/hash_join.hpp | 115 +++++-- cpp/include/cudf/join/join.hpp | 31 ++ cpp/include/cudf/join/sort_merge_join.hpp | 51 +-- cpp/src/join/hash_join.cu | 160 +++++++++- cpp/src/join/sort_merge_join.cu | 23 +- cpp/tests/join/join_tests.cpp | 349 ++++++++++++++++++++- 7 files changed, 673 insertions(+), 86 deletions(-) diff --git a/cpp/include/cudf/detail/join/hash_join.cuh b/cpp/include/cudf/detail/join/hash_join.cuh index c71e1548d3d..4bff2d0b7ed 100644 --- a/cpp/include/cudf/detail/join/hash_join.cuh +++ b/cpp/include/cudf/detail/join/hash_join.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -188,7 +189,36 @@ struct hash_join { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + /** + * @copydoc cudf::hash_join::inner_join_match_context + */ + [[nodiscard]] cudf::join_match_context inner_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + /** + * @copydoc cudf::hash_join::left_join_match_context + */ + [[nodiscard]] cudf::join_match_context left_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + /** + * @copydoc cudf::hash_join::full_join_match_context + */ + [[nodiscard]] cudf::join_match_context full_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + private: + template + void compute_match_counts(cudf::table_view const& probe, + OutputIterator output_iter, + rmm::cuda_stream_view stream) const; + /** * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, * and returns the output indices of `build_table` and `probe_table` as a combined table, diff --git a/cpp/include/cudf/join/hash_join.hpp b/cpp/include/cudf/join/hash_join.hpp index 85af304f6f6..f00fbaf20eb 100644 --- a/cpp/include/cudf/join/hash_join.hpp +++ b/cpp/include/cudf/join/hash_join.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -89,7 +90,7 @@ class hash_join { * @note The `hash_join` object must not outlive the table viewed by `build`, else behavior is * undefined. * - * @throws cudf::logic_error if the build table has no columns + * @throws std::invalid_argument if the build table has no columns * * @param build The build table, from which the hash table is built * @param compare_nulls Controls whether null join-key values should match or not @@ -120,15 +121,15 @@ class hash_join { * an inner join between two tables. @see cudf::inner_join(). Behavior is undefined if the * provided `output_size` is smaller than the actual output size. * + * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * not constructed with null check. + * * @param probe The probe table, from which the tuples are probed * @param output_size Optional value which allows users to specify the exact output size * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. 
* - * @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not - * constructed with null check. - * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing an inner join between two tables with `build` and `probe` * as the join keys . @@ -145,15 +146,15 @@ class hash_join { * a left join between two tables. @see cudf::left_join(). Behavior is undefined if the * provided `output_size` is smaller than the actual output size. * + * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * not constructed with null check. + * * @param probe The probe table, from which the tuples are probed * @param output_size Optional value which allows users to specify the exact output size * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. * - * @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not - * constructed with null check. - * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing a left join between two tables with `build` and `probe` * as the join keys. @@ -170,15 +171,15 @@ class hash_join { * a full join between two tables. @see cudf::full_join(). Behavior is undefined if the * provided `output_size` is smaller than the actual output size. * + * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * not constructed with null check. + * * @param probe The probe table, from which the tuples are probed * @param output_size Optional value which allows users to specify the exact output size * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. * - * @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not - * constructed with null check. - * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct * the result of performing a full join between two tables with `build` and `probe` * as the join keys . @@ -194,12 +195,12 @@ class hash_join { * Returns the exact number of matches (rows) when performing an inner join with the specified * probe table. * + * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * not constructed with null check. + * * @param probe The probe table, from which the tuples are probed * @param stream CUDA stream used for device memory operations and kernel launches * - * @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not - * constructed with null check. - * * @return The exact number of output when performing an inner join between two tables with * `build` and `probe` as the join keys . */ @@ -210,12 +211,12 @@ class hash_join { * Returns the exact number of matches (rows) when performing a left join with the specified probe * table. * + * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * not constructed with null check. 
+ * * @param probe The probe table, from which the tuples are probed * @param stream CUDA stream used for device memory operations and kernel launches * - * @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not - * constructed with null check. - * * @return The exact number of output when performing a left join between two tables with `build` * and `probe` as the join keys . */ @@ -226,14 +227,14 @@ class hash_join { * Returns the exact number of matches (rows) when performing a full join with the specified probe * table. * + * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * not constructed with null check. + * * @param probe The probe table, from which the tuples are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the intermediate table and columns' device * memory. * - * @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not - * constructed with null check. - * * @return The exact number of output when performing a full join between two tables with `build` * and `probe` as the join keys . */ @@ -242,6 +243,82 @@ class hash_join { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; + /** + * @brief Returns context information about matches between the probe and build tables. + * + * This method computes, for each row in the probe table, how many matching rows exist in + * the build table according to inner join semantics, and returns the number of matches through a + * join_match_context object. + * + * This is particularly useful for: + * - Determining the total size of a potential join result without materializing it + * - Planning partitioned join operations for large datasets + * + * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * not constructed with null check. + * + * @param probe The probe table to join with the pre-processed build table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the result device memory + * + * @return A join_match_context object containing the probe table view and a device vector + * of match counts for each row in the probe table + */ + [[nodiscard]] cudf::join_match_context inner_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; + + /** + * @brief Returns context information about matches between the probe and build tables. + * + * This method computes, for each row in the probe table, how many matching rows exist in + * the build table according to left join semantics, and returns the number of matches through a + * join_match_context object. + * + * For left join, every row in the probe table will have at least one match (either with a + * matching row from the build table or with a null placeholder). + * + * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * not constructed with null check. 
+ * + * @param probe The probe table to join with the pre-processed build table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the result device memory + * + * @return A join_match_context object containing the probe table view and a device vector + * of match counts for each row in the probe table + */ + [[nodiscard]] cudf::join_match_context left_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; + + /** + * @brief Returns context information about matches between the probe and build tables. + * + * This method computes, for each row in the probe table, how many matching rows exist in + * the build table according to full join semantics, and returns the number of matches through a + * join_match_context object. + * + * For full join, this includes matches for probe table rows, and the result may need to be + * combined with unmatched rows from the build table to get the complete picture. + * + * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * not constructed with null check. + * + * @param probe The probe table to join with the pre-processed build table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the result device memory + * + * @return A join_match_context object containing the probe table view and a device vector + * of match counts for each row in the probe table + */ + [[nodiscard]] cudf::join_match_context full_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; + private: std::unique_ptr _impl; }; diff --git a/cpp/include/cudf/join/join.hpp b/cpp/include/cudf/join/join.hpp index cf686e40d4b..d1b5e6a5e07 100644 --- a/cpp/include/cudf/join/join.hpp +++ b/cpp/include/cudf/join/join.hpp @@ -33,6 +33,37 @@ namespace CUDF_EXPORT cudf { * @file */ +/** + * @brief Holds context information about matches between tables during a join operation. + * + * This structure stores the left table view and a device vector containing the count of + * matching rows in the right table for each row in the left table. Used primarily by + * inner_join_match_context() to track join match information. + */ +struct join_match_context { + table_view _left_table; ///< View of the left table involved in the join operation + std::unique_ptr<rmm::device_uvector<size_type>> + _match_counts; ///< A device vector containing the count of matching rows in the right table + ///< for each row in left table +}; + +/** + * @brief Stores context information for partitioned join operations. + * + * This structure maintains context for partitioned join operations, containing the match + * context from a previous join operation along with the start and end indices that define + * the current partition of the left table being processed. + * + * Used with partitioned_inner_join() to perform large joins in smaller chunks while + * preserving the context from the initial match operation.
+ */ +struct join_partition_context { + join_match_context + left_table_context; ///< The match context from a previous inner_join_match_context call + size_type left_start_idx; ///< The starting row index of the current left table partition + size_type left_end_idx; ///< The ending row index (exclusive) of the current left table partition +}; + /** * @brief Returns a pair of row index vectors corresponding to an * inner join between the specified tables. diff --git a/cpp/include/cudf/join/sort_merge_join.hpp b/cpp/include/cudf/join/sort_merge_join.hpp index cdc71fc46fb..c7f572d4525 100644 --- a/cpp/include/cudf/join/sort_merge_join.hpp +++ b/cpp/include/cudf/join/sort_merge_join.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -43,38 +44,6 @@ namespace CUDF_EXPORT cudf { */ class sort_merge_join { public: - /** - * @brief Holds context information about matches between tables during a join operation. - * - * This structure stores the left table view and a device vector containing the count of - * matching rows in the right table for each row in the left table. Used primarily by - * inner_join_match_context() to track join match information. - */ - struct match_context { - table_view _left_table; ///< View of the left table involved in the join operation - std::unique_ptr> - _match_counts; ///< A device vector containing the count of matching rows in the right table - ///< for each row in left table - }; - - /** - * @brief Stores context information for partitioned join operations. - * - * This structure maintains context for partitioned join operations, containing the match - * context from a previous join operation along with the start and end indices that define - * the current partition of the left table being processed. - * - * Used with partitioned_inner_join() to perform large joins in smaller chunks while - * preserving the context from the initial match operation. - */ - struct partition_context { - match_context - left_table_context; ///< The match context from a previous inner_join_match_context call - size_type left_start_idx; ///< The starting row index of the current left table partition - size_type - left_end_idx; ///< The ending row index (exclusive) of the current left table partition - }; - /** * @brief Construct a sort-merge join object that pre-processes the right table * on creation, and can be used on subsequent join operations with multiple @@ -126,7 +95,7 @@ class sort_merge_join { * - Determining the total size of a potential join result without materializing it * - Planning partitioned join operations for large datasets * - * The returned match_context can be used directly with partitioned_inner_join() to + * The returned join_match_context can be used directly with partitioned_inner_join() to * process large joins in manageable chunks. 
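+ *
+ * A small illustrative sketch (assumes `join_obj` is a sort_merge_join built
+ * over the right table, as in the partitioned_inner_join example below):
+ * @code{.cpp}
+ * auto context = join_obj.inner_join_match_context(left_table, cudf::sorted::NO);
+ * // context._match_counts has one entry per row of left_table.
+ * @endcode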
* * @param left The left table to join with the pre-processed right table @@ -134,10 +103,10 @@ class sort_merge_join { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the result device memory * - * @return A match_context object containing the left table view and a device vector + * @return A join_match_context object containing the left table view and a device vector * of match counts for each row in the left table */ - match_context inner_join_match_context( + cudf::join_match_context inner_join_match_context( table_view const& left, sorted is_left_sorted, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -147,8 +116,8 @@ class sort_merge_join { * @brief Performs an inner join between a partition of the left table and the right table. * * This method executes an inner join operation between a specific partition of the left table - * (defined by the partition_context) and the right table that was provided when constructing - * the sort_merge_join object. The partition_context must have been previously created by + * (defined by the join_partition_context) and the right table that was provided when constructing + * the sort_merge_join object. The join_partition_context must have been previously created by * calling inner_join_match_context(). * * This partitioning approach enables processing large joins in smaller, memory-efficient chunks, @@ -180,7 +149,7 @@ class sort_merge_join { * size_type end = std::min(start + 1000, left_table.num_rows()); * * // Create partition context - * sort_merge_join::partition_context part_ctx{context, start, end}; + * cudf::join_partition_context part_ctx{context, start, end}; * * // Get join indices for this partition * auto [left_indices, right_indices] = join_obj.partitioned_inner_join(part_ctx); @@ -192,7 +161,7 @@ class sort_merge_join { std::pair>, std::unique_ptr>> partitioned_inner_join( - partition_context const& context, + cudf::join_partition_context const& context, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -318,7 +287,7 @@ class sort_merge_join { * Result: {{1}, {0}} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * @throw std::invalid_argument if number of elements in `left_keys` or `right_keys` * mismatch. * * @param[in] left_keys The left table @@ -360,7 +329,7 @@ sort_merge_inner_join(cudf::table_view const& left_keys, * Result: {{1}, {0}} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * @throw std::invalid_argument if number of elements in `left_keys` or `right_keys` * mismatch. 
* * @param[in] left_keys The left table diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index a2cdf122fe8..06220e299de 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -44,6 +44,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -55,7 +56,7 @@ using hash_table_t = cudf::hash_join::impl_type::hash_table_t; // mixed-join migration template struct pair_fn { - pair_fn(Hasher hash) : _hash{hash} {} + pair_fn(Hasher hash) : _hash{std::move(hash)} {} __device__ cuco::pair operator()(size_type i) const noexcept { @@ -489,8 +490,10 @@ std::size_t get_full_join_size( // Assume all the indices in invalid_index_map are invalid auto invalid_index_map = std::make_unique>(right_table_row_count, stream); - thrust::uninitialized_fill( - rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), + invalid_index_map->begin(), + invalid_index_map->end(), + int32_t{1}); // Functor to check for index validity since left joins can create invalid indices valid_range valid(0, right_table_row_count); @@ -540,7 +543,7 @@ hash_join::hash_join(cudf::table_view const& build, cudf::experimental::row::equality::preprocessed_table::create(_build, stream)} { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty"); + CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty", std::invalid_argument); CUDF_EXPECTS(load_factor > 0 && load_factor <= 1, "Invalid load factor: must be greater than 0 and less than or equal to 1.", std::invalid_argument); @@ -604,7 +607,8 @@ std::size_t hash_join::inner_join_size(cudf::table_view const& probe, if (_is_empty) { return 0; } CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe), - "Probe table has nulls while build table was not hashed with null check."); + "Probe table has nulls while build table was not hashed with null check.", + std::invalid_argument); auto const preprocessed_probe = cudf::experimental::row::equality::preprocessed_table::create(probe, stream); @@ -630,7 +634,8 @@ std::size_t hash_join::left_join_size(cudf::table_view const& probe, if (_is_empty) { return probe.num_rows(); } CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe), - "Probe table has nulls while build table was not hashed with null check."); + "Probe table has nulls while build table was not hashed with null check.", + std::invalid_argument); auto const preprocessed_probe = cudf::experimental::row::equality::preprocessed_table::create(probe, stream); @@ -657,7 +662,8 @@ std::size_t hash_join::full_join_size(cudf::table_view const& probe, if (_is_empty) { return probe.num_rows(); } CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe), - "Probe table has nulls while build table was not hashed with null check."); + "Probe table has nulls while build table was not hashed with null check.", + std::invalid_argument); auto const preprocessed_probe = cudf::experimental::row::equality::preprocessed_table::create(probe, stream); @@ -673,6 +679,112 @@ std::size_t hash_join::full_join_size(cudf::table_view const& probe, mr); } +template +template +void hash_join::compute_match_counts(cudf::table_view const& probe, + OutputIterator output_iter, + rmm::cuda_stream_view stream) const +{ + CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe), + "Probe table has nulls while build table was not hashed with null check.", + std::invalid_argument); + + auto const preprocessed_probe = + 
cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + auto const probe_nulls = cudf::nullate::DYNAMIC{_has_nulls}; + auto const probe_table_num_rows = probe.num_rows(); + + auto compute_counts = [&](auto equality, auto d_hasher) { + auto const iter = cudf::detail::make_counting_transform_iterator(0, pair_fn{d_hasher}); + _hash_table.count_each(iter, + iter + probe_table_num_rows, + equality, + _hash_table.hash_function(), + output_iter, + stream.value()); + }; + + if (cudf::is_primitive_row_op_compatible(_build)) { + auto const d_hasher = cudf::row::primitive::row_hasher{probe_nulls, preprocessed_probe}; + auto const d_equal = cudf::row::primitive::row_equality_comparator{ + probe_nulls, preprocessed_probe, _preprocessed_build, _nulls_equal}; + compute_counts(primitive_pair_equal{d_equal}, d_hasher); + } else { + auto const d_hasher = + cudf::experimental::row::hash::row_hasher{preprocessed_probe}.device_hasher(probe_nulls); + auto const row_comparator = cudf::experimental::row::equality::two_table_comparator{ + preprocessed_probe, _preprocessed_build}; + auto const d_equal = row_comparator.equal_to(probe_nulls, _nulls_equal); + compute_counts(pair_equal{d_equal}, d_hasher); + } +} + +template +cudf::join_match_context hash_join::inner_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + cudf::scoped_range range{"hash_join::inner_join_match_context"}; + + auto match_counts = + std::make_unique>(probe.num_rows(), stream, mr); + + if (_is_empty) { + thrust::fill(rmm::exec_policy_nosync(stream), match_counts->begin(), match_counts->end(), 0); + } else { + compute_match_counts(probe, match_counts->begin(), stream); + } + + return cudf::join_match_context{probe, std::move(match_counts)}; +} + +template +cudf::join_match_context hash_join::left_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + cudf::scoped_range range{"hash_join::left_join_match_context"}; + + auto match_counts = + std::make_unique>(probe.num_rows(), stream, mr); + + if (_is_empty) { + thrust::fill(rmm::exec_policy_nosync(stream), match_counts->begin(), match_counts->end(), 1); + } else { + auto transform = [] __device__(size_type count) { return count == 0 ? 1 : count; }; + auto transformed_output = + thrust::make_transform_output_iterator(match_counts->begin(), transform); + compute_match_counts(probe, transformed_output, stream); + } + + return cudf::join_match_context{probe, std::move(match_counts)}; +} + +template +cudf::join_match_context hash_join::full_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + cudf::scoped_range range{"hash_join::full_join_match_context"}; + + auto match_counts = + std::make_unique>(probe.num_rows(), stream, mr); + + if (_is_empty) { + thrust::fill(rmm::exec_policy_nosync(stream), match_counts->begin(), match_counts->end(), 1); + } else { + auto transform = [] __device__(size_type count) { return count == 0 ? 
1 : count; }; + auto transformed_output = + thrust::make_transform_output_iterator(match_counts->begin(), transform); + compute_match_counts(probe, transformed_output, stream); + } + + return cudf::join_match_context{probe, std::move(match_counts)}; +} + template std::pair>, std::unique_ptr>> @@ -690,7 +802,8 @@ hash_join::probe_join_indices(cudf::table_view const& probe_table, CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe_table), - "Probe table has nulls while build table was not hashed with null check."); + "Probe table has nulls while build table was not hashed with null check.", + std::invalid_argument); auto const preprocessed_probe = cudf::experimental::row::equality::preprocessed_table::create(probe_table, stream); @@ -723,13 +836,15 @@ hash_join::compute_hash_join(cudf::table_view const& probe, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); + CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty", std::invalid_argument); CUDF_EXPECTS(_build.num_columns() == probe.num_columns(), - "Mismatch in number of columns to be joined on"); + "Mismatch in number of columns to be joined on", + std::invalid_argument); CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe), - "Probe table has nulls while build table was not hashed with null check."); + "Probe table has nulls while build table was not hashed with null check.", + std::invalid_argument); if (is_trivial_join(probe, _build, join)) { return std::pair(std::make_unique>(0, stream, mr), @@ -749,7 +864,6 @@ hash_join::~hash_join() = default; hash_join::hash_join(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) - // If we cannot know beforehand about null existence then let's assume that there are nulls. 
: hash_join( build, nullable_join::YES, compare_nulls, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, stream) { @@ -814,4 +928,26 @@ std::size_t hash_join::full_join_size(cudf::table_view const& probe, return _impl->full_join_size(probe, stream, mr); } +cudf::join_match_context hash_join::inner_join_match_context( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + return _impl->inner_join_match_context(probe, stream, mr); +} + +cudf::join_match_context hash_join::left_join_match_context(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + return _impl->left_join_match_context(probe, stream, mr); +} + +cudf::join_match_context hash_join::full_join_match_context(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const +{ + return _impl->full_join_match_context(probe, stream, mr); +} + } // namespace cudf diff --git a/cpp/src/join/sort_merge_join.cu b/cpp/src/join/sort_merge_join.cu index ec015c704b2..28bf8ca923f 100644 --- a/cpp/src/join/sort_merge_join.cu +++ b/cpp/src/join/sort_merge_join.cu @@ -408,7 +408,8 @@ sort_merge_join::sort_merge_join(table_view const& right, cudf::scoped_range range{"sort_merge_join::sort_merge_join"}; // Sanity checks CUDF_EXPECTS(right.num_columns() != 0, - "Number of columns the keys table must be non-zero for a join"); + "Number of columns the keys table must be non-zero for a join", + std::invalid_argument); this->compare_nulls = compare_nulls; @@ -528,9 +529,11 @@ sort_merge_join::inner_join(table_view const& left, cudf::scoped_range range{"sort_merge_join::inner_join"}; // Sanity checks CUDF_EXPECTS(left.num_columns() != 0, - "Number of columns in left keys must be non-zero for a join"); + "Number of columns in left keys must be non-zero for a join", + std::invalid_argument); CUDF_EXPECTS(left.num_columns() == preprocessed_right._null_processed_table_view.num_columns(), - "Number of columns must match for a join"); + "Number of columns must match for a join", + std::invalid_argument); // Preprocessing the left table preprocessed_left._table_view = left; @@ -558,7 +561,7 @@ sort_merge_join::inner_join(table_view const& left, }); } -sort_merge_join::match_context sort_merge_join::inner_join_match_context( +cudf::join_match_context sort_merge_join::inner_join_match_context( table_view const& left, sorted is_left_sorted, rmm::cuda_stream_view stream, @@ -567,9 +570,11 @@ sort_merge_join::match_context sort_merge_join::inner_join_match_context( cudf::scoped_range range{"sort_merge_join::inner_join_match_context"}; // Sanity checks CUDF_EXPECTS(left.num_columns() != 0, - "Number of columns in left keys must be non-zero for a join"); + "Number of columns in left keys must be non-zero for a join", + std::invalid_argument); CUDF_EXPECTS(left.num_columns() == preprocessed_right._null_processed_table_view.num_columns(), - "Number of columns must match for a join"); + "Number of columns must match for a join", + std::invalid_argument); // Preprocessing the left table preprocessed_left._table_view = left; @@ -606,18 +611,18 @@ sort_merge_join::match_context sort_merge_join::inner_join_match_context( mapping.begin(), unprocessed_matches_per_row.begin()); stream.synchronize(); - return match_context{ + return join_match_context{ left, std::make_unique>(std::move(unprocessed_matches_per_row))}; } - return match_context{left, std::move(matches_per_row)}; + return join_match_context{left, 
std::move(matches_per_row)}; }); } // left_partition_end exclusive std::pair>, std::unique_ptr>> -sort_merge_join::partitioned_inner_join(sort_merge_join::partition_context const& context, +sort_merge_join::partitioned_inner_join(cudf::join_partition_context const& context, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 1f371576d55..f1114e761f9 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -282,10 +282,10 @@ TEST_P(JoinParameterizedTest, InvalidInput) cudf::table_view right({right_first_col, right_second_col, right_third_col}); EXPECT_THROW(inner_join(left, right, {0, 1}, {0, 1, 2}, cudf::null_equality::EQUAL, algo), - cudf::logic_error); + std::invalid_argument); EXPECT_THROW(inner_join(left, right, {}, {0, 1, 2}, cudf::null_equality::EQUAL, algo), - cudf::logic_error); + std::invalid_argument); } TEST_P(JoinParameterizedTest, EmptySentinelRepro) @@ -954,10 +954,9 @@ TEST_F(JoinTest, PartitionedInnerJoinWithNulls) cudf::sort_merge_join obj(t1.select(right_on), cudf::sorted::NO, compare_nulls, stream); auto match_context = obj.inner_join_match_context( t0.select(left_on), cudf::sorted::NO, stream, cudf::get_current_device_resource_ref()); - auto partition_context = cudf::sort_merge_join::partition_context{std::move(match_context), 0, 0}; + auto partition_context = cudf::join_partition_context{std::move(match_context), 0, 0}; - auto join_and_gather = [&t0, &t1, &obj, stream]( - cudf::sort_merge_join::partition_context const& cxt) { + auto join_and_gather = [&t0, &t1, &obj, stream](cudf::join_partition_context const& cxt) { auto const [left_join_indices, right_join_indices] = obj.partitioned_inner_join(cxt, stream, cudf::get_current_device_resource_ref()); @@ -1969,6 +1968,346 @@ TEST_F(JoinTest, HashJoinLargeOutputSize) EXPECT_EQ(col_size * col_size, output_size); } +TEST_F(JoinTest, HashJoinInnerMatchContext) +{ + // Test inner join match context functionality with multiple matches and nulls + // Use the same test data as SortMergeInnerJoinSizePerRowWithNulls for consistency + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, {true, false, true, true, true}); + column_wrapper col1_2{{1, 0, 1, 2, 1}, {true, false, true, true, true}}; + + CVector cols0, cols1; + cols0.emplace_back(col0_0.release()); + cols0.emplace_back(col0_1.release()); + cols0.emplace_back(col0_2.release()); + cols1.emplace_back(col1_0.release()); + cols1.emplace_back(col1_1.release()); + cols1.emplace_back(col1_2.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + // Test single column join with null_equality::EQUAL + { + cudf::hash_join hash_join(t1.select({0}), cudf::null_equality::EQUAL); + auto match_context = hash_join.inner_join_match_context(t0.select({0})); + + // Check the match counts for each row + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // Expected: Row 0(3)=1 match, Row 1(1)=0 matches, Row 2(2)=2 matches, Row 3(0)=1 match, Row + // 4(2)=2 matches + std::vector expected_counts = {1, 0, 2, 1, 2}; + EXPECT_EQ(h_match_counts, expected_counts); + + // Verify total matches 
equals inner join size + cudf::size_type total_matches = + std::accumulate(h_match_counts.begin(), h_match_counts.end(), 0); + auto inner_join_size = hash_join.inner_join_size(t0.select({0})); + EXPECT_EQ(total_matches, inner_join_size); + } + + // Test multi-column join with null_equality::EQUAL + { + cudf::hash_join hash_join(t1.select({0, 1}), cudf::null_equality::EQUAL); + auto match_context = hash_join.inner_join_match_context(t0.select({0, 1})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // Expected: Row 0(3,s1)=1 match, Row 1(1,s1)=0 matches, Row 2(2,null)=1 match, Row 3(0,s4)=0 + // matches, Row 4(2,s0)=0 matches + std::vector expected_counts = {1, 0, 1, 0, 0}; + EXPECT_EQ(h_match_counts, expected_counts); + } + + // Test single column join with null_equality::UNEQUAL + { + cudf::hash_join hash_join(t1.select({0}), cudf::null_equality::UNEQUAL); + auto match_context = hash_join.inner_join_match_context(t0.select({0})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // Same as EQUAL for single column since nulls don't affect the integer column matching + std::vector expected_counts = {1, 0, 2, 1, 2}; + EXPECT_EQ(h_match_counts, expected_counts); + } + + // Test multi-column join with null_equality::UNEQUAL + { + cudf::hash_join hash_join(t1.select({0, 1}), cudf::null_equality::UNEQUAL); + auto match_context = hash_join.inner_join_match_context(t0.select({0, 1})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // With UNEQUAL, rows with nulls should not match: Row 0(3,s1)=1 match, others=0 matches + std::vector expected_counts = {1, 0, 0, 0, 0}; + EXPECT_EQ(h_match_counts, expected_counts); + } +} + +TEST_F(JoinTest, HashJoinLeftMatchContext) +{ + // Test left join match context functionality with comprehensive null handling + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true}); + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, {true, false, true, true, true}); + + CVector cols0, cols1; + cols0.emplace_back(col0_0.release()); + cols0.emplace_back(col0_1.release()); + cols1.emplace_back(col1_0.release()); + cols1.emplace_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + // Test single column join - for left join, every row should have at least 1 match + { + cudf::hash_join hash_join(t1.select({0}), cudf::null_equality::EQUAL); + auto match_context = hash_join.left_join_match_context(t0.select({0})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // For left join: Row 0(3)=1 match, Row 1(1)=1 match(null), Row 2(2)=2 matches, Row 3(0)=1 + // match, Row 4(2)=2 matches + std::vector expected_counts = {1, 1, 2, 1, 2}; + EXPECT_EQ(h_match_counts, expected_counts); + + // Verify total matches equals left join size + cudf::size_type total_matches = + 
std::accumulate(h_match_counts.begin(), h_match_counts.end(), 0); + auto left_join_size = hash_join.left_join_size(t0.select({0})); + EXPECT_EQ(total_matches, left_join_size); + } + + // Test multi-column join with null_equality::EQUAL + { + cudf::hash_join hash_join(t1.select({0, 1}), cudf::null_equality::EQUAL); + auto match_context = hash_join.left_join_match_context(t0.select({0, 1})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // For left join, all rows get at least 1 match: Row 0(3,s1)=1 match, others=1 match (null) + std::vector expected_counts = {1, 1, 1, 1, 1}; + EXPECT_EQ(h_match_counts, expected_counts); + } + + // Test multi-column join with null_equality::UNEQUAL + { + cudf::hash_join hash_join(t1.select({0, 1}), cudf::null_equality::UNEQUAL); + auto match_context = hash_join.left_join_match_context(t0.select({0, 1})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // For left join with UNEQUAL, all rows still get at least 1 match (nulls for unmatched) + std::vector expected_counts = {1, 1, 1, 1, 1}; + EXPECT_EQ(h_match_counts, expected_counts); + } +} + +TEST_F(JoinTest, HashJoinFullMatchContext) +{ + // Test full join match context functionality with same comprehensive data + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true}); + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}, {true, false, true, true, true}); + + CVector cols0, cols1; + cols0.emplace_back(col0_0.release()); + cols0.emplace_back(col0_1.release()); + cols1.emplace_back(col1_0.release()); + cols1.emplace_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + // Test single column join - for full join probe side, every row should have at least 1 match + { + cudf::hash_join hash_join(t1.select({0}), cudf::null_equality::EQUAL); + auto match_context = hash_join.full_join_match_context(t0.select({0})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // For full join: Row 0(3)=1 match, Row 1(1)=1 match(null), Row 2(2)=2 matches, Row 3(0)=1 + // match, Row 4(2)=2 matches + std::vector expected_counts = {1, 1, 2, 1, 2}; + EXPECT_EQ(h_match_counts, expected_counts); + } + + // Test multi-column join + { + cudf::hash_join hash_join(t1.select({0, 1}), cudf::null_equality::EQUAL); + auto match_context = hash_join.full_join_match_context(t0.select({0, 1})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // For full join, all rows get at least 1 match + std::vector expected_counts = {1, 1, 1, 1, 1}; + EXPECT_EQ(h_match_counts, expected_counts); + } +} + +TEST_F(JoinTest, HashJoinMatchContextEmptyBuild) +{ + // Test match context with empty build table + column_wrapper col0_0{{3, 1, 2}}; + column_wrapper col1_0{}; // Empty + + CVector cols0, cols1; + cols0.emplace_back(col0_0.release()); + 
cols1.emplace_back(col1_0.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + cudf::hash_join hash_join(t1, cudf::null_equality::EQUAL); + + // Test inner join match context + { + auto match_context = hash_join.inner_join_match_context(t0); + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // All should be 0 for inner join with empty build table + std::vector expected_counts = {0, 0, 0}; + EXPECT_EQ(h_match_counts, expected_counts); + } + + // Test left join match context + { + auto match_context = hash_join.left_join_match_context(t0); + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // All should be 1 for left join (null matches) + std::vector expected_counts = {1, 1, 1}; + EXPECT_EQ(h_match_counts, expected_counts); + } + + // Test full join match context + { + auto match_context = hash_join.full_join_match_context(t0); + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // All should be 1 for full join (null matches) + std::vector expected_counts = {1, 1, 1}; + EXPECT_EQ(h_match_counts, expected_counts); + } +} + +TEST_F(JoinTest, HashJoinMatchContextDuplicatesAndEdgeCases) +{ + // Test with duplicate keys to ensure multiple matches work correctly + column_wrapper col0_0{{1, 1, 2, 2, 3}}; + strcol_wrapper col0_1({"a", "a", "b", "b", "c"}); + + column_wrapper col1_0{{1, 1, 1, 2, 4}}; + strcol_wrapper col1_1({"a", "a", "a", "b", "d"}); + + CVector cols0, cols1; + cols0.emplace_back(col0_0.release()); + cols0.emplace_back(col0_1.release()); + cols1.emplace_back(col1_0.release()); + cols1.emplace_back(col1_1.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + // Test inner join with multiple matches per row + { + cudf::hash_join hash_join(t1.select({0}), cudf::null_equality::EQUAL); + auto match_context = hash_join.inner_join_match_context(t0.select({0})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // Row 0(1)=3 matches, Row 1(1)=3 matches, Row 2(2)=1 match, Row 3(2)=1 match, Row 4(3)=0 + // matches + std::vector expected_counts = {3, 3, 1, 1, 0}; + EXPECT_EQ(h_match_counts, expected_counts); + + // Verify total matches + cudf::size_type total_matches = + std::accumulate(h_match_counts.begin(), h_match_counts.end(), 0); + auto inner_join_size = hash_join.inner_join_size(t0.select({0})); + EXPECT_EQ(total_matches, inner_join_size); + } + + // Test multi-column join + { + cudf::hash_join hash_join(t1.select({0, 1}), cudf::null_equality::EQUAL); + auto match_context = hash_join.inner_join_match_context(t0.select({0, 1})); + + std::vector h_match_counts(t0.num_rows()); + CUDF_CUDA_TRY(cudaMemcpy(h_match_counts.data(), + match_context._match_counts->data(), + sizeof(cudf::size_type) * t0.num_rows(), + cudaMemcpyDeviceToHost)); + + // Row 0(1,a)=3 matches, Row 1(1,a)=3 matches, Row 2(2,b)=1 match, Row 3(2,b)=1 match, Row + // 4(3,c)=0 matches + std::vector expected_counts = {3, 3, 1, 1, 0}; + 
EXPECT_EQ(h_match_counts, expected_counts); + } +} + struct JoinDictionaryTest : public cudf::test::BaseFixture {}; TEST_F(JoinDictionaryTest, LeftJoinNoNulls) From d5a1d556143cf3faed5c668e55aaa339c6f1f884 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 29 Aug 2025 11:24:27 -0700 Subject: [PATCH 232/366] Pin pytest-rerunfailures<16 (#19846) xref https://github.com/pytest-dev/pytest-rerunfailures/issues/302 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/19846 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-130_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-130_arch-x86_64.yaml | 2 +- dependencies.yaml | 3 ++- python/cudf/pyproject.toml | 2 +- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 466aaf5d7fa..8227b760b36 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -75,7 +75,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures +- pytest-rerunfailures<16.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 1679dfd5d83..dd2aa0a6e71 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -76,7 +76,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures +- pytest-rerunfailures<16.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index e18d021ee4e..f550d88d4b3 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -75,7 +75,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures +- pytest-rerunfailures<16.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index 359ed939f77..d6103d5e73b 100644 --- a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -76,7 +76,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures +- pytest-rerunfailures<16.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash diff --git a/dependencies.yaml b/dependencies.yaml index 3808369ace3..3dc704cebf4 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -1164,7 +1164,8 @@ dependencies: - nbconvert - nbformat - openpyxl - - pytest-rerunfailures + # https://github.com/pytest-dev/pytest-rerunfailures/issues/302 + - pytest-rerunfailures<16.0 depends_on_dask_cuda: common: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 0c6c84bdca1..50c6fd782e8 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -76,7 +76,7 @@ cudf-pandas-tests = [ "nbconvert", "nbformat", "openpyxl", - "pytest-rerunfailures", + "pytest-rerunfailures<16.0", ] 
 # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

 [project.urls]

From 6237d9429061e32d1c0969650794805a1b28c476 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Fri, 29 Aug 2025 15:38:54 -0500
Subject: [PATCH 233/366] Fix flaky DataFrame `to_string` test (#19847)

Fixes an issue where a test, `test_dataframe_to_string_wide`, sometimes
failed due to improperly falling back to console-based settings.

Authors:
  - https://github.com/brandon-b-miller
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/19847
---
 python/cudf/cudf/tests/test_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 8d2e7f9a707..ecd68f46e46 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -373,7 +373,7 @@ def test_dataframe_to_string_with_masked_data():
 def test_dataframe_to_string_wide():
     # Test basic
     df = cudf.DataFrame({f"a{i}": [0, 1, 2] for i in range(100)})
-    with pd.option_context("display.max_columns", 0):
+    with pd.option_context("display.max_columns", 16):
         got = df.to_string()

         expect = textwrap.dedent(

From b47101312493e3fb1d5c013962c473c9e39de1f9 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 29 Aug 2025 16:21:44 -0700
Subject: [PATCH 234/366] Fix .str.replace ignoring n for single character
 replacements (#19848)

closes https://github.com/rapidsai/cudf/issues/19844

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Tom Augspurger (https://github.com/TomAugspurger)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Tom Augspurger (https://github.com/TomAugspurger)

URL: https://github.com/rapidsai/cudf/pull/19848
---
 python/cudf/cudf/core/accessors/string.py           | 4 +---
 python/cudf/cudf/core/column/string.py              | 5 ++++-
 python/cudf/cudf/tests/series/accessors/test_str.py | 8 ++++++++
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/accessors/string.py b/python/cudf/cudf/core/accessors/string.py
index 0b3f14d6267..7b85271e057 100644
--- a/python/cudf/cudf/core/accessors/string.py
+++ b/python/cudf/cudf/core/accessors/string.py
@@ -994,9 +994,6 @@ def replace(
                 as_column(repl, dtype=CUDF_STRING_DTYPE),  # type: ignore[arg-type]
             )
             return self._return_or_inplace(result)
-        # Pandas treats 0 as all
-        if n == 0:
-            n = -1

         # If 'pat' is re.Pattern then get the pattern string from it
         if regex and isinstance(pat, re.Pattern):
@@ -1017,6 +1014,7 @@ def replace(
         result = self._column.replace_str(
             pat,  # type: ignore[arg-type]
             pa_repl,
+            n,
         )
         return self._return_or_inplace(result)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 1fa05377099..a947ae7a275 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1279,11 +1279,14 @@ def replace_re(
         return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]

     @acquire_spill_lock()
-    def replace_str(self, pattern: str, replacement: pa.Scalar) -> Self:
+    def replace_str(
+        self, pattern: str, replacement: pa.Scalar, max_replace_count: int = -1
+    ) -> Self:
        plc_result = plc.strings.replace.replace(
self.to_pylibcudf(mode="read"), pa_scalar_to_plc_scalar(pa.scalar(pattern)), pa_scalar_to_plc_scalar(replacement), + max_replace_count, ) return type(self).from_pylibcudf(plc_result) # type: ignore[return-value] diff --git a/python/cudf/cudf/tests/series/accessors/test_str.py b/python/cudf/cudf/tests/series/accessors/test_str.py index 509ade12ce1..f6efb7a2bc0 100644 --- a/python/cudf/cudf/tests/series/accessors/test_str.py +++ b/python/cudf/cudf/tests/series/accessors/test_str.py @@ -2296,6 +2296,14 @@ def test_string_replace_zero_length(ps_gs, pat): assert_eq(expect, got) +@pytest.mark.parametrize("n", [-1, 0, 1]) +def test_string_replace_n(n): + data = ["a,b,c", "d,e,f,g"] + expect = pd.Series(data).str.replace(pat=",", repl="_", n=n) + got = cudf.Series(data).str.replace(pat=",", repl="_", n=n) + assert_eq(expect, got) + + @pytest.mark.parametrize( "pat,regex", [ From 8da8ef0ed6d222ec4f96650f8430d70e49027f6f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 29 Aug 2025 18:29:28 -0500 Subject: [PATCH 235/366] Update `pandas-tests-diff` to only display GPU/CPU usage metrics (#19210) Resolves: #19204 This PR updates `pandas-tests-diff` to only display GPU/CPU usage metrics instead of removing the job altogether. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19210 --- .github/workflows/pandas-tests.yaml | 7 +- .github/workflows/pr.yaml | 12 +- ci/cudf_pandas_scripts/pandas-tests/diff.sh | 9 +- .../pandas-tests/job-summary.py | 124 ++++++++++++------ ci/cudf_pandas_scripts/pandas-tests/run.sh | 6 +- .../pandas/scripts/summarize-test-results.py | 31 +++++ 6 files changed, 136 insertions(+), 53 deletions(-) diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 085c17bd9f2..7b4fed24910 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -22,11 +22,14 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: - matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.0", "LINUX_VER": "ubuntu24.04", "GPU": "l4", "DRIVER": "latest", "DEPENDENCIES": "newest"}]' build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/citestwheel:25.10-cuda13.0.0-ubuntu24.04-py3.13" script: ci/cudf_pandas_scripts/pandas-tests/run.sh main + file_to_upload: ./artifacts/main-results.json + artifact-name: main-results.json diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 10904fb1425..d46de789706 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -359,14 +359,18 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cudf_pandas with: - matrix_filter: '[{"ARCH": "amd64", "PY_VER": "3.13", "CUDA_VER": "13.0.0", "LINUX_VER": "ubuntu24.04", "GPU": "l4", 
"DRIVER": "latest", "DEPENDENCIES": "newest"}]' build_type: pull-request + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/citestwheel:25.10-cuda13.0.0-ubuntu24.04-py3.13" script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr - # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. - test_summary_show: "none" + file_to_upload: ./artifacts/pr-results.json + artifact-name: pr-results.json pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh index f84776ad173..e30236fdbf1 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh @@ -13,8 +13,6 @@ rapids-logger "Github job name: ${GH_JOB_NAME}" rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}" PY_VER="313" -RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" -PR_ARTIFACT=$(rapids-s3-path)cuda${RAPIDS_CUDA_MAJOR}_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json rapids-logger "Fetching latest available results from nightly" aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/cudf/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::].[Key]" --output text | tee s3_output.txt @@ -22,11 +20,12 @@ COMPARE_ENV=$(tail -n 1 s3_output.txt) rapids-logger "Latest available results from nightly: ${COMPARE_ENV}" aws s3 cp "s3://rapids-downloads/${COMPARE_ENV}" main-results.json -aws s3 cp "$PR_ARTIFACT" pr-results.json - +# TODO: To be enabled in a follow-up PR. +# MAIN_RUN_ID=$(gh run list -w "Pandas Test Job" -b branch-25.10 --status success --limit 7 --json databaseId --jq ".[0].databaseId") +gh run download $GITHUB_RUN_ID -n pr-results.json # Compute the diff and prepare job summary: python -m pip install pandas tabulate -python ci/cudf_pandas_scripts/pandas-tests/job-summary.py main-results.json pr-results.json | tee summary.txt >> "$GITHUB_STEP_SUMMARY" +python ci/cudf_pandas_scripts/pandas-tests/job-summary.py main-results.json pr-results.json "${RAPIDS_FULL_VERSION}" | tee summary.txt >> "$GITHUB_STEP_SUMMARY" COMMENT=$(head -1 summary.txt | grep -oP '\d+/\d+ \(\d+\.\d+%\).*?(a decrease by|an increase by) \d+\.\d+%') echo "$COMMENT" diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index af3e28f440f..60668280d7e 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -12,41 +12,85 @@ def get_total_and_passed(results): total_failed = 0 total_errored = 0 total_passed = 0 + total_skipped = 0 + total_xfailed_by_cudf_pandas = 0 + total_skipped_by_cudf_pandas = 0 for module_name, row in results.items(): total_failed += row.get("failed", 0) total_errored += row.get("errored", 0) total_passed += row.get("passed", 0) - total_tests = total_failed + total_errored + total_passed - return total_tests, total_passed + total_skipped += row.get("skipped", 0) + total_xfailed_by_cudf_pandas += row.get("xfailed_by_cudf_pandas", 0) + total_skipped_by_cudf_pandas += row.get("skipped_by_cudf_pandas", 0) + total_tests = total_failed + total_errored + total_passed + total_skipped + return ( + total_tests, + total_passed, + total_xfailed_by_cudf_pandas, + total_skipped_by_cudf_pandas, + total_skipped, + ) main_json = 
sys.argv[1] pr_json = sys.argv[2] +branch_version = sys.argv[3] # read the results of summarize-test-results.py --summary with open(main_json) as f: main_results = json.load(f) -main_total, main_passed = get_total_and_passed(main_results) +( + main_total, + main_passed, + main_xfailed_by_cudf_pandas, + main_skipped_by_cudf_pandas, + main_skipped, +) = get_total_and_passed(main_results) with open(pr_json) as f: pr_results = json.load(f) -pr_total, pr_passed = get_total_and_passed(pr_results) +( + pr_total, + pr_passed, + pr_xfailed_by_cudf_pandas, + pr_skipped_by_cudf_pandas, + pr_skipped, +) = get_total_and_passed(pr_results) passing_percentage = pr_passed / pr_total * 100 -pass_rate_change = abs(pr_passed - main_passed) / main_passed * 100 -rate_change_type = "a decrease" if pr_passed < main_passed else "an increase" - -comment = ( - "Merging this PR would result in " - f"{pr_passed}/{pr_total} ({passing_percentage:.2f}%) " - "Pandas tests passing, " - f"{rate_change_type} by " - f"{pass_rate_change:.2f}%. " - f"Trunk stats: {main_passed}/{main_total}." + + +metrics_df = pd.DataFrame( + { + "This PR": [ + pr_total, + pr_passed, + pr_skipped_by_cudf_pandas, + pr_xfailed_by_cudf_pandas, + pr_skipped + - (pr_skipped_by_cudf_pandas + pr_xfailed_by_cudf_pandas), + ], + f"branch-{branch_version}": [ + main_total, + main_passed, + main_skipped_by_cudf_pandas, + main_xfailed_by_cudf_pandas, + main_skipped + - (main_skipped_by_cudf_pandas + main_xfailed_by_cudf_pandas), + ], + }, + index=[ + "Total tests", + "Passed tests", + "cudf.Pandas Skipped", + "cudf.Pandas xFailed", + "pandas skipped", + ], ) def emoji_passed(x): + """Format number with emoji: positive -> ✅, negative -> ❌""" if x > 0: return f"{x}✅" elif x < 0: @@ -56,6 +100,7 @@ def emoji_passed(x): def emoji_failed(x): + """Format number with emoji: positive -> ❌, negative -> ✅ (inverse of emoji_passed)""" if x > 0: return f"{x}❌" elif x < 0: @@ -67,6 +112,7 @@ def emoji_failed(x): # convert pr_results to a pandas DataFrame and then a markdown table pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() +# Calculate CPU and GPU usage percentages for main branch total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"] main_df["CPU Usage"] = ( (main_df["_slow_function_call"] / total_usage) * 100.0 @@ -75,6 +121,7 @@ def emoji_failed(x): (main_df["_fast_function_call"] / total_usage) * 100.0 ).round(1) +# Calculate CPU and GPU usage percentages for PR total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"] pr_df["CPU Usage"] = ( (pr_df["_slow_function_call"] / total_usage) * 100.0 @@ -83,17 +130,20 @@ def emoji_failed(x): (pr_df["_fast_function_call"] / total_usage) * 100.0 ).round(1) +# Calculate average usages cpu_usage_mean = pr_df["CPU Usage"].mean().round(2) gpu_usage_mean = pr_df["GPU Usage"].mean().round(2) - -gpu_usage_rate_change = abs( +gpu_usage_rate_change = ( pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean() -) +).round(2) + +# Handle NaN values pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0) pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0) main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0) main_df["GPU Usage"] = main_df["GPU Usage"].fillna(0) +# Calculate differences between PR and main diff_df = pr_df - main_df diff_df["CPU Usage"] = diff_df["CPU Usage"].round(1).fillna(0) diff_df["GPU Usage"] = diff_df["GPU Usage"].round(1).fillna(0) @@ -102,59 +152,51 @@ def emoji_failed(x): 
pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%" pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%" -pr_df = pr_df[ - ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"] -] -diff_df = diff_df[ - ["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"] -] +# Select relevant columns +pr_df = pr_df[["total", "CPU Usage", "GPU Usage"]] +diff_df = diff_df[["total", "CPU Usage", "GPU Usage"]] + +# Rename diff columns to indicate they are differences diff_df.columns = diff_df.columns + "_diff" -diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) -diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed) -diff_df["skipped_diff"] = diff_df["skipped_diff"].map(emoji_failed) +# Combine PR results with differences df = pd.concat([pr_df, diff_df], axis=1) df = df.rename_axis("Test module") +# Rename columns for better readability df = df.rename( columns={ "total": "Total tests", - "passed": "Passed tests", - "failed": "Failed tests", - "skipped": "Skipped tests", "total_diff": "Total delta", - "passed_diff": "Passed delta", - "failed_diff": "Failed delta", - "skipped_diff": "Skipped delta", "CPU Usage_diff": "CPU Usage delta", "GPU Usage_diff": "GPU Usage delta", } ) + +# Sort by CPU usage delta and total tests df = df.sort_values(by=["CPU Usage delta", "Total tests"], ascending=False) + +# Apply emoji formatting to usage deltas df["CPU Usage delta"] = df["CPU Usage delta"].map(emoji_failed) df["GPU Usage delta"] = df["GPU Usage delta"].map(emoji_passed) + +# Select final columns to display df = df[ [ "Total tests", "CPU Usage delta", "GPU Usage delta", - "Passed tests", - "Failed tests", - "Skipped tests", "CPU Usage", "GPU Usage", "Total delta", - "Passed delta", - "Failed delta", - "Skipped delta", ] ] -print(comment) +# Print summary and results +print(metrics_df.to_markdown()) print() print( - f"Average GPU usage: {gpu_usage_mean}% {'an increase' if gpu_usage_rate_change > 0 else 'a decrease'} by {gpu_usage_rate_change}%" + f"Average GPU usage: {gpu_usage_mean}% ({gpu_usage_rate_change:+.2f}% change from trunk)" ) -print() print(f"Average CPU usage: {cpu_usage_mean}%") print() print("Here are the results of running the Pandas tests against this PR:") diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 25ce70da01f..715f7c71649 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -11,6 +11,9 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +rapids-logger "Check GPU usage" +nvidia-smi + PANDAS_TESTS_BRANCH=${1} RAPIDS_FULL_VERSION=$(<./VERSION) rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids-version $RAPIDS_FULL_VERSION" @@ -34,7 +37,7 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" timeout 90m bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ - --numprocesses 5 \ + --numprocesses 6 \ --tb=line \ -vv \ --disable-warnings \ @@ -51,5 +54,6 @@ RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"} mkdir -p "${RAPIDS_ARTIFACTS_DIR}" mv pandas-testing/"${SUMMARY_FILE_NAME}" "${RAPIDS_ARTIFACTS_DIR}"/ rapids-upload-to-s3 "${RAPIDS_ARTIFACTS_DIR}"/"${SUMMARY_FILE_NAME}" "${RAPIDS_ARTIFACTS_DIR}" +mv "${RAPIDS_ARTIFACTS_DIR}"/"${SUMMARY_FILE_NAME}" "${RAPIDS_ARTIFACTS_DIR}"/${PANDAS_TESTS_BRANCH}-results.json rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git 
a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index 6a91f9b07c9..4493550d941 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -29,9 +29,11 @@ def get_per_module_results(log_file_name): for line in f: try: line = json.loads(line) + except Exception: line = {} if "outcome" in line: + was_skipped_by_cudf_pandas = False outcome = line["outcome"] # outcome can be "passed", "failed", or "skipped". # Depending on other fields, it can indicate @@ -42,6 +44,16 @@ def get_per_module_results(log_file_name): # if the test failed during setup or teardown, # it counts as an "errored" test: outcome = "errored" + elif outcome == "skipped": + longrepr = line.get("longrepr", []) + if ( + longrepr is not None + and "Skipped: XPASSes with cudf.pandas enabled." + in longrepr + ): + was_skipped_by_cudf_pandas = True + else: + continue else: # we don't care about other outcomes during # setup or teardown @@ -50,6 +62,11 @@ def get_per_module_results(log_file_name): if line.get("wasxfail", False) and outcome == "passed": # it's an xpassed test outcome = "failed" + + was_xfailed_by_cudf_pandas = ( + line.get("wasxfail", "") + == "Fails with cudf.pandas enabled." + ) module_name = ( line["nodeid"] .split("::")[0] @@ -58,8 +75,22 @@ def get_per_module_results(log_file_name): per_module_results.setdefault(module_name, {}) per_module_results[module_name].setdefault("total", 0) per_module_results[module_name].setdefault(outcome, 0) + per_module_results[module_name].setdefault( + "xfailed_by_cudf_pandas", 0 + ) + per_module_results[module_name].setdefault( + "skipped_by_cudf_pandas", 0 + ) per_module_results[module_name]["total"] += 1 per_module_results[module_name][outcome] += 1 + if was_xfailed_by_cudf_pandas: + per_module_results[module_name][ + "xfailed_by_cudf_pandas" + ] += 1 + if was_skipped_by_cudf_pandas: + per_module_results[module_name][ + "skipped_by_cudf_pandas" + ] += 1 directory = os.path.dirname(log_file_name) pattern = os.path.join(directory, "function_call_counts_worker_*.json") From 9bf911cbfc7649aa8811cbc90d85cabfb2c4ca65 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 29 Aug 2025 16:38:25 -0700 Subject: [PATCH 236/366] Disallow loc.__setitem__ with list-like indexer when list elements not in index (#19851) closes https://github.com/rapidsai/cudf/issues/19843 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/19851 --- python/cudf/cudf/core/dataframe.py | 2 ++ python/cudf/cudf/core/series.py | 5 ++++- .../cudf/tests/dataframe/indexing/test_loc.py | 20 +++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9833768a15a..4571a94cd03 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -387,6 +387,8 @@ def _setitem_tuple_arg(self, key, value): for col in columns_df._column_names: self._frame[col].loc[key[0]] = value except KeyError: + if not is_scalar(key[0]): + raise # TODO: There is a potential bug here if the inplace modifications # done above fail half-way we are left with a partially modified # frame. Need to handle this case better. 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index eaffa06d96d..f2adf407d2c 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -379,7 +379,10 @@ def _loc_to_iloc(self, arg):
                     self._frame, Index._from_column(col)
                 )
                 if indices.null_count > 0:
-                    raise KeyError("label scalar is out of bound")
+                    missing = (
+                        indices[indices.isnull()].index.to_pandas().tolist()
+                    )
+                    raise KeyError(f"{missing} not in the index.")

                 return indices

diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_loc.py b/python/cudf/cudf/tests/dataframe/indexing/test_loc.py
index 82277af12c5..7821347cbe8 100644
--- a/python/cudf/cudf/tests/dataframe/indexing/test_loc.py
+++ b/python/cudf/cudf/tests/dataframe/indexing/test_loc.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.
+import re

 import pandas as pd
 import pytest
@@ -64,6 +65,25 @@ def test_loc_setitem_extend_existing_12505():
     assert_eq(df, cdf)


+def test_loc_setitem_list_arg_missing_raises():
+    data = {"a": [0]}
+    gdf = cudf.DataFrame(data)
+    pdf = pd.DataFrame(data)
+
+    cudf_msg = re.escape("[1] not in the index.")
+    with pytest.raises(KeyError, match=cudf_msg):
+        gdf.loc[[1]] = 1
+
+    with pytest.raises(KeyError, match=cudf_msg):
+        gdf.loc[[1], "a"] = 1
+
+    with pytest.raises(KeyError):
+        pdf.loc[[1]] = 1
+
+    with pytest.raises(KeyError):
+        pdf.loc[[1], "a"] = 1
+
+
 @pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12801")
 def test_loc_setitem_add_column_partial_12801():
     df = pd.DataFrame({"a": [0, 1, 2]})

From b15d7aae5d5ea66e5ff427d5353531b976fad9e4 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora"
Date: Sat, 30 Aug 2025 12:04:03 -0500
Subject: [PATCH 237/366] Collect join-key information in cudf-polars (#19736)

Closes https://github.com/rapidsai/cudf/issues/19392

- [x] Blocked by https://github.com/rapidsai/cudf/pull/19752

This PR includes the following changes:

- Updates the `ColumnStats` class to use `ColumnStats.unique_count` instead of `ColumnStats.unique_stats`.
  - *Rationale*: I spent a lot of time experimenting with follow-on work, and I found no value for the "fraction" component of `UniqueStats` outside of the data source information.
- Adds an `initialize_join_info` function (called within `collect_base_stats`) to collect basic join-key information.
- Adds dedicated `JoinKey` and `JoinInfo` classes to track the necessary join information.
- Adds a new `join_info: JoinInfo` attribute to `StatsCollector`.
- Adds `find_equivalence_sets` and `apply_pkfk_heuristics` functions.
  - These functions are used to process the information in `StatsCollector.join_info` after the `collect_base_stats` traversal.
  - For now, these functions are only used for testing (`test_base_stats_join_key_info`). In a follow-up PR, these functions will be used in a *second* statistics traversal to set final row-count and unique-value statistics for each IR node.

**Note**: The logic in this PR was adapted from @wence-'s ["doodle"](https://github.com/wence-/cudf/blob/65213d21a4917c4df6a23691acd9dc455c02027f/python/cudf_polars/cudf_polars/experimental/tablestats.py#L67).
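
For illustration, the new `JoinKey` container can be exercised on its own
before any join heuristics run (a toy sketch; it assumes the default, empty
data source reports no row-count estimate):

```python
from cudf_polars.experimental.base import ColumnStats, JoinKey

# Toy sketch: a single-column equi-join key built from bare column stats.
key = JoinKey(ColumnStats("a"))

# Nothing has been estimated yet, so both values start out as None:
print(key.implied_unique_count)  # set later by the PK-FK heuristics
print(key.source_row_count)      # max source row-count across key columns
```
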
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19736 --- .../cudf_polars/experimental/base.py | 112 ++++++++++++-- .../cudf_polars/experimental/statistics.py | 144 +++++++++++++++++- python/cudf_polars/docs/overview.md | 16 +- .../tests/experimental/test_stats.py | 79 +++++++++- 4 files changed, 334 insertions(+), 17 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py index e4c0a43aa64..37a88263c38 100644 --- a/python/cudf_polars/cudf_polars/experimental/base.py +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -5,10 +5,12 @@ from __future__ import annotations import dataclasses +from collections import defaultdict +from functools import cached_property from typing import TYPE_CHECKING, Any, Generic, TypeVar if TYPE_CHECKING: - from collections.abc import Generator, Iterator + from collections.abc import Generator, Iterator, MutableMapping from cudf_polars.dsl.expr import NamedExpr from cudf_polars.dsl.ir import IR @@ -72,7 +74,7 @@ class ColumnStat(Generic[T]): @dataclasses.dataclass class UniqueStats: """ - Unique-value statistics. + Sampled unique-value statistics. Parameters ---------- @@ -82,6 +84,11 @@ class UniqueStats: Unique-value fraction. This corresponds to the total number of unique values (count) divided by the total number of rows. + + Notes + ----- + This class is used to track unique-value column statistics + that have been sampled from a data source. """ count: ColumnStat[int] = dataclasses.field(default_factory=ColumnStat[int]) @@ -134,14 +141,22 @@ class ColumnSourceInfo: direct access to column-specific information. """ - __slots__ = ("_allow_unique_sampling", "column_name", "table_source_info") + __slots__ = ( + "_allow_unique_sampling", + "column_name", + "implied_unique_count", + "table_source_info", + ) table_source_info: DataSourceInfo column_name: str + implied_unique_count: ColumnStat[int] + """Unique-value count implied by join heuristics.""" _allow_unique_sampling: bool def __init__(self, table_source_info: DataSourceInfo, column_name: str) -> None: self.table_source_info = table_source_info self.column_name = column_name + self.implied_unique_count = ColumnStat[int](None) self._allow_unique_sampling = False @property @@ -193,16 +208,16 @@ class ColumnStats: Child ColumnStats objects. source_info Column source information. - unique_stats - Unique-value statistics. + unique_count + Unique-value count. """ - __slots__ = ("children", "name", "source_info", "unique_stats") + __slots__ = ("children", "name", "source_info", "unique_count") name: str children: tuple[ColumnStats, ...] source_info: ColumnSourceInfo - unique_stats: UniqueStats + unique_count: ColumnStat[int] def __init__( self, @@ -210,12 +225,12 @@ def __init__( *, children: tuple[ColumnStats, ...] 
= (), source_info: ColumnSourceInfo | None = None, - unique_stats: UniqueStats | None = None, + unique_count: ColumnStat[int] | None = None, ) -> None: self.name = name self.children = children self.source_info = source_info or ColumnSourceInfo(DataSourceInfo(), name) - self.unique_stats = unique_stats or UniqueStats() + self.unique_count = unique_count or ColumnStat[int](None) def new_parent( self, @@ -243,21 +258,94 @@ def new_parent( children=(self,), # Want to reference the same DataSourceInfo source_info=self.source_info, - # Want fresh UniqueStats so we can mutate in place - unique_stats=UniqueStats(), ) +class JoinKey: + """ + Join-key information. + + Parameters + ---------- + column_stats + Column statistics for the join key. + + Notes + ----- + This class is used to track join-key information. + It is used to track the columns being joined on + and the estimated unique-value count for the join key. + """ + + column_stats: tuple[ColumnStats, ...] + implied_unique_count: int | None + """Estimated unique-value count from join heuristics.""" + + def __init__(self, *column_stats: ColumnStats) -> None: + self.column_stats = column_stats + self.implied_unique_count = None + + @cached_property + def source_row_count(self) -> int | None: + """ + Return the estimated row-count of the source columns. + + Notes + ----- + This is the maximum row-count estimate of the source columns. + """ + return max( + ( + cs.source_info.row_count.value + for cs in self.column_stats + if cs.source_info.row_count.value is not None + ), + default=None, + ) + + +class JoinInfo: + """ + Join information. + + Notes + ----- + This class is used to track mappings between joined-on + columns and joined-on keys (groups of columns). We need + these mappings to calculate equivalence sets and make + join-based unique-count and row-count estimates. 
+ """ + + __slots__ = ("column_map", "join_map", "key_map") + + column_map: MutableMapping[ColumnStats, set[ColumnStats]] + """Mapping between joined columns.""" + key_map: MutableMapping[JoinKey, set[JoinKey]] + """Mapping between joined keys (groups of columns).""" + join_map: dict[IR, list[JoinKey]] + """Mapping between IR nodes and associated join keys.""" + + def __init__(self) -> None: + self.column_map: MutableMapping[ColumnStats, set[ColumnStats]] = defaultdict( + set[ColumnStats] + ) + self.key_map: MutableMapping[JoinKey, set[JoinKey]] = defaultdict(set[JoinKey]) + self.join_map: dict[IR, list[JoinKey]] = {} + + class StatsCollector: """Column statistics collector.""" - __slots__ = ("column_stats", "row_count") + __slots__ = ("column_stats", "join_info", "row_count") row_count: dict[IR, ColumnStat[int]] """Estimated row count for each IR node.""" column_stats: dict[IR, dict[str, ColumnStats]] """Column statistics for each IR node.""" + join_info: JoinInfo + """Join information.""" def __init__(self) -> None: self.row_count: dict[IR, ColumnStat[int]] = {} self.column_stats: dict[IR, dict[str, ColumnStats]] = {} + self.join_info = JoinInfo() diff --git a/python/cudf_polars/cudf_polars/experimental/statistics.py b/python/cudf_polars/cudf_polars/experimental/statistics.py index 18588ac7e29..fcd6644a71a 100644 --- a/python/cudf_polars/cudf_polars/experimental/statistics.py +++ b/python/cudf_polars/cudf_polars/experimental/statistics.py @@ -6,7 +6,7 @@ from __future__ import annotations import itertools -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, TypeVar from cudf_polars.dsl.ir import ( IR, @@ -20,14 +20,17 @@ ) from cudf_polars.dsl.traversal import post_traversal from cudf_polars.experimental.base import ( + ColumnStat, ColumnStats, + JoinKey, StatsCollector, ) from cudf_polars.experimental.dispatch import initialize_column_stats if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Mapping, Sequence + from cudf_polars.experimental.base import JoinInfo from cudf_polars.utils.config import ConfigOptions @@ -45,13 +48,150 @@ def collect_base_stats(root: IR, config_options: ConfigOptions) -> StatsCollecto Returns ------- A new StatsCollector object with populated datasource statistics. + + Notes + ----- + This function initializes the ``StatsCollector`` object + with the base datasource statistics. The goal is to build an + outline of the statistics that will be collected before any + real data is sampled. """ stats: StatsCollector = StatsCollector() for node in post_traversal([root]): + # Initialize column statistics from datasource information stats.column_stats[node] = initialize_column_stats(node, stats, config_options) + # Initialize join information + if isinstance(node, Join): + initialize_join_info(node, stats) return stats +def initialize_join_info(node: Join, stats: StatsCollector) -> None: + """ + Initialize join information for the given node. + + Parameters + ---------- + node + Join node to initialize join-key information for. + stats + StatsCollector object to update. + + Notes + ----- + This function updates ``stats.join_info``. 
+ """ + left, right = node.children + join_info = stats.join_info + right_keys = [stats.column_stats[right][n.name] for n in node.right_on] + left_keys = [stats.column_stats[left][n.name] for n in node.left_on] + lkey = JoinKey(*right_keys) + rkey = JoinKey(*left_keys) + join_info.key_map[lkey].add(rkey) + join_info.key_map[rkey].add(lkey) + join_info.join_map[node] = [lkey, rkey] + for u, v in zip(left_keys, right_keys, strict=True): + join_info.column_map[u].add(v) + join_info.column_map[v].add(u) + + +T = TypeVar("T") + + +def find_equivalence_sets(join_map: Mapping[T, set[T]]) -> list[set[T]]: + """ + Find equivalence sets in a join-key mapping. + + Parameters + ---------- + join_map + Joined key or column mapping to find equivalence sets in. + + Returns + ------- + List of equivalence sets. + + Notes + ----- + This function is used by ``apply_pkfk_heuristics``. + """ + seen = set() + components = [] + for v in join_map: + if v not in seen: + cluster = {v} + stack = [v] + while stack: + node = stack.pop() + for n in join_map[node]: + if n not in cluster: + cluster.add(n) + stack.append(n) + components.append(cluster) + seen.update(cluster) + return components + + +def apply_pkfk_heuristics(join_info: JoinInfo) -> None: + """ + Apply PK-FK unique-count heuristics to join keys. + + Parameters + ---------- + join_info + Join information to apply PK-FK heuristics to. + + Notes + ----- + This function modifies the ``JoinKey`` objects being tracked + in ``StatsCollector.join_info`` using PK-FK heuristics to + estimate the "implied" unique-value count. This function also + modifies the inderlying ``ColumnStats`` objects included in + a join key. + """ + # This applies the PK-FK matching scheme of + # https://blobs.duckdb.org/papers/tom-ebergen-msc-thesis-join-order-optimization-with-almost-no-statistics.pdf + # See section 3.2 + for keys in find_equivalence_sets(join_info.key_map): + implied_unique_count = max( + ( + c.implied_unique_count + for c in keys + if c.implied_unique_count is not None + ), + # Default unique-count estimate is the minimum source row count + default=min( + (c.source_row_count for c in keys if c.source_row_count is not None), + default=None, + ), + ) + for key in keys: + # Update unique-count estimate for each join key + key.implied_unique_count = implied_unique_count + + # We separately apply PK-FK heuristics to individual columns so + # that we can update ColumnStats.source_info.implied_unique_count + # and use the per-column information elsewhere in the query plan. + for cols in find_equivalence_sets(join_info.column_map): + unique_count = max( + ( + cs.source_info.implied_unique_count.value + for cs in cols + if cs.source_info.implied_unique_count.value is not None + ), + default=min( + ( + cs.source_info.row_count.value + for cs in cols + if cs.source_info.row_count.value is not None + ), + default=None, + ), + ) + for cs in cols: + cs.source_info.implied_unique_count = ColumnStat[int](unique_count) + + def _update_unique_stats_columns( child_column_stats: dict[str, ColumnStats], key_names: Sequence[str], diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 34e66e62436..8cf510416e4 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -421,10 +421,21 @@ datasource (e.g. a Parquet dataset or in-memory `DataFrame`). Since `DataSourceInfo` tracks information for an entire table, we use `ColumnSourceInfo` to provide a single-column view of the object. 
 - `ColumnStats`: This class is used to group together the "base"
-`ColumnSourceInfo` reference and the local `UniqueStats` estimates
+`ColumnSourceInfo` reference and the local unique-count estimate
 for a specific IR + column combination. We bundle these references
 together to simplify the design and maintenance of `StatsCollector`.
-**NOTE:** The current `UniqueStats` estimates are not yet populated.
+**NOTE:** The local unique-count estimate is not yet populated.
+- `JoinKey`: This class is used to define a set of columns being
+joined on and the estimated unique-value count of the key.
+- `JoinInfo`: This class is used to define the necessary data
+structures for applying join heuristics to our query plan.
+Each object contains the following attributes:
+  - `JoinInfo.key_map`: Returns a mapping between distinct
+    `JoinKey` objects that are joined on in the query plan.
+  - `JoinInfo.column_map`: Returns a mapping between distinct
+    `ColumnStats` objects that are joined on in the query plan.
+  - `JoinInfo.join_map`: Returns a mapping between each IR node
+    and the associated `JoinKey` objects.
 - `StatsCollector`: This class is used to collect and store
 statistics for all IR nodes within a single query. The statistics
 attached to each IR node refer to the **output** columns of the
@@ -436,6 +447,7 @@ Each object has two important attributes:
   **NOTE:** This attribute is not yet populated.
   - `StatsCollector.column_stats`: Returns a mapping between each IR
     node and the `dict[str, ColumnStats]` mapping for that node.
+  - `StatsCollector.join_info`: Returns a `JoinInfo` object.

 ## Collecting and using statistics

diff --git a/python/cudf_polars/tests/experimental/test_stats.py b/python/cudf_polars/tests/experimental/test_stats.py
index f139d344eaa..dfc1d5ca065 100644
--- a/python/cudf_polars/tests/experimental/test_stats.py
+++ b/python/cudf_polars/tests/experimental/test_stats.py
@@ -11,7 +11,11 @@

 from cudf_polars import Translator
 from cudf_polars.experimental.io import _clear_source_info_cache
-from cudf_polars.experimental.statistics import collect_base_stats
+from cudf_polars.experimental.statistics import (
+    apply_pkfk_heuristics,
+    collect_base_stats,
+    find_equivalence_sets,
+)
 from cudf_polars.testing.asserts import DEFAULT_SCHEDULER, assert_gpu_result_equal
 from cudf_polars.testing.io import make_partitioned_source
 from cudf_polars.utils.config import ConfigOptions
@@ -368,3 +372,76 @@ def test_base_stats_distinct(df):
     source_info_y = column_stats["y"].source_info
     assert source_info_y.row_count.value == row_count
     assert source_info_y.row_count.exact
+
+
+def test_base_stats_join_key_info():
+    engine = pl.GPUEngine(
+        raise_on_fail=True,
+        executor="streaming",
+        executor_options={
+            "scheduler": DEFAULT_SCHEDULER,
+            "shuffle_method": "tasks",
+        },
+    )
+
+    # Customers table (PK: cust_id)
+    customers = pl.LazyFrame(
+        {
+            "cust_id": [1, 2],
+            "cust_name": ["Alice", "Bob"],
+        }
+    )
+
+    # Orders table (PK: order_id)
+    orders = pl.LazyFrame(
+        {
+            "order_id": [100, 101, 102],
+            "cust_id": [1, 2, 1],
+            "prod_id": [10, 20, 10],
+            "loc_id": [501, 501, 502],
+            "quant": [2, 1, 4],
+        }
+    )
+
+    # Locations table (PK: prod_id, loc_id)
+    locations = pl.LazyFrame(
+        {
+            "prod_id": [10, 20, 10],
+            "loc_id": [501, 501, 502],
+            "price": [50, 60, 55],
+        }
+    )
+
+    # Step 1: Multi-key join orders and locations on prod_id & loc_id
+    orders_with_price = orders.join(locations, on=["prod_id", "loc_id"], how="inner")
+
+    # Step 2: Join result to customers on cust_id
+    q = orders_with_price.join(customers, on="cust_id", how="inner")
on="cust_id", how="inner") + + ir = Translator(q._ldf.visit(), engine).translate_ir() + config_options = ConfigOptions.from_polars_engine(engine) + stats = collect_base_stats(ir, config_options) + join_info = stats.join_info + + # Check equivalence sets + key_sets = sorted( + sorted(tuple(cs.name for cs in k.column_stats) for k in group) + for group in find_equivalence_sets(join_info.key_map) + ) + assert len(key_sets) == 2 + assert key_sets[0] == [("cust_id",), ("cust_id",)] + assert key_sets[1] == [("prod_id", "loc_id"), ("prod_id", "loc_id")] + + # Check basic PK-FK unique-count heuristics + apply_pkfk_heuristics(join_info) + implied_unique_count = join_info.join_map[ir][0].implied_unique_count + assert implied_unique_count == join_info.join_map[ir][1].implied_unique_count + assert ( + q.select(pl.col("cust_id").n_unique()).collect().item() == implied_unique_count + ) + assert ( + # Calling apply_pkfk_heuristics should update the implied_unique_count + # estimate on the associated ColumnSourceInfo as well + stats.column_stats[ir]["cust_id"].source_info.implied_unique_count.value + == implied_unique_count + ) From 3a5582a9b4c44424b341d95803020f438359c87a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 2 Sep 2025 19:50:16 +0200 Subject: [PATCH 238/366] Prevent installation of pytest-rerunfailures 16.0.0 (#19863) The `pytest-rerunfailures=16.0.0` package had a breaking change (see https://github.com/pytest-dev/pytest-rerunfailures/issues/302) and thus in https://github.com/rapidsai/cudf/pull/19846 we pinned to prevent installation of version 16 and higher. Since then a new build `16.0.1` is out that reverts the bad change, and although some of the bad builds were yanked, the PyPI build is still not yanked, so this change prevents installing only the bad release. 
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - James Lamb (https://github.com/jameslamb) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19863 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-130_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-130_arch-x86_64.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/pyproject.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 8227b760b36..8fccfe513f8 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -75,7 +75,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures<16.0 +- pytest-rerunfailures!=16.0.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index dd2aa0a6e71..bad1763d906 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -76,7 +76,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures<16.0 +- pytest-rerunfailures!=16.0.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index f550d88d4b3..322fefc07da 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -75,7 +75,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures<16.0 +- pytest-rerunfailures!=16.0.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index d6103d5e73b..3e68fbf4d07 100644 --- a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -76,7 +76,7 @@ dependencies: - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov -- pytest-rerunfailures<16.0 +- pytest-rerunfailures!=16.0.0 - pytest-xdist - python-confluent-kafka>=2.8.0,<2.9.0a0 - python-xxhash diff --git a/dependencies.yaml b/dependencies.yaml index 3dc704cebf4..55c9f7cc446 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -1165,7 +1165,7 @@ dependencies: - nbformat - openpyxl # https://github.com/pytest-dev/pytest-rerunfailures/issues/302 - - pytest-rerunfailures<16.0 + - pytest-rerunfailures!=16.0.0 depends_on_dask_cuda: common: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 50c6fd782e8..69c889c0cc8 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -76,7 +76,7 @@ cudf-pandas-tests = [ "nbconvert", "nbformat", "openpyxl", - "pytest-rerunfailures<16.0", + "pytest-rerunfailures!=16.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
 [project.urls]

From a377d23897b1f07d49ce795b969c1625f80f62e7 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 2 Sep 2025 15:22:49 -0400
Subject: [PATCH 239/366] Benchmarks comparing Arrow string formats (#19552)

Creates nvbench benchmarks comparing a theoretical ArrowStringView
implementation with the Arrow string format currently used in libcudf
for select operations like hash, starts-with, and sort.
The ArrowStringView device vector is created manually from a generated
libcudf STRING column. The two formats are compared by using thrust/cub
functions where custom functors wrap individual rows with a
`cudf::string_view`. This way, the detailed instructions are the same
and only the data layout is different.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19552
---
 cpp/benchmarks/CMakeLists.txt                 |   5 +
 .../string/experimental/stringview_compare.cu | 500 ++++++++++++++++++
 2 files changed, 505 insertions(+)
 create mode 100644 cpp/benchmarks/string/experimental/stringview_compare.cu

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index a6d070ce2d3..531a0c92c7b 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -427,6 +427,11 @@ ConfigureNVBench(
   string/url_decode.cu
 )

+# ##################################################################################################
+# * strings experimental benchmark ---------------------------------------------------
+ConfigureNVBench(STRINGS_EXPERIMENTAL_NVBENCH string/experimental/stringview_compare.cu)
+target_link_libraries(STRINGS_EXPERIMENTAL_NVBENCH PRIVATE nanoarrow)
+
 # ##################################################################################################
 # * json benchmark -------------------------------------------------------------------
 ConfigureNVBench(JSON_NVBENCH json/json.cu)

diff --git a/cpp/benchmarks/string/experimental/stringview_compare.cu b/cpp/benchmarks/string/experimental/stringview_compare.cu
new file mode 100644
index 00000000000..f012e647dac
--- /dev/null
+++ b/cpp/benchmarks/string/experimental/stringview_compare.cu
@@ -0,0 +1,500 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace {
+
+// Runtime switch to use ArrowStringView instead of cudf's Arrow string format.
+//
+// Set to anything to use ArrowStringView, and unset to use cudf.
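+//
+// (Both layouts are compiled into the one binary; std::getenv returns non-null
+// whenever the variable is set, even to an empty string, so any setting
+// selects the ArrowStringView path at run time.)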
+// Example command line to generate ArrowStringView numbers for sv_hash benchmark:
+// CUDF_BM_ARROWSTRINGVIEW=1 benchmarks/STRINGS_EXPERIMENTAL_NVBENCH -d 0 -b sv_hash
+//
+// This will generate nvbench benchmark outputs that can be compared directly
+// using the `nvbench compare.py` script.
+auto const BM_ARROWSTRINGVIEW = "CUDF_BM_ARROWSTRINGVIEW";
+
+/**
+ * Creates ArrowBinaryView objects from a strings column.
+ */
+struct strings_to_binary_view {
+  cudf::column_device_view d_strings;
+  cudf::detail::input_offsetalator d_offsets;
+  ArrowBinaryView* d_items;  // output
+
+  __device__ void operator()(cudf::size_type idx) const
+  {
+    auto& item = d_items[idx];
+    if (d_strings.is_null(idx)) {
+      item.inlined.size = 0;  // not used in this benchmark
+      return;
+    }
+
+    auto const d_str  = d_strings.element<cudf::string_view>(idx);
+    item.inlined.size = d_str.size_bytes();
+    // copy the string data to the inlined buffer if it fits
+    if (d_str.size_bytes() <= NANOARROW_BINARY_VIEW_INLINE_SIZE) {
+      thrust::copy(thrust::seq, d_str.data(), d_str.data() + d_str.size_bytes(), item.inlined.data);
+      thrust::uninitialized_fill(thrust::seq,
+                                 item.inlined.data + item.inlined.size,
+                                 item.inlined.data + NANOARROW_BINARY_VIEW_INLINE_SIZE,
+                                 0);
+    } else {
+      // otherwise, copy the prefix and set the offset to the data buffer
+      thrust::copy(thrust::seq,
+                   d_str.data(),
+                   d_str.data() + NANOARROW_BINARY_VIEW_PREFIX_SIZE,
+                   item.ref.prefix);
+      auto const offset     = d_offsets[idx];
+      item.ref.buffer_index = 0;  // only one buffer in this benchmark
+      item.ref.offset       = static_cast<int32_t>(offset);
+    }
+  }
+};
+
+/**
+ * Returns a string_view from an ArrowBinaryView.
+ * This helps in the comparison by both implementations using `cudf::string_view`
+ * as the base type so the actual operations are the same and only the
+ * format (how the data is organized) is different.
+ */
+__device__ cudf::string_view get_string_view(ArrowBinaryView const& item, char const* d_chars)
+{
+  auto const data = item.inlined.size <= NANOARROW_BINARY_VIEW_INLINE_SIZE
+                      ? reinterpret_cast<char const*>(item.inlined.data)
+                      : d_chars + item.ref.offset;
+  return cudf::string_view(data, item.inlined.size);
+}
+
+/**
+ * Hashes a string from an ArrowBinaryView.
+ */
+struct hash_arrow_sv {
+  ArrowBinaryView* d_items;
+  char const* d_chars;
+  __device__ cudf::hash_value_type operator()(cudf::size_type idx) const
+  {
+    auto& item        = d_items[idx];
+    auto const d_str  = get_string_view(item, d_chars);
+    auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
+    return hasher(d_str);
+  }
+};
+
+/**
+ * Checks if a string from an ArrowBinaryView starts with a target string.
+ */
+struct starts_arrow_sv {
+  ArrowBinaryView* d_items;
+  char const* d_chars;
+  cudf::size_type tgt_size;
+  __device__ bool operator()(cudf::size_type idx) const
+  {
+    // note that this requires tgt_size <= 26
+    auto const d_tgt = cudf::string_view("abcdefghijklmnopqrstuvwxyz", tgt_size);
+    auto& item       = d_items[idx];
+    auto const size  = item.inlined.size;
+    auto const data  = (size <= NANOARROW_BINARY_VIEW_INLINE_SIZE) || (tgt_size <= 4)
+                         ? reinterpret_cast<char const*>(item.inlined.data)
+                         : d_chars + item.ref.offset;
+    auto const d_str = cudf::string_view(data, size);
+    return d_str.size_bytes() >= d_tgt.size_bytes() &&
+           d_tgt.compare(d_str.data(), d_tgt.size_bytes()) == 0;
+  }
+};
+
+/**
+ * Compares two strings from ArrowBinaryView objects.
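+ * The first four inlined bytes of each view hold the string prefix, so most
+ * unequal pairs are ordered without touching the long-string data buffer.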
+ */
+struct compare_arrow_sv {
+  ArrowBinaryView* d_items;
+  char const* d_chars;
+  __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs)
+  {
+    auto& item_lhs = d_items[lhs];
+    auto& item_rhs = d_items[rhs];
+
+    // shortcut to check prefix bytes
+    auto pv_lhs = reinterpret_cast<uint32_t const*>(item_lhs.inlined.data)[0];
+    auto pv_rhs = reinterpret_cast<uint32_t const*>(item_rhs.inlined.data)[0];
+    if (pv_lhs != pv_rhs) {
+      return cudf::hashing::detail::swap_endian(pv_lhs) <
+             cudf::hashing::detail::swap_endian(pv_rhs);
+    }
+
+    // prefix matches so check how many bytes are left to compare
+    constexpr auto prefix_size = static_cast<cudf::size_type>(sizeof(uint32_t));
+    auto const size_lhs        = item_lhs.inlined.size;
+    auto const size_rhs        = item_rhs.inlined.size;
+    // if no bytes left to compare, we are done (strings are equal)
+    if (size_lhs <= prefix_size && size_rhs <= prefix_size) { return false; }
+
+    // compare the remaining bytes
+    auto const d_str_lhs = cudf::string_view(
+      get_string_view(item_lhs, d_chars).data() + prefix_size, size_lhs - prefix_size);
+    auto const d_str_rhs = cudf::string_view(
+      get_string_view(item_rhs, d_chars).data() + prefix_size, size_rhs - prefix_size);
+
+    return d_str_lhs < d_str_rhs;
+  }
+};
+
+/**
+ * Hashes a string from a cudf column
+ */
+struct hash_sv {
+  cudf::column_device_view d_strings;
+  __device__ cudf::hash_value_type operator()(cudf::size_type idx) const
+  {
+    auto const d_str  = d_strings.element<cudf::string_view>(idx);
+    auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
+    return hasher(d_str);
+  }
+};
+
+/**
+ * Checks if a string from a cudf column starts with a target string
+ */
+struct starts_sv {
+  cudf::column_device_view d_strings;
+  cudf::size_type tgt_size;
+  __device__ bool operator()(cudf::size_type idx) const
+  {
+    auto const d_str = d_strings.element<cudf::string_view>(idx);
+    auto const d_tgt = cudf::string_view("abcdefghijklmnopqrstuvwxyz", tgt_size);
+    return d_str.size_bytes() >= d_tgt.size_bytes() &&
+           d_tgt.compare(d_str.data(), d_tgt.size_bytes()) == 0;
+  }
+};
+
+/**
+ * Compares two strings from a cudf column
+ */
+struct compare_sv {
+  cudf::column_device_view d_strings;
+  __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs)
+  {
+    auto const d_str_lhs = d_strings.element<cudf::string_view>(lhs);
+    auto const d_str_rhs = d_strings.element<cudf::string_view>(rhs);
+    return d_str_lhs < d_str_rhs;
+  }
+};
+
+/**
+ * Creates an ArrowBinaryView vector and data buffer from a strings column.
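+ * Strings longer than NANOARROW_BINARY_VIEW_INLINE_SIZE are gathered into a
+ * single contiguous chars buffer; shorter strings live entirely inline in the
+ * view structs and need no buffer bytes.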
+ */ +std::pair, rmm::device_buffer> create_sv_array( + cudf::strings_column_view const& input, rmm::cuda_stream_view stream) +{ + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto d_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + + // count the (longer) strings that will need to be stored in the data buffer + auto const num_longer_strings = thrust::count_if( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + [d_offsets] __device__(auto idx) { + return d_offsets[idx + 1] - d_offsets[idx] > NANOARROW_BINARY_VIEW_INLINE_SIZE; + }); + + // gather all the long-ish strings into a single strings column + auto [unused_col, longer_strings] = [&] { + if (num_longer_strings == input.size()) { + // we can use the input column as is for the remainder of this function + return std::pair{cudf::make_empty_column(cudf::type_id::STRING), input}; + } + auto indices = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [d_strings = *d_strings] __device__(auto idx) { + if (d_strings.is_null(idx)) { + return cudf::strings::detail::string_index_pair{nullptr, 0}; + } + auto const d_str = d_strings.element(idx); + return (d_str.size_bytes() > NANOARROW_BINARY_VIEW_INLINE_SIZE) + ? cudf::strings::detail::string_index_pair{d_str.data(), d_str.size_bytes()} + : cudf::strings::detail::string_index_pair{"", 0}; + })); + auto longer_strings = cudf::strings::detail::make_strings_column( + indices, indices + input.size(), stream, cudf::get_current_device_resource_ref()); + stream.synchronize(); + auto const sv = cudf::strings_column_view(longer_strings->view()); + return std::pair{std::move(longer_strings), sv}; + }(); + auto [first, last] = cudf::strings::detail::get_first_and_last_offset(longer_strings, stream); + auto const longer_chars_size = last - first; + + // Make sure only one buffer is needed. + // Using a single data buffer makes the two formats more similar focusing on the layout. + constexpr int64_t max_size = std::numeric_limits::max() / 2; + auto const num_buffers = cudf::util::div_rounding_up_safe(longer_chars_size, max_size); + CUDF_EXPECTS(num_buffers <= 1, "num_buffers must be <= 1"); + + // now build BinaryView objects from the strings in device memory + // (for-each works better than transform due to the prefix/data of the ArrowBinaryView) + auto d_items = rmm::device_uvector(input.size(), stream); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + input.size(), + strings_to_binary_view{*d_strings, d_offsets, d_items.data()}); + + rmm::device_buffer data_buffer(longer_chars_size, stream); + auto const chars_data = longer_strings.chars_begin(stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + data_buffer.data(), chars_data, longer_chars_size, cudaMemcpyDefault, stream.value())); + + return std::pair{std::move(d_items), std::move(data_buffer)}; +} +} // namespace + +static void BM_sv_hash(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const min_width = state.get_int64("fw") ? 
max_width : 1; // fw = fixed width + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) + .no_validity(); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto col_view = column->view(); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_writes(num_rows * sizeof(cudf::hash_value_type)); + auto output = rmm::device_uvector(num_rows, stream); + auto begin = thrust::make_counting_iterator(0); + auto end = thrust::make_counting_iterator(num_rows); + + if (std::getenv(BM_ARROWSTRINGVIEW)) { + auto [d_items, data_buffer] = create_sv_array(col_view, stream); + auto const d_chars = reinterpret_cast(data_buffer.data()); + state.add_global_memory_reads(num_rows * sizeof(ArrowBinaryView) + data_buffer.size()); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::transform(rmm::exec_policy(stream), + begin, + end, + output.begin(), + hash_arrow_sv{d_items.data(), d_chars}); + }); + } else { + auto d_strings = cudf::column_device_view::create(col_view, stream); + auto col_size = column->alloc_size(); + state.add_global_memory_reads(col_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::transform(rmm::exec_policy(stream), begin, end, output.begin(), hash_sv{*d_strings}); + }); + } +} + +static void BM_sv_starts(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const min_width = state.get_int64("fw") ? max_width : 1; + auto const tgt_size = static_cast(state.get_int64("tgt_size")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) + .no_validity(); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto col_view = column->view(); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_writes(num_rows * sizeof(bool)); + auto output = rmm::device_uvector(num_rows, stream); + auto begin = thrust::make_counting_iterator(0); + auto end = thrust::make_counting_iterator(num_rows); + + if (std::getenv(BM_ARROWSTRINGVIEW)) { + auto [d_items, data_buffer] = create_sv_array(col_view, stream); + auto const d_chars = reinterpret_cast(data_buffer.data()); + state.add_global_memory_reads(num_rows * sizeof(ArrowBinaryView) + data_buffer.size()); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::transform(rmm::exec_policy(stream), + begin, + end, + output.begin(), + starts_arrow_sv{d_items.data(), d_chars, tgt_size}); + }); + } else { + auto d_strings = cudf::column_device_view::create(col_view, stream); + auto col_size = column->alloc_size(); + state.add_global_memory_reads(col_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::transform( + rmm::exec_policy(stream), begin, end, output.begin(), starts_sv{*d_strings, tgt_size}); + }); + } +} + +static void BM_sv_sort(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); + auto const card = static_cast(state.get_int64("card")); + + auto h_data = std::vector(card); + 
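+  // build `card` distinct zero-padded strings; the random gather below
+  // replicates them into a low-cardinality sort input of num_rows rows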
std::transform(thrust::counting_iterator(0), + thrust::counting_iterator(card), + h_data.begin(), + [max_width](auto idx) { + auto const fmt = std::format("{{:0{}d}}", max_width); + return std::vformat(fmt, std::make_format_args(idx)); + }); + auto d_data = cudf::test::strings_column_wrapper(h_data.begin(), h_data.end()).release(); + + data_profile gather_profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 0, d_data->size() - 1); + auto gather_map = create_random_column(cudf::type_id::INT32, row_count{num_rows}, gather_profile); + + auto table = cudf::gather(cudf::table_view({d_data->view()}), gather_map->view()); + auto column = std::move(table->release().front()); + + auto col_view = column->view(); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_writes(num_rows * sizeof(cudf::size_type)); + + // indices are the keys that are sorted (not inplace) + auto keys = rmm::device_uvector(num_rows, stream); + auto in_keys = thrust::make_counting_iterator(0); + auto out_keys = keys.begin(); + auto tmp_bytes = std::size_t{0}; + + if (std::getenv(BM_ARROWSTRINGVIEW)) { + auto [d_items, data_buffer] = create_sv_array(col_view, stream); + auto const d_chars = reinterpret_cast(data_buffer.data()); + auto comparator = compare_arrow_sv{d_items.data(), d_chars}; + cub::DeviceMergeSort::SortKeysCopy( + nullptr, tmp_bytes, in_keys, out_keys, num_rows, comparator, stream.value()); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + state.add_global_memory_reads(num_rows * sizeof(ArrowBinaryView) + data_buffer.size()); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cub::DeviceMergeSort::SortKeysCopy( + tmp_stg.data(), tmp_bytes, in_keys, out_keys, num_rows, comparator, stream.value()); + }); + } else { + auto d_strings = cudf::column_device_view::create(col_view, stream); + auto col_size = column->alloc_size(); + state.add_global_memory_reads(col_size); + auto comparator = compare_sv{*d_strings}; + cub::DeviceMergeSort::SortKeysCopy( + nullptr, tmp_bytes, in_keys, out_keys, num_rows, comparator, stream.value()); + auto tmp_stg = rmm::device_buffer(tmp_bytes, stream); + state.add_global_memory_reads(col_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cub::DeviceMergeSort::SortKeysCopy( + tmp_stg.data(), tmp_bytes, in_keys, out_keys, num_rows, comparator, stream.value()); + }); + } +} + +static void BM_sv_gather(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const width = static_cast(state.get_int64("width")); + auto const map_rows = static_cast(state.get_int64("map_rows")); + + data_profile profile = data_profile_builder().no_validity().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, width, width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto col_view = column->view(); + + data_profile map_profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 0, num_rows - 1); + auto map = create_random_column(cudf::type_id::INT32, row_count{map_rows}, map_profile); + auto map_view = map->view(); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (std::getenv(BM_ARROWSTRINGVIEW)) { + auto [d_items, data_buffer] = 
create_sv_array(col_view, stream); + + auto begin = map_view.begin(); + auto end = map_view.end(); + auto input = d_items.data(); + auto output = rmm::device_uvector(map_view.size(), stream); + + state.add_global_memory_writes(map_rows * sizeof(ArrowBinaryView)); + state.add_global_memory_reads(map_rows * sizeof(ArrowBinaryView)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + thrust::gather(rmm::exec_policy(stream), begin, end, input, output.begin()); + }); + } else { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::gather( + cudf::table_view({col_view}), map_view, cudf::out_of_bounds_policy::DONT_CHECK, stream); + }); + } +} + +NVBENCH_BENCH(BM_sv_hash) + .set_name("sv_hash") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("max_width", {5, 10, 15, 20, 30, 60}) + .add_int64_axis("fw", {1, 0}); + +NVBENCH_BENCH(BM_sv_starts) + .set_name("sv_starts") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("max_width", {10, 20, 30, 60}) + .add_int64_axis("tgt_size", {4, 8, 16}) + .add_int64_axis("fw", {1, 0}); + +NVBENCH_BENCH(BM_sv_sort) + .set_name("sv_sort") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("max_width", {10, 20, 30, 60}) + .add_int64_axis("card", {100, 1000}); + +NVBENCH_BENCH(BM_sv_gather) + .set_name("sv_gather") + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000}) + .add_int64_axis("width", {6, 12, 24, 48}) + .add_int64_axis("map_rows", {10'000, 100'000}); From 3de49dfb97c305c878593a0bda12b835f04abc16 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 2 Sep 2025 12:26:09 -0700 Subject: [PATCH 240/366] Add streams to strings convert APIs (#19780) Contributes to #15163 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19780 --- .../strings/convert/convert_booleans.pxd | 10 ++- .../strings/convert/convert_datetime.pxd | 13 ++-- .../strings/convert/convert_durations.pxd | 10 ++- .../strings/convert/convert_fixed_point.pxd | 13 ++-- .../strings/convert/convert_floats.pxd | 13 ++-- .../strings/convert/convert_integers.pxd | 25 ++++--- .../libcudf/strings/convert/convert_ipv4.pxd | 13 ++-- .../libcudf/strings/convert/convert_lists.pxd | 7 +- .../libcudf/strings/convert/convert_urls.pxd | 10 ++- .../strings/convert/convert_booleans.pyi | 13 +++- .../strings/convert/convert_booleans.pyx | 26 ++++++-- .../strings/convert/convert_datetime.pyi | 18 +++-- .../strings/convert/convert_datetime.pyx | 40 +++++++++--- .../strings/convert/convert_durations.pxd | 9 ++- .../strings/convert/convert_durations.pyi | 15 ++++- .../strings/convert/convert_durations.pyx | 28 ++++++-- .../strings/convert/convert_fixed_point.pxd | 9 +-- .../strings/convert/convert_fixed_point.pyi | 16 +++-- .../strings/convert/convert_fixed_point.pyx | 35 +++++++--- .../strings/convert/convert_floats.pyi | 12 ++-- .../strings/convert/convert_floats.pyx | 34 +++++++--- .../strings/convert/convert_integers.pxd | 15 +++-- .../strings/convert/convert_integers.pyi | 26 ++++++-- .../strings/convert/convert_integers.pyx | 65 ++++++++++++++----- .../strings/convert/convert_ipv4.pyi | 14 ++-- .../strings/convert/convert_ipv4.pyx | 35 +++++++--- .../strings/convert/convert_lists.pxd | 6 +- .../strings/convert/convert_lists.pyi | 5 +- .../strings/convert/convert_lists.pyx | 18 +++-- .../strings/convert/convert_urls.pyi | 8 ++- 
.../strings/convert/convert_urls.pyx | 25 +++++-- 31 files changed, 423 insertions(+), 163 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 37f39b098b3..0cd44017804 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -1,18 +1,22 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( column_view input, - string_scalar true_string) except +libcudf_exception_handler + string_scalar true_string, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] from_booleans( column_view booleans, string_scalar true_string, - string_scalar false_string) except +libcudf_exception_handler + string_scalar false_string, + cuda_stream_view stream) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index c316b7891a3..d692ff6ea7a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -6,19 +6,24 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( column_view input, data_type timestamp_type, - string format) except +libcudf_exception_handler + string format, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] from_timestamps( column_view timestamps, string format, - column_view names) except +libcudf_exception_handler + column_view names, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] is_timestamp( column_view input_col, - string format) except +libcudf_exception_handler + string format, + cuda_stream_view stream) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd index 75374208172..7d7329d58e9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -6,14 +6,18 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_durations( const column_view & input, data_type duration_type, - const string & format) except +libcudf_exception_handler + const string & format, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] from_durations( const column_view & durations, - const string & format) except +libcudf_exception_handler + const string & format, + cuda_stream_view stream) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index 71c866ad211..bc8a752e18b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -1,21 +1,26 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_fixed_point( column_view input, - data_type output_type) except +libcudf_exception_handler + data_type output_type, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] from_fixed_point( - column_view input) except +libcudf_exception_handler + column_view input, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] is_fixed_point( column_view input, - data_type decimal_type + data_type decimal_type, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd index 7df6b914458..198b0d8be00 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd @@ -1,20 +1,25 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_floats( column_view strings, - data_type output_type) except +libcudf_exception_handler + data_type output_type, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] from_floats( - column_view floats) except +libcudf_exception_handler + column_view floats, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] is_float( - column_view input + column_view input, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd index 4033ef51480..f8a1b11e096 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd @@ -1,36 +1,45 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_integers( column_view input, - data_type output_type) except +libcudf_exception_handler + data_type output_type, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] from_integers( - column_view integers) except +libcudf_exception_handler + column_view integers, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] is_integer( - column_view input + column_view input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] is_integer( column_view input, - data_type int_type + data_type int_type, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] hex_to_integers( column_view input, - data_type output_type) except + + data_type output_type, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] is_hex( - column_view input + column_view input, + cuda_stream_view stream ) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_hex( - column_view input) except +libcudf_exception_handler + column_view input, + cuda_stream_view stream) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index 33f9c798ae6..bff93ab48e9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -1,18 +1,23 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( - column_view input) except +libcudf_exception_handler + column_view input, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_ipv4( - column_view integers) except +libcudf_exception_handler + column_view integers, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] is_ipv4( - column_view input + column_view input, + cuda_stream_view stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd index 3d0a677424e..da9f9d5b5a2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd @@ -1,10 +1,12 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ "cudf::strings" nogil: @@ -12,4 +14,5 @@ cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ cdef unique_ptr[column] format_list_column( column_view input, string_scalar na_rep, - column_view separators) except +libcudf_exception_handler + column_view separators, + cuda_stream_view stream) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd index 03a14e215e0..37911a4118a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd @@ -1,14 +1,18 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] url_encode( - column_view input) except +libcudf_exception_handler + column_view input, + cuda_stream_view stream) except +libcudf_exception_handler cdef unique_ptr[column] url_decode( - column_view input) except +libcudf_exception_handler + column_view input, + cuda_stream_view stream) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi index 77c09242e9a..de3637afc40 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi @@ -1,9 +1,16 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar -def to_booleans(input: Column, true_string: Scalar) -> Column: ... +def to_booleans( + input: Column, true_string: Scalar, stream: Stream | None = None +) -> Column: ... def from_booleans( - booleans: Column, true_string: Scalar, false_string: Scalar + booleans: Column, + true_string: Scalar, + false_string: Scalar, + stream: Stream | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx index 1899a3b27cc..3622cca5a9c 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -9,12 +9,14 @@ from pylibcudf.libcudf.strings.convert cimport ( convert_booleans as cpp_convert_booleans, ) from pylibcudf.scalar cimport Scalar +from pylibcudf.utils cimport _get_stream from cython.operator import dereference +from rmm.pylibrmm.stream cimport Stream __all__ = ["from_booleans", "to_booleans"] -cpdef Column to_booleans(Column input, Scalar true_string): +cpdef Column to_booleans(Column input, Scalar true_string, Stream stream=None): """ Returns a new bool column by parsing boolean values from the strings in the provided strings column. @@ -29,6 +31,9 @@ cpdef Column to_booleans(Column input, Scalar true_string): true_string : Scalar String to expect for true. Non-matching strings are false + stream : Stream | None + CUDA stream on which to perform the operation. 
+ Returns ------- Column @@ -38,16 +43,20 @@ cpdef Column to_booleans(Column input, Scalar true_string): cdef const string_scalar* c_true_string = ( true_string.c_obj.get() ) + stream = _get_stream(stream) with nogil: c_result = cpp_convert_booleans.to_booleans( input.view(), - dereference(c_true_string) + dereference(c_true_string), + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): +cpdef Column from_booleans( + Column booleans, Scalar true_string, Scalar false_string, Stream stream=None +): """ Returns a new strings column converting the boolean values from the provided column into strings. @@ -65,6 +74,9 @@ cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_str false_string : Scalar String to use for false in the output column. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column @@ -77,12 +89,14 @@ cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_str cdef const string_scalar* c_false_string = ( false_string.c_obj.get() ) + stream = _get_stream(stream) with nogil: c_result = cpp_convert_booleans.from_booleans( booleans.view(), dereference(c_true_string), dereference(c_false_string), + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi index c6857169765..782a57895fd 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi @@ -1,12 +1,22 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType def to_timestamps( - input: Column, timestamp_type: DataType, format: str + input: Column, + timestamp_type: DataType, + format: str, + stream: Stream | None = None, ) -> Column: ... def from_timestamps( - timestamps: Column, format: str, input_strings_names: Column + timestamps: Column, + format: str, + input_strings_names: Column, + stream: Stream | None = None, +) -> Column: ... +def is_timestamp( + input: Column, format: str, stream: Stream | None = None ) -> Column: ... -def is_timestamp(input: Column, format: str) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index f1cd684166c..9df47def056 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -8,15 +8,18 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport ( convert_datetime as cpp_convert_datetime, ) +from pylibcudf.utils cimport _get_stream from pylibcudf.types import DataType +from rmm.pylibrmm.stream cimport Stream __all__ = ["from_timestamps", "is_timestamp", "to_timestamps"] cpdef Column to_timestamps( Column input, DataType timestamp_type, - str format + str format, + Stream stream=None ): """ Returns a new timestamp column converting a strings column into @@ -35,6 +38,9 @@ cpdef Column to_timestamps( format : str String specifying the timestamp format in strings. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column @@ -42,19 +48,22 @@ cpdef Column to_timestamps( """ cdef unique_ptr[column] c_result cdef string c_format = format.encode() + stream = _get_stream(stream) with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - c_format + c_format, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column from_timestamps( Column timestamps, str format, - Column input_strings_names + Column input_strings_names, + Stream stream=None ): """ Returns a new strings column converting a timestamp column into @@ -73,6 +82,9 @@ cpdef Column from_timestamps( input_strings_names : Column The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column @@ -80,18 +92,21 @@ cpdef Column from_timestamps( """ cdef unique_ptr[column] c_result cdef string c_format = format.encode() + stream = _get_stream(stream) with nogil: c_result = cpp_convert_datetime.from_timestamps( timestamps.view(), c_format, - input_strings_names.view() + input_strings_names.view(), + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column is_timestamp( Column input, - str format + str format, + Stream stream=None ): """ Verifies the given strings column can be parsed to timestamps @@ -107,6 +122,9 @@ cpdef Column is_timestamp( format : str String specifying the timestamp format in strings. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column @@ -114,10 +132,12 @@ cpdef Column is_timestamp( """ cdef unique_ptr[column] c_result cdef string c_format = format.encode() + stream = _get_stream(stream) with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - c_format + c_format, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd index eecdade4ef9..1634c65458e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -1,17 +1,20 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libcpp.string cimport string from pylibcudf.column cimport Column from pylibcudf.types cimport DataType +from rmm.pylibrmm.stream cimport Stream cpdef Column to_durations( Column input, DataType duration_type, - str format + str format, + Stream stream=* ) cpdef Column from_durations( Column durations, - str format=* + str format=*, + Stream stream=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi index a5787a5fe49..90c1b8d84b2 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi @@ -1,9 +1,18 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType def to_durations( - input: Column, duration_type: DataType, format: str + input: Column, + duration_type: DataType, + format: str, + stream: Stream | None = None, +) -> Column: ... +def from_durations( + durations: Column, + format: str | None = None, + stream: Stream | None = None, ) -> Column: ... -def from_durations(durations: Column, format: str | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index a9654afd00a..686ddeddb81 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport ( convert_durations as cpp_convert_durations, ) +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream from pylibcudf.types import DataType @@ -16,7 +18,8 @@ __all__ = ["from_durations", "to_durations"] cpdef Column to_durations( Column input, DataType duration_type, - str format + str format, + Stream stream=None ): """ Returns a new duration column converting a strings column into @@ -35,6 +38,9 @@ cpdef Column to_durations( format : str String specifying the duration format in strings. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column @@ -42,19 +48,22 @@ cpdef Column to_durations( """ cdef unique_ptr[column] c_result cdef string c_format = format.encode() + stream = _get_stream(stream) with nogil: c_result = cpp_convert_durations.to_durations( input.view(), duration_type.c_obj, - c_format + c_format, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) cpdef Column from_durations( Column durations, - str format=None + str format=None, + Stream stream=None ): """ Returns a new strings column converting a duration column into @@ -71,12 +80,16 @@ cpdef Column from_durations( The string specifying output format. Default format is "%D days %H:%M:%S". + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New strings column with formatted durations. 
""" cdef unique_ptr[column] c_result + stream = _get_stream(stream) if format is None: format = "%D days %H:%M:%S" @@ -85,7 +98,8 @@ cpdef Column from_durations( with nogil: c_result = cpp_convert_durations.from_durations( durations.view(), - c_format + c_format, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd index 049b9b3fffe..925dd04d30b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -1,11 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.column cimport Column from pylibcudf.types cimport DataType +from rmm.pylibrmm.stream cimport Stream -cpdef Column to_fixed_point(Column input, DataType output_type) +cpdef Column to_fixed_point(Column input, DataType output_type, Stream stream=*) -cpdef Column from_fixed_point(Column input) +cpdef Column from_fixed_point(Column input, Stream stream=*) -cpdef Column is_fixed_point(Column input, DataType decimal_type=*) +cpdef Column is_fixed_point(Column input, DataType decimal_type=*, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi index 1192d3dfcd6..5a0464f1fb3 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi @@ -1,10 +1,18 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType -def to_fixed_point(input: Column, output_type: DataType) -> Column: ... -def from_fixed_point(input: Column) -> Column: ... +def to_fixed_point( + input: Column, output_type: DataType, stream: Stream | None = None +) -> Column: ... +def from_fixed_point( + input: Column, stream: Stream | None = None +) -> Column: ... def is_fixed_point( - input: Column, decimal_type: DataType | None = None + input: Column, + decimal_type: DataType | None = None, + stream: Stream | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 00cbc822f36..367120492b0 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -8,11 +8,14 @@ from pylibcudf.libcudf.strings.convert cimport ( convert_fixed_point as cpp_fixed_point, ) from pylibcudf.types cimport DataType, type_id +from pylibcudf.utils cimport _get_stream + +from rmm.pylibrmm.stream cimport Stream __all__ = ["from_fixed_point", "is_fixed_point", "to_fixed_point"] -cpdef Column to_fixed_point(Column input, DataType output_type): +cpdef Column to_fixed_point(Column input, DataType output_type, Stream stream=None): """ Returns a new fixed-point column parsing decimal values from the provided strings column. 
@@ -27,22 +30,27 @@ cpdef Column to_fixed_point(Column input, DataType output_type): output_type : DataType Type of fixed-point column to return including the scale value. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New column of output_type. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: c_result = cpp_fixed_point.to_fixed_point( input.view(), output_type.c_obj, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column from_fixed_point(Column input): +cpdef Column from_fixed_point(Column input, Stream stream=None): """ Returns a new strings column converting the fixed-point values into a strings column. @@ -54,19 +62,25 @@ cpdef Column from_fixed_point(Column input): input : Column Fixed-point column to convert. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New strings column. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_fixed_point.from_fixed_point(input.view()) + c_result = cpp_fixed_point.from_fixed_point(input.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column is_fixed_point(Column input, DataType decimal_type=None): +cpdef Column is_fixed_point( + Column input, DataType decimal_type=None, Stream stream=None +): """ Returns a boolean column identifying strings in which all characters are valid for conversion to fixed-point. @@ -82,12 +96,16 @@ cpdef Column is_fixed_point(Column input, DataType decimal_type=None): Fixed-point type (with scale) used only for checking overflow. Defaults to Decimal64 + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New column of boolean results for each string. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) if decimal_type is None: decimal_type = DataType(type_id.DECIMAL64) @@ -96,6 +114,7 @@ cpdef Column is_fixed_point(Column input, DataType decimal_type=None): c_result = cpp_fixed_point.is_fixed_point( input.view(), decimal_type.c_obj, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi index ddf4042e10d..9f4d9ecf107 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi @@ -1,8 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType -def to_floats(strings: Column, output_type: DataType) -> Column: ... -def from_floats(floats: Column) -> Column: ... -def is_float(input: Column) -> Column: ... +def to_floats( + strings: Column, output_type: DataType, stream: Stream | None = None +) -> Column: ... +def from_floats(floats: Column, stream: Stream | None = None) -> Column: ... +def is_float(input: Column, stream: Stream | None = None) -> Column: ... 
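The convert_floats stubs above show the signature pattern this series applies everywhere: each conversion gains a trailing Stream keyword defaulting to None. A minimal usage sketch follows; it assumes pylibcudf is importable as plc, that plc.Column.from_arrow and plc.types are available in this build, and that rmm's Stream() wraps a fresh non-default CUDA stream, so treat it as illustrative rather than as part of this patch series.

import pyarrow as pa
import pylibcudf as plc
from rmm.pylibrmm.stream import Stream

strings = plc.Column.from_arrow(pa.array(["1.5", "2.25", "oops"]))
stream = Stream()  # assumed to wrap a new non-default CUDA stream

# Both the validity check and the conversion are issued on `stream`.
mask = plc.strings.convert.convert_floats.is_float(strings, stream=stream)
floats = plc.strings.convert.convert_floats.to_floats(
    strings, plc.types.DataType(plc.types.TypeId.FLOAT64), stream=stream
)

# Omitting the argument keeps the old behavior: _get_stream(None) resolves
# to the default stream, so existing callers run unchanged.
back = plc.strings.convert.convert_floats.from_floats(floats)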
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx index b5199aac577..a62a12605f9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -8,10 +8,13 @@ from pylibcudf.libcudf.strings.convert cimport ( convert_floats as cpp_convert_floats, ) from pylibcudf.types cimport DataType +from pylibcudf.utils cimport _get_stream + +from rmm.pylibrmm.stream cimport Stream __all__ = ["from_floats", "is_float", "to_floats"] -cpdef Column to_floats(Column strings, DataType output_type): +cpdef Column to_floats(Column strings, DataType output_type, Stream stream=None): """ Returns a new numeric column by parsing float values from each string in the provided strings column. @@ -26,23 +29,28 @@ cpdef Column to_floats(Column strings, DataType output_type): output_type : DataType Type of float numeric column to return. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New column with floats converted from strings. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: c_result = cpp_convert_floats.to_floats( strings.view(), output_type.c_obj, + stream.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column from_floats(Column floats): +cpdef Column from_floats(Column floats, Stream stream=None): """ Returns a new strings column converting the float values from the provided column into strings. @@ -54,20 +62,24 @@ cpdef Column from_floats(Column floats): floats : Column Numeric column to convert. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New strings column with floats as strings. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_convert_floats.from_floats(floats.view()) + c_result = cpp_convert_floats.from_floats(floats.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column is_float(Column input): +cpdef Column is_float(Column input, Stream stream=None): """ Returns a boolean column identifying strings in which all characters are valid for conversion to floats. @@ -79,14 +91,18 @@ cpdef Column is_float(Column input): input : Column Strings instance for this operation. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New column of boolean results for each string. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_convert_floats.is_float(input.view()) + c_result = cpp_convert_floats.is_float(input.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd index eff2e080c27..f678f4b7974 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd @@ -1,17 +1,18 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.column cimport Column from pylibcudf.types cimport DataType +from rmm.pylibrmm.stream cimport Stream -cpdef Column to_integers(Column input, DataType output_type) +cpdef Column to_integers(Column input, DataType output_type, Stream stream=*) -cpdef Column from_integers(Column integers) +cpdef Column from_integers(Column integers, Stream stream=*) -cpdef Column is_integer(Column input, DataType int_type=*) +cpdef Column is_integer(Column input, DataType int_type=*, Stream stream=*) -cpdef Column hex_to_integers(Column input, DataType output_type) +cpdef Column hex_to_integers(Column input, DataType output_type, Stream stream=*) -cpdef Column is_hex(Column input) +cpdef Column is_hex(Column input, Stream stream=*) -cpdef Column integers_to_hex(Column input) +cpdef Column integers_to_hex(Column input, Stream stream=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi index b96226fba90..947812fa882 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi @@ -1,11 +1,23 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType -def to_integers(input: Column, output_type: DataType) -> Column: ... -def from_integers(integers: Column) -> Column: ... -def is_integer(input: Column, int_type: DataType | None = None) -> Column: ... -def hex_to_integers(input: Column, output_type: DataType) -> Column: ... -def is_hex(input: Column) -> Column: ... -def integers_to_hex(input: Column) -> Column: ... +def to_integers( + input: Column, output_type: DataType, stream: Stream | None = None +) -> Column: ... +def from_integers( + integers: Column, stream: Stream | None = None +) -> Column: ... +def is_integer( + input: Column, + int_type: DataType | None = None, + stream: Stream | None = None, +) -> Column: ... +def hex_to_integers( + input: Column, output_type: DataType, stream: Stream | None = None +) -> Column: ... +def is_hex(input: Column, stream: Stream | None = None) -> Column: ... +def integers_to_hex(input: Column, stream: Stream | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx index 12984e15ce9..b581b3541e1 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -8,6 +8,8 @@ from pylibcudf.libcudf.strings.convert cimport ( convert_integers as cpp_convert_integers, ) from pylibcudf.types cimport DataType +from pylibcudf.utils cimport _get_stream +from rmm.pylibrmm.stream cimport Stream __all__ = [ "from_integers", @@ -18,7 +20,7 @@ __all__ = [ "to_integers" ] -cpdef Column to_integers(Column input, DataType output_type): +cpdef Column to_integers(Column input, DataType output_type, Stream stream=None): """ Returns a new integer numeric column parsing integer values from the provided strings column. @@ -33,25 +35,30 @@ cpdef Column to_integers(Column input, DataType output_type): output_type : DataType Type of integer numeric column to return. 
+ stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New column with integers converted from strings. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: c_result = move( cpp_convert_integers.to_integers( input.view(), - output_type.c_obj + output_type.c_obj, + stream.view() ) ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column from_integers(Column integers): +cpdef Column from_integers(Column integers, Stream stream=None): """ Returns a new strings column converting the integer values from the provided column into strings. @@ -63,24 +70,29 @@ cpdef Column from_integers(Column integers): integers : Column Strings instance for this operation. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New strings column with integers as strings. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: c_result = move( cpp_convert_integers.from_integers( integers.view(), + stream.view() ) ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column is_integer(Column input, DataType int_type=None): +cpdef Column is_integer(Column input, DataType int_type=None, Stream stream=None): """ Returns a boolean column identifying strings in which all characters are valid for conversion to integers. @@ -97,18 +109,23 @@ cpdef Column is_integer(Column input, DataType int_type=None): By default, does not check an integer type for underflow or overflow. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New column of boolean results for each string. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) if int_type is None: with nogil: c_result = move( cpp_convert_integers.is_integer( input.view(), + stream.view() ) ) else: @@ -116,14 +133,15 @@ cpdef Column is_integer(Column input, DataType int_type=None): c_result = move( cpp_convert_integers.is_integer( input.view(), - int_type.c_obj + int_type.c_obj, + stream.view() ) ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column hex_to_integers(Column input, DataType output_type): +cpdef Column hex_to_integers(Column input, DataType output_type, Stream stream=None): """ Returns a new integer numeric column parsing hexadecimal values from the provided strings column. @@ -138,25 +156,30 @@ cpdef Column hex_to_integers(Column input, DataType output_type): output_type : DataType Type of integer numeric column to return. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New column with integers converted from strings. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: c_result = move( cpp_convert_integers.hex_to_integers( input.view(), - output_type.c_obj + output_type.c_obj, + stream.view() ) ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column is_hex(Column input): +cpdef Column is_hex(Column input, Stream stream=None): """ Returns a boolean column identifying strings in which all characters are valid for conversion to integers from hex. @@ -168,24 +191,29 @@ cpdef Column is_hex(Column input): input : Column Strings instance for this operation. + stream : Stream | None + CUDA stream on which to perform the operation. 
+ Returns ------- Column New column of boolean results for each string. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: c_result = move( cpp_convert_integers.is_hex( input.view(), + stream.view() ) ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column integers_to_hex(Column input): +cpdef Column integers_to_hex(Column input, Stream stream=None): """ Returns a new strings column converting integer columns to hexadecimal characters. @@ -197,18 +225,23 @@ cpdef Column integers_to_hex(Column input): input : Column Integer column to convert to hex. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New strings column with hexadecimal characters. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: c_result = move( cpp_convert_integers.integers_to_hex( input.view(), + stream.view() ) ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi index b017b32598c..a333d7801df 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi @@ -1,7 +1,13 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column -def ipv4_to_integers(input: Column) -> Column: ... -def integers_to_ipv4(integers: Column) -> Column: ... -def is_ipv4(input: Column) -> Column: ... +def ipv4_to_integers( + input: Column, stream: Stream | None = None +) -> Column: ... +def integers_to_ipv4( + integers: Column, stream: Stream | None = None +) -> Column: ... +def is_ipv4(input: Column, stream: Stream | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx index e7c6aae4fa8..c50ed0769bf 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -1,14 +1,17 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 +from pylibcudf.utils cimport _get_stream + +from rmm.pylibrmm.stream cimport Stream __all__ = ["integers_to_ipv4", "ipv4_to_integers", "is_ipv4"] -cpdef Column ipv4_to_integers(Column input): +cpdef Column ipv4_to_integers(Column input, Stream stream=None): """ Converts IPv4 addresses into integers. @@ -19,20 +22,24 @@ cpdef Column ipv4_to_integers(Column input): input : Column Strings instance for this operation + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New uint32 column converted from strings. 
""" cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_convert_ipv4.ipv4_to_integers(input.view()) + c_result = cpp_convert_ipv4.ipv4_to_integers(input.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column integers_to_ipv4(Column integers): +cpdef Column integers_to_ipv4(Column integers, Stream stream=None): """ Converts integers into IPv4 addresses as strings. @@ -43,20 +50,24 @@ cpdef Column integers_to_ipv4(Column integers): integers : Column Integer (uint32) column to convert. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New strings column. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_convert_ipv4.integers_to_ipv4(integers.view()) + c_result = cpp_convert_ipv4.integers_to_ipv4(integers.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column is_ipv4(Column input): +cpdef Column is_ipv4(Column input, Stream stream=None): """ Returns a boolean column identifying strings in which all characters are valid for conversion to integers from IPv4 format. @@ -68,14 +79,18 @@ cpdef Column is_ipv4(Column input): input : Column Strings instance for this operation. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New column of boolean results for each string. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_convert_ipv4.is_ipv4(input.view()) + c_result = cpp_convert_ipv4.is_ipv4(input.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd index 1ba4272afa2..14eb3adddb0 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd @@ -1,11 +1,13 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.column cimport Column from pylibcudf.scalar cimport Scalar +from rmm.pylibrmm.stream cimport Stream cpdef Column format_list_column( Column input, Scalar na_rep=*, - Column separators=* + Column separators=*, + Stream stream=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi index 6ab3a4183e9..c3994f24787 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi @@ -1,4 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar @@ -7,4 +9,5 @@ def format_list_column( input: Column, na_rep: Scalar | None = None, separators: Column | None = None, + stream: Stream | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx index e2abe69e519..5c0fa8f6cdc 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -24,7 +24,8 @@ __all__ = ["format_list_column"] cpdef Column format_list_column( Column input, Scalar na_rep=None, - Column separators=None + Column separators=None, + Stream stream=None ): """ Convert a list column of strings into a formatted strings column. @@ -44,18 +45,22 @@ cpdef Column format_list_column( Strings to use for enclosing list components and separating elements. Default, ``,``, ``[``, ``]`` + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New strings column """ cdef unique_ptr[column] c_result - cdef Stream stream + cdef Stream stream_local + + stream_local = _get_stream(stream) if na_rep is None: - stream = _get_stream(None) na_rep = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view()) + cpp_make_string_scalar("".encode(), stream_local.view()) ) cdef const string_scalar* c_na_rep = ( @@ -69,7 +74,8 @@ cpdef Column format_list_column( c_result = cpp_convert_lists.format_list_column( input.view(), dereference(c_na_rep), - separators.view() + separators.view(), + stream_local.view() ) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream_local) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi index 49b8468957c..43e11dc615e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi @@ -1,6 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. + +from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column -def url_encode(input: Column) -> Column: ... -def url_decode(input: Column) -> Column: ... +def url_encode(input: Column, stream: Stream | None = None) -> Column: ... +def url_decode(input: Column, stream: Stream | None = None) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx index bd5e23bca43..24d08d1c6e6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -1,14 +1,17 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls +from pylibcudf.utils cimport _get_stream + +from rmm.pylibrmm.stream cimport Stream __all__ = ["url_decode", "url_encode"] -cpdef Column url_encode(Column input): +cpdef Column url_encode(Column input, Stream stream=None): """ Encodes each string using URL encoding. @@ -19,20 +22,24 @@ cpdef Column url_encode(Column input): input : Column Strings instance for this operation. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New strings column. 
""" cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_convert_urls.url_encode(input.view()) + c_result = cpp_convert_urls.url_encode(input.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) -cpdef Column url_decode(Column input): +cpdef Column url_decode(Column input, Stream stream=None): """ Decodes each string using URL encoding. @@ -43,14 +50,18 @@ cpdef Column url_decode(Column input): input : Column Strings instance for this operation. + stream : Stream | None + CUDA stream on which to perform the operation. + Returns ------- Column New strings column. """ cdef unique_ptr[column] c_result + stream = _get_stream(stream) with nogil: - c_result = cpp_convert_urls.url_decode(input.view()) + c_result = cpp_convert_urls.url_decode(input.view(), stream.view()) - return Column.from_libcudf(move(c_result)) + return Column.from_libcudf(move(c_result), stream) From 770c542192234ecf1c6471c1174904e0bd6129a1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Sep 2025 13:31:21 -0700 Subject: [PATCH 241/366] Avoid CategoricalColumn constructors in cuDF classic (#19837) Precursor to https://github.com/rapidsai/cudf/issues/18726. We'll want to minimize direct construction of cuDF classic column via their attributes and instead use a pylibcudf in the future Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19837 --- python/cudf/cudf/core/column/categorical.py | 9 +--- python/cudf/cudf/core/column/column.py | 48 +++++---------------- python/cudf/cudf/core/cut.py | 19 ++------ python/cudf/cudf/core/dataframe.py | 13 +----- python/cudf/cudf/core/index.py | 11 +---- python/cudf/cudf/io/parquet.py | 14 ++---- 6 files changed, 23 insertions(+), 91 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a7bdbac409f..464d63f75f0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -807,14 +807,7 @@ def _concat( len(cats), cast(cudf.core.column.numerical.NumericalColumn, codes_col), ) - return CategoricalColumn( - data=None, - size=codes_col.size, - dtype=CategoricalDtype(categories=cats), - mask=codes_col.base_mask, - offset=codes_col.offset, - children=(codes_col,), # type: ignore[arg-type] - ) + return codes_col._with_type_metadata(CategoricalDtype(categories=cats)) # type: ignore[return-value] def _with_type_metadata(self: Self, dtype: Dtype) -> Self: if isinstance(dtype, CategoricalDtype): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 162c62fe2f5..161556391eb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -73,7 +73,6 @@ is_mixed_with_object_dtype, is_pandas_nullable_extension_dtype, min_signed_type, - min_unsigned_type, np_dtypes_to_pandas_dtypes, ) from cudf.utils.scalar import pa_scalar_to_plc_scalar @@ -1851,43 +1850,18 @@ def astype(self, dtype: DtypeObj, copy: bool = False) -> ColumnBase: def as_categorical_column( self, dtype: CategoricalDtype ) -> CategoricalColumn: - ordered = dtype.ordered - - # Re-label self w.r.t. 
the provided categories if dtype._categories is not None: - cat_col = dtype._categories - codes = self._label_encoding(cats=cat_col) - codes = cudf.core.column.categorical.as_unsigned_codes( - len(cat_col), codes - ) - return cudf.core.column.categorical.CategoricalColumn( - data=None, - size=None, - dtype=dtype, - mask=self.mask, - children=(codes,), - ) - - # Categories must be unique and sorted in ascending order. - cats = self.unique().sort_values() - label_dtype = min_unsigned_type(len(cats)) - labels = self._label_encoding( - cats=cats, dtype=label_dtype, na_sentinel=pa.scalar(1) - ) - # columns include null index in factorization; remove: - if self.has_nulls(): - cats = cats.dropna() - - labels = cudf.core.column.categorical.as_unsigned_codes( - len(cats), labels - ) - return cudf.core.column.categorical.CategoricalColumn( - data=None, - size=None, - dtype=CategoricalDtype(categories=cats, ordered=ordered), - mask=self.mask, - children=(labels,), - ) + # Re-label self w.r.t. the provided categories + codes = self._label_encoding(cats=dtype._categories) + else: + # Compute categories from self + cats = self.unique().sort_values() + codes = self._label_encoding(cats=cats) + if self.has_nulls(): + # TODO: Make dropna shallow copy if there are no nulls? + cats = cats.dropna() + dtype = CategoricalDtype(categories=cats, ordered=dtype.ordered) + return codes.set_mask(self.mask)._with_type_metadata(dtype) # type: ignore[return-value] def as_numerical_column(self, dtype: np.dtype) -> NumericalColumn: raise NotImplementedError diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 767899fef04..fe02bd8b550 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -9,7 +9,6 @@ import cudf from cudf.api.types import is_list_like from cudf.core.column import as_column -from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.core.index import IntervalIndex, interval_range @@ -286,22 +285,12 @@ def cut( # should allow duplicate categories. 
return interval_labels[index_labels] - index_labels = as_unsigned_codes(len(interval_labels), index_labels) # type: ignore[arg-type] - - col = CategoricalColumn( - data=None, - size=index_labels.size, - dtype=cudf.CategoricalDtype( - categories=interval_labels, ordered=ordered - ), - mask=index_labels.base_mask, - offset=index_labels.offset, - children=(index_labels,), + categorical_index = cudf.CategoricalIndex.from_codes( + categories=interval_labels, + codes=index_labels, + ordered=ordered, ) - # we return a categorical index, as we don't have a Categorical method - categorical_index = cudf.CategoricalIndex._from_column(col) - if isinstance(orig_x, (pd.Series, cudf.Series)): # if we have a series input we return a series output res_series = cudf.Series(categorical_index, index=orig_x.index) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4571a94cd03..ab91e1a9a9b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -61,7 +61,6 @@ column_empty, concat_columns, ) -from cudf.core.column.categorical import as_unsigned_codes from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask from cudf.core.dtypes import ( @@ -8969,16 +8968,8 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): def _reassign_categories(categories, cols, col_idxs): for name, idx in zip(cols, col_idxs, strict=True): if idx in categories: - codes = as_unsigned_codes(len(categories[idx]), cols[name]) - cols[name] = CategoricalColumn( - data=None, - size=codes.size, - dtype=CategoricalDtype( - categories=categories[idx], ordered=False - ), - mask=codes.base_mask, - offset=codes.offset, - children=(codes,), + cols[name] = cols[name]._with_type_metadata( + CategoricalDtype(categories=categories[idx], ordered=False) ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1770338ff0e..e4478aa586b 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4840,15 +4840,8 @@ def from_codes( """ codes = as_column(codes, dtype=np.dtype(np.int32)) categories = as_column(categories) - cat_col = CategoricalColumn( - data=None, - size=len(codes), - dtype=cudf.CategoricalDtype( - categories=categories, ordered=ordered - ), - offset=0, - null_count=0, - children=(codes,), + cat_col = codes._with_type_metadata( + cudf.CategoricalDtype(categories=categories, ordered=ordered) ) return cls._from_column(cat_col, name=name) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 25a38a4709e..a67335fb8a6 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -24,7 +24,6 @@ from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column, column_empty -from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.core.dataframe import DataFrame from cudf.core.dtypes import ( CategoricalDtype, @@ -1277,17 +1276,10 @@ def _parquet_to_frame( partition_categories[name].index(value), length=_len, ) - codes = as_unsigned_codes( - len(partition_categories[name]), codes - ) - col = CategoricalColumn( - data=None, - size=codes.size, - dtype=CategoricalDtype( + col = codes._with_type_metadata( + CategoricalDtype( categories=partition_categories[name], ordered=False - ), - offset=codes.offset, - children=(codes,), + ) ) else: # Not building categorical columns, so From 
245f94a2919cde7d5a72e00ad5645e5f323b1a6b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 2 Sep 2025 22:32:16 +0200 Subject: [PATCH 242/366] Update Arrow bounds to >=15,<22 (#19592) Updates Arrow bounds to >=15,<22. This makes cuDF compatible with Arrow 20 and 21. Authors: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/19592 --- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- .../all_cuda-130_arch-aarch64.yaml | 2 +- .../all_cuda-130_arch-x86_64.yaml | 2 +- conda/recipes/cudf/recipe.yaml | 2 +- conda/recipes/pylibcudf/recipe.yaml | 2 +- cpp/cmake/thirdparty/get_arrow.cmake | 58 ++++++++++---- cpp/tests/CMakeLists.txt | 1 + .../quantiles/percentile_approx_test.cpp | 3 + dependencies.yaml | 6 +- python/cudf/cudf/testing/__init__.py | 1 + python/cudf/cudf/testing/testing.py | 78 +++++++++++++++++++ .../cudf/tests/input_output/test_parquet.py | 16 ++-- python/cudf/pyproject.toml | 4 +- python/pylibcudf/pyproject.toml | 8 +- 15 files changed, 151 insertions(+), 36 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 8fccfe513f8..4a296164cec 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -68,7 +68,7 @@ dependencies: - pandoc - polars>=1.28,<1.33 - pre-commit -- pyarrow>=14.0.0,<20.0.0a0 +- pyarrow>=15.0.0,<22.0.0a0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index bad1763d906..30ac023ca78 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -69,7 +69,7 @@ dependencies: - pandoc - polars>=1.28,<1.33 - pre-commit -- pyarrow>=14.0.0,<20.0.0a0 +- pyarrow>=15.0.0,<22.0.0a0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index 322fefc07da..5ab54367559 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -68,7 +68,7 @@ dependencies: - pandoc - polars>=1.28,<1.33 - pre-commit -- pyarrow>=14.0.0,<20.0.0a0 +- pyarrow>=15.0.0,<22.0.0a0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index 3e68fbf4d07..906a143b428 100644 --- a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -69,7 +69,7 @@ dependencies: - pandoc - polars>=1.28,<1.33 - pre-commit -- pyarrow>=14.0.0,<20.0.0a0 +- pyarrow>=15.0.0,<22.0.0a0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index 2389ed684c1..98013e84597 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -73,7 +73,7 @@ requirements: - numba-cuda 
>=0.19.1,<0.20.0a0 - numba >=0.60.0,<0.62.0a0 - numpy >=1.23,<3.0a0 - - pyarrow>=14.0.0,<20.0.0a0 + - pyarrow>=15.0.0,<22.0.0a0 - libcudf =${{ version }} - pylibcudf =${{ version }} - ${{ pin_compatible("rmm", upper_bound="x.x") }} diff --git a/conda/recipes/pylibcudf/recipe.yaml b/conda/recipes/pylibcudf/recipe.yaml index 4273baf5fd3..bfaec91b72c 100644 --- a/conda/recipes/pylibcudf/recipe.yaml +++ b/conda/recipes/pylibcudf/recipe.yaml @@ -77,7 +77,7 @@ requirements: - packaging run_constraints: - numpy >=1.23,<3.0a0 - - pyarrow>=14.0.0,<20.0.0a0 + - pyarrow>=15.0.0,<22.0.0a0 ignore_run_exports: from_package: - cuda-cudart-dev diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 8293f96fb5b..1a52d263ea1 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -23,17 +23,26 @@ include_guard(GLOBAL) # This function finds arrow and sets any additional necessary environment variables. -function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_PARQUET) +function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_PARQUET + ENABLE_COMPUTE +) if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE ) - set(ARROW_LIBRARIES - arrow_static - PARENT_SCOPE - ) + if(ENABLE_COMPUTE) + set(ARROW_LIBRARIES + arrow_static arrow_compute_static + PARENT_SCOPE + ) + else() + set(ARROW_LIBRARIES + arrow_static + PARENT_SCOPE + ) + endif() return() endif() else() @@ -42,10 +51,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P TRUE PARENT_SCOPE ) - set(ARROW_LIBRARIES - arrow_shared - PARENT_SCOPE - ) + if(ENABLE_COMPUTE) + set(ARROW_LIBRARIES + arrow_shared arrow_compute_shared + PARENT_SCOPE + ) + else() + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + endif() return() endif() endif() @@ -92,7 +108,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P EXCLUDE_FROM_ALL ${EXCLUDE_FROM_ALL} OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" "ARROW_ACERO ON" - "ARROW_COMPUTE ON" + "ARROW_COMPUTE ${ENABLE_COMPUTE}" "ARROW_IPC ON" "ARROW_DATASET ON" "ARROW_WITH_BACKTRACE ON" @@ -126,9 +142,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P ) if(BUILD_STATIC) - set(ARROW_LIBRARIES arrow_static) + if(ENABLE_COMPUTE) + set(ARROW_LIBRARIES arrow_static arrow_compute_static) + else() + set(ARROW_LIBRARIES arrow_static) + endif() else() - set(ARROW_LIBRARIES arrow_shared) + if(ENABLE_COMPUTE) + set(ARROW_LIBRARIES arrow_shared arrow_compute_shared) + else() + set(ARROW_LIBRARIES arrow_shared) + endif() endif() # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. @@ -248,7 +272,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P BUILD Arrow VERSION ${VERSION} EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared arrow_static + GLOBAL_TARGETS arrow_shared arrow_static arrow_compute_static arrow_compute_shared NAMESPACE cudf:: FINAL_CODE_BLOCK arrow_code_string ) @@ -357,7 +381,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. 
- 19.0.0 + 21.0.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() @@ -376,7 +400,11 @@ if(NOT DEFINED CUDF_ENABLE_ARROW_PARQUET) set(CUDF_ENABLE_ARROW_PARQUET OFF) endif() +if(NOT DEFINED CUDF_ENABLE_ARROW_COMPUTE) + set(CUDF_ENABLE_ARROW_COMPUTE OFF) +endif() + find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_EXCLUDE_ARROW_FROM_ALL} - ${CUDF_ENABLE_ARROW_PARQUET} + ${CUDF_ENABLE_ARROW_PARQUET} ${CUDF_ENABLE_ARROW_COMPUTE} ) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 63b41e55f51..d0a8763a5b0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -91,6 +91,7 @@ endfunction() # No need to install Arrow libs when only the final test executables are shipped. set(CUDF_EXCLUDE_ARROW_FROM_ALL ON) +set(CUDF_ENABLE_ARROW_COMPUTE ON) include(../cmake/thirdparty/get_arrow.cmake) # ################################################################################################## diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index a65f4766159..c5c481ded3b 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -33,12 +33,15 @@ #include #include +#include namespace { std::unique_ptr arrow_percentile_approx(cudf::column_view const& _values, int delta, std::vector const& percentages) { + static auto const _arrow_init_status = arrow::compute::Initialize(); + EXPECT_TRUE(_arrow_init_status.ok()); // sort the incoming values using the same settings that groupby does. // this is a little weak because null_order::AFTER is hardcoded internally to groupby. cudf::table_view t({_values}); diff --git a/dependencies.yaml b/dependencies.yaml index 55c9f7cc446..de0115a9e6d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -511,7 +511,7 @@ dependencies: common: - output_types: [conda] packages: - - pyarrow>=14.0.0,<20.0.0a0 + - pyarrow>=15.0.0,<22.0.0a0 - output_types: [requirements, pyproject] packages: # pyarrow 17.0.0 wheels have a subtle issue around threading that @@ -519,8 +519,8 @@ dependencies: # be highly dependent on the exact build configuration, so we'll just # avoid 17.0.0 for now unless we observe similar issues in future # releases as well. 
- - pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64' - - pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64' + - pyarrow>=15.0.0,<22.0.0a0; platform_machine=='x86_64' + - pyarrow>=15.0.0,<22.0.0a0,!=17.0.0; platform_machine=='aarch64' cuda_version: specific: - output_types: conda diff --git a/python/cudf/cudf/testing/__init__.py b/python/cudf/cudf/testing/__init__.py index b03e5bf4375..53cdc183c47 100644 --- a/python/cudf/cudf/testing/__init__.py +++ b/python/cudf/cudf/testing/__init__.py @@ -2,6 +2,7 @@ from cudf.testing import narwhals_test_plugin from cudf.testing.testing import ( + assert_arrow_table_equal, assert_eq, assert_frame_equal, assert_groupby_results_equal, diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 048a54a76f8..1b2595072be 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -7,12 +7,55 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa from pandas import testing as tm import cudf from cudf.core.missing import NA, NaT from cudf.utils.dtypes import CUDF_STRING_DTYPE, is_dtype_obj_numeric +pa_types = pa.types + + +def _is_string_view(dtype: pa.DataType) -> bool: + return hasattr(pa_types, "is_string_view") and pa_types.is_string_view( + dtype + ) + + +def _map_string_view_to_string(dtype: pa.DataType) -> pa.DataType: + """Convert string_view -> string""" + if _is_string_view(dtype): + return pa.string() + if pa_types.is_list(dtype): + return pa.list_(_map_string_view_to_string(dtype.value_type)) + return dtype + + +def _string_view_to_string_schema(schema: pa.Schema) -> pa.Schema: + return pa.schema( + [ + pa.field( + f.name, + _map_string_view_to_string(f.type), + nullable=f.nullable, + metadata=f.metadata, + ) + for f in schema + ], + metadata=schema.metadata, + ) + + +def _string_view_to_string(obj): + """Cast string_view -> string""" + if isinstance(obj, pa.Table): + return pa.Table.from_arrays( + obj.columns, schema=_string_view_to_string_schema(obj.schema) + ) + + return obj + def dtype_can_compare_equal_to_other(dtype): # return True if values of this dtype can compare @@ -92,6 +135,41 @@ def _check_types( ) +def assert_arrow_table_equal(left: pa.Table, right: pa.Table) -> None: + """ + Check if two pyarrow Tables are equal. + + Parameters + ---------- + left : pyarrow.Table + Left table to compare. + right : pyarrow.Table + Right table to compare. + + Raises + ------ + AssertionError + + Notes + ----- + PyArrow 21+ has shifted toward using ``string_view`` + internally in more places, whereas previous versions + used ``string``. This change causes schema equality + checks in cuDF tests to fail. To make our tests stable + and future-proof against Arrow changing representations + over time, we cast all ``string_view`` types to ``string`` + before comparison. 
+ """ + left = _string_view_to_string(left) + right = _string_view_to_string(right) + try: + assert left.equals(right) + except AssertionError: + raise_assert_detail( + "pyarrow.Table", "Arrow Tables are different", left, right + ) + + def assert_column_equal( left, right, diff --git a/python/cudf/cudf/tests/input_output/test_parquet.py b/python/cudf/cudf/tests/input_output/test_parquet.py index c57f91e6c42..98de571ecf9 100644 --- a/python/cudf/cudf/tests/input_output/test_parquet.py +++ b/python/cudf/cudf/tests/input_output/test_parquet.py @@ -29,7 +29,11 @@ ParquetWriter, merge_parquet_filemetadata, ) -from cudf.testing import assert_eq, dataset_generator as dg +from cudf.testing import ( + assert_arrow_table_equal, + assert_eq, + dataset_generator as dg, +) from cudf.testing._utils import TIMEDELTA_TYPES, set_random_null_mask_inplace @@ -1110,7 +1114,7 @@ def test_parquet_reader_struct_basic(tmp_path, data): pa.parquet.write_table(expect, fname) assert os.path.exists(fname) got = cudf.read_parquet(fname) - assert expect.equals(got.to_arrow()) + assert_arrow_table_equal(expect, got.to_arrow()) def select_columns_params(): @@ -1188,7 +1192,7 @@ def test_parquet_reader_struct_select_columns(data, columns): expect = pq.ParquetFile(buff).read(columns=columns) got = cudf.read_parquet(buff, columns=columns) - assert expect.equals(got.to_arrow()) + assert_arrow_table_equal(expect, got.to_arrow()) def test_parquet_reader_struct_los_large(tmp_path): @@ -1205,7 +1209,7 @@ def test_parquet_reader_struct_los_large(tmp_path): pa.parquet.write_table(expect, fname) assert os.path.exists(fname) got = cudf.read_parquet(fname) - assert expect.equals(got.to_arrow()) + assert_arrow_table_equal(expect, got.to_arrow()) @pytest.mark.parametrize( @@ -1243,7 +1247,7 @@ def string_list_gen_wrapped(x, y): pa.parquet.write_table(expect, fname) assert os.path.exists(fname) got = cudf.read_parquet(fname) - assert expect.equals(got.to_arrow()) + assert_arrow_table_equal(expect, got.to_arrow()) def test_parquet_reader_v2(tmp_path, simple_pdf): @@ -2403,7 +2407,7 @@ def test_parquet_writer_list_chunked(tmp_path, store_schema): got = pq.read_table(fname) # compare with pyarrow since pandas doesn't # have a list or struct dtype - assert expect.to_arrow().equals(got) + assert_arrow_table_equal(expect.to_arrow(), got) def test_parquet_nullable_boolean(tmp_path, engine): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 69c889c0cc8..f97bd6a7eb3 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -29,8 +29,8 @@ dependencies = [ "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.4.0dev0", - "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", + "pyarrow>=15.0.0,<22.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=15.0.0,<22.0.0a0; platform_machine=='x86_64'", "pylibcudf==25.10.*,>=0.0.0a0", "rich", "rmm==25.10.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index a76bbc36bae..811af529493 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -47,16 +47,16 @@ test = [ "numba-cuda[cu13]>=0.19.1,<0.20.0a0", "numba>=0.60.0,<0.62.0a0", "pandas", - "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", + "pyarrow>=15.0.0,<22.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=15.0.0,<22.0.0a0; platform_machine=='x86_64'", "pytest", "pytest-cov", "pytest-xdist", 
"xxhash", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. pyarrow = [ - "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", + "pyarrow>=15.0.0,<22.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=15.0.0,<22.0.0a0; platform_machine=='x86_64'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. numpy = [ "numpy>=1.23,<3.0a0", From 3419b88d6673f79c2abd8803a6a08ca7f594a9f9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:01:39 -0700 Subject: [PATCH 243/366] Move test_timedelta/string/sorting/list/datetime.py to new cudf classic directory structure (#19723) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19723 --- python/cudf/cudf/tests/conftest.py | 6 + .../tests/dataframe/indexing/test_setitem.py | 56 +- .../tests/dataframe/methods/test_repeat.py | 18 + .../cudf/tests/dataframe/test_attributes.py | 17 + .../cudf/tests/groupby/test_reductions.py | 90 ++ .../datetimeindex/methods/test_reductions.py | 19 + .../indexes/multiindex/methods/test_repeat.py | 17 + .../timedeltaindex/methods/test_reductions.py | 26 + .../indexes/timedeltaindex/test_binops.py | 195 ++++ .../cudf/tests/private_objects/test_column.py | 107 +++ .../cudf/tests/series/accessors/test_list.py | 33 + .../tests/series/indexing/test_getitem.py | 22 + .../cudf/tests/series/indexing/test_iloc.py | 61 ++ .../tests/series/indexing/test_setitem.py | 434 +++++++++ .../cudf/tests/series/methods/test_argsort.py | 17 + .../cudf/tests/series/methods/test_astype.py | 391 +++++++- .../series/methods/test_nlargest_nsmallest.py | 39 + .../cudf/tests/series/methods/test_repeat.py | 27 + .../tests/series/methods/test_sort_values.py | 29 + .../tests/series/methods/test_to_pandas.py | 19 + .../cudf/tests/series/methods/test_unique.py | 20 + python/cudf/cudf/tests/series/test_binops.py | 851 +++++++++++++++++- .../cudf/tests/series/test_constructors.py | 28 + .../cudf/cudf/tests/series/test_np_ufuncs.py | 48 +- python/cudf/cudf/tests/series/test_repr.py | 20 + python/cudf/cudf/tests/test_categorical.py | 45 - python/cudf/cudf/tests/test_contains.py | 49 - python/cudf/cudf/tests/test_copying.py | 439 --------- python/cudf/cudf/tests/test_datetime.py | 422 --------- python/cudf/cudf/tests/test_decimal.py | 373 -------- python/cudf/cudf/tests/test_list.py | 25 - python/cudf/cudf/tests/test_serialize.py | 32 +- python/cudf/cudf/tests/test_sorting.py | 109 --- python/cudf/cudf/tests/test_string.py | 644 ------------- python/cudf/cudf/tests/test_timedelta.py | 644 ------------- 35 files changed, 2599 insertions(+), 2773 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_repeat.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/methods/test_reductions.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_repeat.py create mode 100644 python/cudf/cudf/tests/indexes/timedeltaindex/methods/test_reductions.py create mode 100644 python/cudf/cudf/tests/indexes/timedeltaindex/test_binops.py create mode 100644 
python/cudf/cudf/tests/series/methods/test_argsort.py create mode 100644 python/cudf/cudf/tests/series/methods/test_repeat.py create mode 100644 python/cudf/cudf/tests/series/methods/test_sort_values.py delete mode 100644 python/cudf/cudf/tests/test_contains.py delete mode 100644 python/cudf/cudf/tests/test_copying.py delete mode 100644 python/cudf/cudf/tests/test_datetime.py delete mode 100644 python/cudf/cudf/tests/test_decimal.py delete mode 100644 python/cudf/cudf/tests/test_list.py delete mode 100644 python/cudf/cudf/tests/test_sorting.py delete mode 100644 python/cudf/cudf/tests/test_string.py delete mode 100644 python/cudf/cudf/tests/test_timedelta.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index f4157430185..8f6155bfb2c 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -461,6 +461,12 @@ def datetime_types_as_str(request): return request.param +@pytest.fixture +def datetime_types_as_str2(datetime_types_as_str): + """Used for testing cartesian product of datetime_types_as_str""" + return datetime_types_as_str + + @pytest.fixture(params=timedelta_types) def timedelta_types_as_str(request): """ diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py index 0df3cdddb1f..c566be17d4a 100644 --- a/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py +++ b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py @@ -105,11 +105,6 @@ def test_setitem_dataframe_series_inplace(index): assert_eq(expected, gdf) -def test_setitem_datetime(): - df = cudf.DataFrame({"date": pd.date_range("20010101", "20010105").values}) - assert df.date.dtype.kind == "M" - - def test_listcol_setitem_retain_dtype(): df = cudf.DataFrame( {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]} @@ -123,3 +118,54 @@ def test_listcol_setitem_retain_dtype(): # prior to this fix: https://github.com/rapidsai/cudf/pull/10151/ df2 = df1.copy() assert df2["a"].dtype == df["a"].dtype + + +def test_setitem_datetime(): + df = cudf.DataFrame({"date": pd.date_range("20010101", "20010105").values}) + assert df.date.dtype.kind == "M" + + +@pytest.mark.parametrize("scalar", ["a", None]) +def test_string_set_scalar(scalar): + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + } + ) + gdf = cudf.DataFrame.from_pandas(pdf) + + pdf["b"] = "a" + gdf["b"] = "a" + + assert_eq(pdf["b"], gdf["b"]) + assert_eq(pdf, gdf) + + +def test_dataframe_cow_slice_setitem(): + with cudf.option_context("copy_on_write", True): + df = cudf.DataFrame( + {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} + ) + slice_df = df[1:4] + + assert_eq( + slice_df, + cudf.DataFrame( + {"a": [11, 12, 13], "b": [30, 40, 50]}, index=[1, 2, 3] + ), + ) + + slice_df["a"][2] = 1111 + + assert_eq( + slice_df, + cudf.DataFrame( + {"a": [11, 1111, 13], "b": [30, 40, 50]}, index=[1, 2, 3] + ), + ) + assert_eq( + df, + cudf.DataFrame( + {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} + ), + ) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_repeat.py b/python/cudf/cudf/tests/dataframe/methods/test_repeat.py new file mode 100644 index 00000000000..6dbb012c03a --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_repeat.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
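A note on the new repeat test below: pandas defines repeat() on Series and Index but not on DataFrame, which is why the file compares pd.Series.repeat() against a single cudf column. A minimal, pandas-only sketch of the element-wise repeat semantics being relied on (illustrative, not part of the patch):

import numpy as np
import pandas as pd

s = pd.Series([1, 1, 2, 2], name="a")
repeats = np.array([2, 0, 1, 3])
out = s.repeat(repeats)
# Element i is emitted repeats[i] times, and its index label repeats with it.
assert out.tolist() == [1, 1, 2, 2, 2, 2]
assert out.index.tolist() == [0, 0, 2, 3, 3, 3]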
+ +import numpy as np +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_repeat_dataframe(): + rng = np.random.default_rng(seed=0) + psr = pd.DataFrame({"a": [1, 1, 2, 2]}) + gsr = cudf.from_pandas(psr) + repeats = rng.integers(10, size=4) + + # pd.DataFrame doesn't have repeat() so as a workaround, we are + # comparing pd.Series.repeat() with cudf.DataFrame.repeat()['a'] + assert_eq(psr["a"].repeat(repeats), gsr.repeat(repeats)["a"]) diff --git a/python/cudf/cudf/tests/dataframe/test_attributes.py b/python/cudf/cudf/tests/dataframe/test_attributes.py index d2cef2d8bdc..3a3c43a15c7 100644 --- a/python/cudf/cudf/tests/dataframe/test_attributes.py +++ b/python/cudf/cudf/tests/dataframe/test_attributes.py @@ -180,3 +180,20 @@ def test_ndim(): pdf = pd.DataFrame({"x": range(5), "y": range(5, 10)}) gdf = cudf.DataFrame.from_pandas(pdf) assert pdf.ndim == gdf.ndim + + +@pytest.mark.parametrize( + "index", + [ + ["a", "b", "c", "d", "e"], + np.array(["a", "b", "c", "d", "e"]), + pd.Index(["a", "b", "c", "d", "e"], name="name"), + ], +) +def test_string_index(index): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 5))) + gdf = cudf.DataFrame.from_pandas(pdf) + pdf.index = index + gdf.index = index + assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py b/python/cudf/cudf/tests/groupby/test_reductions.py index f54c0a79337..f4d248f79c4 100644 --- a/python/cudf/cudf/tests/groupby/test_reductions.py +++ b/python/cudf/cudf/tests/groupby/test_reductions.py @@ -985,3 +985,93 @@ def test_group_by_reduce_numeric_only(by, data, groupby_reduction_methods): numeric_only=True ) assert_eq(expected, result) + + +@pytest.mark.parametrize( + "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] +) +def test_string_groupby_key(str_data): + num_keys = 2 + other_data = [1, 2, 3, 4, 5][: len(str_data)] + + pdf = pd.DataFrame( + { + 0: pd.Series(str_data, dtype="str"), + 1: pd.Series(str_data, dtype="str"), + "a": other_data, + } + ) + gdf = cudf.DataFrame( + { + 0: cudf.Series(str_data, dtype="str"), + 1: cudf.Series(str_data, dtype="str"), + "a": other_data, + } + ) + + expect = pdf.groupby(list(range(num_keys)), as_index=False).count() + got = gdf.groupby(list(range(num_keys)), as_index=False).count() + + expect = expect.sort_values([0]).reset_index(drop=True) + got = got.sort_values([0]).reset_index(drop=True) + + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] +) +@pytest.mark.parametrize("agg", ["count", "max", "min"]) +def test_string_groupby_non_key(str_data, agg): + num_cols = 2 + other_data = [1, 2, 3, 4, 5][: len(str_data)] + + pdf = pd.DataFrame( + { + 0: pd.Series(str_data, dtype="str"), + 1: pd.Series(str_data, dtype="str"), + "a": other_data, + } + ) + gdf = cudf.DataFrame( + { + 0: cudf.Series(str_data, dtype="str"), + 1: cudf.Series(str_data, dtype="str"), + "a": other_data, + } + ) + + expect = getattr(pdf.groupby("a", as_index=False), agg)() + got = getattr(gdf.groupby("a", as_index=False), agg)() + + expect = expect.sort_values(["a"]).reset_index(drop=True) + got = got.sort_values(["a"]).reset_index(drop=True) + + if agg in ["min", "max"] and len(expect) == 0 and len(got) == 0: + for i in range(num_cols): + expect[i] = expect[i].astype("str") + + assert_eq(expect, got, check_dtype=False) + + +def test_string_groupby_key_index(): + str_data = ["a", "b", "c", "d", "e"] + other_data = 
[1, 2, 3, 4, 5] + + pdf = pd.DataFrame( + { + "a": pd.Series(str_data, dtype="str"), + "b": other_data, + } + ) + gdf = cudf.DataFrame( + { + "a": cudf.Series(str_data, dtype="str"), + "b": other_data, + } + ) + + expect = pdf.groupby("a", sort=True).count() + got = gdf.groupby("a", sort=True).count() + + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_reductions.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_reductions.py new file mode 100644 index 00000000000..be62e12edf2 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_reductions.py @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf + + +@pytest.mark.parametrize( + "method, kwargs", + [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], +) +def test_dti_reduction(method, kwargs): + pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") + cudf_dti = cudf.from_pandas(pd_dti) + + result = getattr(cudf_dti, method)(**kwargs) + expected = getattr(pd_dti, method)(**kwargs) + assert result == expected diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_repeat.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_repeat.py new file mode 100644 index 00000000000..b4ea86c8841 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_repeat.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_repeat_index(): + rng = np.random.default_rng(seed=0) + arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] + psr = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) + gsr = cudf.from_pandas(psr) + repeats = rng.integers(10, size=4) + + assert_eq(psr.repeat(repeats), gsr.repeat(repeats)) diff --git a/python/cudf/cudf/tests/indexes/timedeltaindex/methods/test_reductions.py b/python/cudf/cudf/tests/indexes/timedeltaindex/methods/test_reductions.py new file mode 100644 index 00000000000..1e1c9b89166 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/timedeltaindex/methods/test_reductions.py @@ -0,0 +1,26 @@ +# Copyright (c) 2020-2025, NVIDIA CORPORATION. + + +import pandas as pd +import pytest + +import cudf + + +@pytest.mark.parametrize( + "method, kwargs", + [ + ["sum", {}], + ["mean", {}], + ["median", {}], + ["std", {}], + ["std", {"ddof": 0}], + ], +) +def test_tdi_reductions(method, kwargs): + pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) + cudf_tdi = cudf.from_pandas(pd_tdi) + + result = getattr(pd_tdi, method)(**kwargs) + expected = getattr(cudf_tdi, method)(**kwargs) + assert result == expected diff --git a/python/cudf/cudf/tests/indexes/timedeltaindex/test_binops.py b/python/cudf/cudf/tests/indexes/timedeltaindex/test_binops.py new file mode 100644 index 00000000000..eb23ae5041f --- /dev/null +++ b/python/cudf/cudf/tests/indexes/timedeltaindex/test_binops.py @@ -0,0 +1,195 @@ +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
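For reference, the reduction tests above rely on fixed pandas result types that cudf mirrors: DatetimeIndex.mean() yields a Timestamp, DatetimeIndex.std() yields a Timedelta, and TimedeltaIndex reductions stay in timedelta space. A short pandas-only sketch (illustrative only):

import pandas as pd

dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"])
assert isinstance(dti.mean(), pd.Timestamp)
assert isinstance(dti.std(), pd.Timedelta)

tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"])
assert tdi.mean() == pd.Timedelta("2 days")
# ddof=0 requests the population standard deviation, which is smaller
# than the default sample estimate (ddof=1) for non-constant data.
assert tdi.std(ddof=0) < tdi.std()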
+ +import datetime + +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data_non_overflow", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +def test_timedelta_index_datetime_index_ops( + data_non_overflow, datetime_types_as_str, timedelta_types_as_str +): + gdt = cudf.Index(data_non_overflow, dtype=datetime_types_as_str) + gtd = cudf.Index(data_non_overflow, dtype=timedelta_types_as_str) + + pdt = gdt.to_pandas() + ptd = gtd.to_pandas() + + assert_eq(gdt - gtd, pdt - ptd) + assert_eq(gdt + gtd, pdt + ptd) + + +@pytest.mark.parametrize( + "datetime_data,timedelta_data", + [ + ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), + ([1000000, 200000, None], [1000000, 200000, None]), + ([], []), + ([None], [None]), + ( + [12, 12, 22, 343, 4353534, 435342], + [12, 12, 22, 343, 4353534, 435342], + ), + (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), + (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), + ([1000000, 200000, 3000000], [200000, 34543, 3000000]), + ([1000000, 200000, None], [1000000, 200000, 3000000]), + ([None], [1]), + ( + [12, 12, 22, 343, 4353534, 435342], + [None, 1, 220, 3, 34, 4353423287], + ), + ( + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [11, 1132324, 2322323111, 23341, 2434, 332, 323], + ), + ( + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [11, 1132324, 2322323111, 23341, 2434, 332, 323], + ), + ( + [11, 1132324, 2322323111, 23341, 2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ), + ( + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ), + ], +) +def test_timedelta_datetime_index_ops_misc( + datetime_data, + timedelta_data, + datetime_types_as_str, + timedelta_types_as_str, +): + gdt = cudf.Index(datetime_data, dtype=datetime_types_as_str) + gtd = cudf.Index(timedelta_data, dtype=timedelta_types_as_str) + + pdt = gdt.to_pandas() + ptd = gtd.to_pandas() + + assert_eq(gdt - gtd, pdt - ptd) + assert_eq(gdt + gtd, pdt + ptd) + + +@pytest.mark.parametrize( + "data_non_overflow", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +@pytest.mark.parametrize( + "other_scalars", + [ + pd.Timedelta(1513393355.5, unit="s"), + pd.Timedelta(34765, unit="D"), + datetime.timedelta(days=768), + datetime.timedelta(seconds=768), + datetime.timedelta(microseconds=7), + datetime.timedelta(minutes=447), + datetime.timedelta(hours=447), + datetime.timedelta(weeks=734), + np.timedelta64(4, "s"), + np.timedelta64(456, "D"), + np.timedelta64(46, "h"), + np.timedelta64("nat"), + 
np.timedelta64(1, "s"), + np.timedelta64(1, "ms"), + np.timedelta64(1, "us"), + np.timedelta64(1, "ns"), + ], +) +@pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning:pandas") +def test_timedelta_index_ops_with_scalars( + request, + data_non_overflow, + other_scalars, + timedelta_types_as_str, + arithmetic_op_method, +): + if arithmetic_op_method not in ("add", "sub", "truediv", "floordiv"): + pytest.skip(f"Test not applicable for {arithmetic_op_method}") + + gtdi = cudf.Index(data=data_non_overflow, dtype=timedelta_types_as_str) + ptdi = gtdi.to_pandas() + + if arithmetic_op_method == "add": + expected = ptdi + other_scalars + actual = gtdi + other_scalars + elif arithmetic_op_method == "sub": + expected = ptdi - other_scalars + actual = gtdi - other_scalars + elif arithmetic_op_method == "truediv": + expected = ptdi / other_scalars + actual = gtdi / other_scalars + elif arithmetic_op_method == "floordiv": + expected = ptdi // other_scalars + actual = gtdi // other_scalars + + assert_eq(expected, actual) + + if arithmetic_op_method == "add": + expected = other_scalars + ptdi + actual = other_scalars + gtdi + elif arithmetic_op_method == "sub": + expected = other_scalars - ptdi + actual = other_scalars - gtdi + elif arithmetic_op_method == "truediv": + expected = other_scalars / ptdi + actual = other_scalars / gtdi + elif arithmetic_op_method == "floordiv": + expected = other_scalars // ptdi + actual = other_scalars // gtdi + + # Division by zero for datetime or timedelta is + # dubiously defined in both pandas (Any // 0 -> 0 in + # pandas) and cuDF (undefined behaviour) + request.applymarker( + pytest.mark.xfail( + condition=( + arithmetic_op_method == "floordiv" + and 0 in ptdi.astype("int") + and np.timedelta64(other_scalars).item() is not None + ), + reason="Related to https://github.com/rapidsai/cudf/issues/5938", + ) + ) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/private_objects/test_column.py b/python/cudf/cudf/tests/private_objects/test_column.py index 4d2a7eaea41..f7c7b0808d4 100644 --- a/python/cudf/cudf/tests/private_objects/test_column.py +++ b/python/cudf/cudf/tests/private_objects/test_column.py @@ -1,4 +1,6 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. 
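The xfail above ("Any // 0 -> 0 in pandas") refers to numpy's timedelta64 semantics, which pandas inherits: floor division by a zero timedelta emits a RuntimeWarning and produces 0 rather than raising. A small numpy-only sketch of that behaviour (illustrative, assuming current numpy semantics):

import warnings

import numpy as np

one = np.timedelta64(1, "s")
zero = np.timedelta64(0, "s")
with warnings.catch_warnings():
    warnings.simplefilter("ignore", RuntimeWarning)
    # No exception is raised; the result is simply 0.
    assert one // zero == 0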
+import sys +from decimal import Decimal import cupy as cp import numpy as np @@ -7,8 +9,16 @@ import pytest from numba import cuda +import rmm + import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.core.buffer import as_buffer from cudf.core.column.column import _can_values_be_equal, as_column +from cudf.core.column.decimal import Decimal32Column, Decimal64Column from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal @@ -639,3 +649,100 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): def test__can_values_be_equal(left, right, expected): assert _can_values_be_equal(left, right) is expected assert _can_values_be_equal(right, left) is expected + + +def test_string_no_children_properties(): + empty_col = cudf.core.column.StringColumn( + as_buffer(rmm.DeviceBuffer(size=0)), + size=0, + dtype=np.dtype("object"), + children=(), + ) + assert empty_col.base_children == () + assert empty_col.base_size == 0 + + assert empty_col.children == () + assert empty_col.size == 0 + + assert sys.getsizeof(empty_col) >= 0 # Accounts for Python GC overhead + + +def test_string_int_to_ipv4(): + gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype( + "uint32" + ) + expected = cudf.Series( + ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] + ) + + got = cudf.Series._from_column(gsr._column.int2ip()) + + assert_eq(expected, got) + + +def test_string_int_to_ipv4_dtype_fail(numeric_types_as_str): + if numeric_types_as_str == "uint32": + pytest.skip(f"int2ip passes with {numeric_types_as_str}") + gsr = cudf.Series([1, 2, 3, 4, 5]).astype(numeric_types_as_str) + with pytest.raises(TypeError): + gsr._column.int2ip() + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas.", +) +def test_datetime_can_cast_safely(): + sr = cudf.Series( + ["1679-01-01", "2000-01-31", "2261-01-01"], dtype="datetime64[ms]" + ) + assert sr._column.can_cast_safely(np.dtype("datetime64[ns]")) + + sr = cudf.Series( + ["1677-01-01", "2000-01-31", "2263-01-01"], dtype="datetime64[ms]" + ) + + assert sr._column.can_cast_safely(np.dtype("datetime64[ns]")) is False + + +@pytest.mark.parametrize( + "data_", + [ + [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], + [1], + [-1], + [1, 2, 3, 4], + [42, 17, 41], + [1, 2, None, 4], + [None, None, None], + [], + ], +) +@pytest.mark.parametrize( + "typ_", + [ + pa.decimal128(precision=4, scale=2), + pa.decimal128(precision=5, scale=3), + pa.decimal128(precision=6, scale=4), + ], +) +@pytest.mark.parametrize("col", [Decimal32Column, Decimal64Column]) +def test_round_trip_decimal_column(data_, typ_, col): + pa_arr = pa.array(data_, type=typ_) + col_32 = col.from_arrow(pa_arr) + assert pa_arr.equals(col_32.to_arrow()) + + +def test_from_arrow_max_precision_decimal64(): + with pytest.raises(ValueError): + Decimal64Column.from_arrow( + pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19)) + ) + + +def test_from_arrow_max_precision_decimal32(): + with pytest.raises(ValueError): + Decimal32Column.from_arrow( + pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=10)) + ) diff --git a/python/cudf/cudf/tests/series/accessors/test_list.py b/python/cudf/cudf/tests/series/accessors/test_list.py index 18f86dfd7c2..a6487d496d8 100644 --- a/python/cudf/cudf/tests/series/accessors/test_list.py +++ 
b/python/cudf/cudf/tests/series/accessors/test_list.py @@ -534,3 +534,36 @@ def test_list_methods_setattr(): with pytest.raises(AttributeError): ser.list.a = "b" + + +def test_lists_contains(numeric_types_as_str): + inner_data = np.array([1, 2, 3], dtype=numeric_types_as_str) + + data = cudf.Series([inner_data]) + + contained_scalar = inner_data.dtype.type(2) + not_contained_scalar = inner_data.dtype.type(42) + + assert data.list.contains(contained_scalar)[0] + assert not data.list.contains(not_contained_scalar)[0] + + +def test_lists_contains_datetime(temporal_types_as_str): + inner_data = np.array([1, 2, 3], dtype=temporal_types_as_str) + + unit, _ = np.datetime_data(inner_data.dtype) + + data = cudf.Series([inner_data]) + + contained_scalar = inner_data.dtype.type(2, unit) + not_contained_scalar = inner_data.dtype.type(42, unit) + + assert data.list.contains(contained_scalar)[0] + assert not data.list.contains(not_contained_scalar)[0] + + +def test_lists_contains_bool(): + data = cudf.Series([[True, True, True]]) + + assert data.list.contains(True)[0] + assert not data.list.contains(False)[0] diff --git a/python/cudf/cudf/tests/series/indexing/test_getitem.py b/python/cudf/cudf/tests/series/indexing/test_getitem.py index aecc70335f1..37e0c10618f 100644 --- a/python/cudf/cudf/tests/series/indexing/test_getitem.py +++ b/python/cudf/cudf/tests/series/indexing/test_getitem.py @@ -216,3 +216,25 @@ def test_datetime_getitem_na(): def test_timedelta_getitem_na(): s = cudf.Series([1, 2, None, 3], dtype="timedelta64[ns]") assert s[2] is cudf.NaT + + +def test_string_table_view_creation(): + data = ["hi"] * 25 + [None] * 2027 + psr = pd.Series(data) + gsr = cudf.Series.from_pandas(psr) + + expect = psr[:1] + got = gsr[:1] + + assert_eq(expect, got) + + +def test_string_slice_with_mask(): + actual = cudf.Series(["hi", "hello", None]) + expected = actual[0:3] + + assert actual._column.base_size == 3 + assert_eq(actual._column.base_size, expected._column.base_size) + assert_eq(actual._column.null_count, expected._column.null_count) + + assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/series/indexing/test_iloc.py b/python/cudf/cudf/tests/series/indexing/test_iloc.py index 00bc0422c7d..838091cff3e 100644 --- a/python/cudf/cudf/tests/series/indexing/test_iloc.py +++ b/python/cudf/cudf/tests/series/indexing/test_iloc.py @@ -1,6 +1,9 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
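test_string_slice_with_mask above depends on cudf slices being zero-copy views: the sliced column keeps the parent's base buffers, so base_size and null_count are unchanged. A condensed sketch of that invariant, reusing only the attributes the test itself touches (illustrative):

import cudf

s = cudf.Series(["hi", "hello", None])
view = s[0:3]
# The full-range slice shares the parent's base buffers instead of copying.
assert view._column.base_size == s._column.base_size == 3
assert view._column.null_count == s._column.null_count == 1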
+import cupy as cp
 import numpy as np
+import pandas as pd
+import pyarrow as pa
 import pytest

 import cudf
@@ -32,3 +35,61 @@ def test_struct_empty_children_slice(indices, values):
     actual = s.iloc[indices]
     expect = cudf.Series(values[indices], index=range(len(values))[indices])
     assert_eq(actual, expect)
+
+
+@pytest.mark.parametrize(
+    "item",
+    [
+        0,
+        2,
+        4,
+        slice(1, 3),
+        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+        [0, 1, 2, 3, 4, 4, 3, 2, 1, 0],
+        np.array([0, 1, 2, 3, 4]),
+        cp.asarray(np.array([0, 1, 2, 3, 4])),
+    ],
+)
+@pytest.mark.parametrize("data", [["a"] * 5, ["a", None] * 3, [None] * 5])
+def test_string_get_item(data, item):
+    ps = pd.Series(data, dtype="str", name="nice name")
+    gs = cudf.Series(data, dtype="str", name="nice name")
+
+    got = gs.iloc[item]
+    if isinstance(got, cudf.Series):
+        got = got.to_arrow()
+
+    if isinstance(item, cp.ndarray):
+        item = cp.asnumpy(item)
+
+    expect = ps.iloc[item]
+    if isinstance(expect, pd.Series):
+        expect = pa.Array.from_pandas(expect)
+        assert pa.Array.equals(expect, got)
+    else:
+        if got is cudf.NA and expect is None:
+            return
+        assert expect == got
+
+
+@pytest.mark.parametrize("bool_", [True, False])
+@pytest.mark.parametrize("data", [["a"], ["a", None], [None]])
+@pytest.mark.parametrize("box", [list, np.array, cp.array])
+def test_string_bool_mask(data, bool_, box):
+    ps = pd.Series(data, dtype="str", name="nice name")
+    gs = cudf.Series(data, dtype="str", name="nice name")
+    item = box([bool_] * len(data))
+
+    got = gs.iloc[item]
+    if isinstance(got, cudf.Series):
+        got = got.to_arrow()
+
+    if isinstance(item, cp.ndarray):
+        item = cp.asnumpy(item)
+
+    expect = ps[item]
+    if isinstance(expect, pd.Series):
+        expect = pa.Array.from_pandas(expect)
+        assert pa.Array.equals(expect, got)
+    else:
+        assert expect == got
diff --git a/python/cudf/cudf/tests/series/indexing/test_setitem.py b/python/cudf/cudf/tests/series/indexing/test_setitem.py
index aba9edcc6c7..2468477bdb7 100644
--- a/python/cudf/cudf/tests/series/indexing/test_setitem.py
+++ b/python/cudf/cudf/tests/series/indexing/test_setitem.py
@@ -1,5 +1,7 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.
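The copy-on-write tests that follow all reduce to a single rule: with the option enabled, a shallow copy shares buffers only until one side is written, at which point the writer unlinks itself; with it disabled, writes remain visible through every shallow copy. A condensed sketch using the same option_context API (illustrative):

import cudf

with cudf.option_context("copy_on_write", True):
    a = cudf.Series([1, 2, 3])
    b = a.copy(deep=False)  # shares a's buffers for now
    a[0] = 99               # a unlinks itself before writing
    assert b[0] == 1        # b still sees the original data

with cudf.option_context("copy_on_write", False):
    a = cudf.Series([1, 2, 3])
    b = a.copy(deep=False)  # plain shallow copy
    a[0] = 99
    assert b[0] == 99       # the write is visible through b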
+import decimal +import cupy as cp import numpy as np import pandas as pd import pyarrow as pa @@ -7,6 +9,7 @@ import cudf from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal, expect_warning_if @@ -416,3 +419,434 @@ def test_struct_setitem(data, item): data[1] = item expected = cudf.Series(data) assert sr.to_arrow() == expected.to_arrow() + + +def test_null_copy(): + col = cudf.Series(range(2049)) + col[:] = None + assert len(col) == 2049 + + +@pytest.mark.parametrize( + "copy_on_write, expected", + [ + (True, [1, 2, 3, 4, 5]), + (False, [1, 100, 3, 4, 5]), + ], +) +def test_series_setitem_cow(copy_on_write, expected): + with cudf.option_context("copy_on_write", copy_on_write): + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + actual[1] = 100 + assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series(expected)) + + +def test_series_setitem_both_slice_cow_on(): + with cudf.option_context("copy_on_write", True): + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + actual[slice(0, 2, 1)] = 100 + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) + + new_copy[slice(2, 4, 1)] = 300 + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + + +def test_series_setitem_both_slice_cow_off(): + with cudf.option_context("copy_on_write", False): + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + actual[slice(0, 2, 1)] = 100 + assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([100, 100, 3, 4, 5])) + + new_copy[slice(2, 4, 1)] = 300 + assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) + assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) + + +def test_series_setitem_partial_slice_cow_on(): + with cudf.option_context("copy_on_write", True): + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + new_copy[slice(2, 4, 1)] = 300 + assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + + new_slice = actual[2:] + assert ( + new_slice._column.base_data.owner == actual._column.base_data.owner + ) + new_slice[0:2] = 10 + assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) + assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) + + +def test_series_setitem_partial_slice_cow_off(): + with cudf.option_context("copy_on_write", False): + actual = cudf.Series([1, 2, 3, 4, 5]) + new_copy = actual.copy(deep=False) + + new_copy[slice(2, 4, 1)] = 300 + assert_eq(actual, cudf.Series([1, 2, 300, 300, 5])) + assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) + + new_slice = actual[2:] + # Since COW is off, a slice should point to the same memory + ptr1 = new_slice._column.base_data.get_ptr(mode="read") + ptr2 = actual._column.base_data.get_ptr(mode="read") + assert ptr1 == ptr2 + + new_slice[0:2] = 10 + assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) + assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) + + +def test_multiple_series_cow(): + with cudf.option_context("copy_on_write", True): + # Verify constructing, modifying, deleting + # multiple copies of a series preserves + # the data appropriately when COW is enabled. 
+        s = cudf.Series([10, 20, 30, 40, 50])
+        s1 = s.copy(deep=False)
+        s2 = s.copy(deep=False)
+        s3 = s.copy(deep=False)
+        s4 = s2.copy(deep=False)
+        s5 = s4.copy(deep=False)
+        s6 = s3.copy(deep=False)
+
+        s1[0:3] = 10000
+        # s1 will be unlinked from the actual data in s
+        # and then modified. All the others should
+        # still contain the original data.
+        assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50]))
+        for ser in [s, s2, s3, s4, s5, s6]:
+            assert_eq(ser, cudf.Series([10, 20, 30, 40, 50]))
+
+        s6[0:3] = 3000
+        # s6 will be unlinked from the actual data in s
+        # and then modified. All the others should
+        # still contain the original data.
+        assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50]))
+        assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50]))
+        for ser in [s2, s3, s4, s5]:
+            assert_eq(ser, cudf.Series([10, 20, 30, 40, 50]))
+
+        s2[1:4] = 4000
+        # s2 will be unlinked from the actual data in s
+        # and then modified. All the others should
+        # still contain the original data.
+        assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50]))
+        assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50]))
+        assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50]))
+        for ser in [s3, s4, s5]:
+            assert_eq(ser, cudf.Series([10, 20, 30, 40, 50]))
+
+        s4[2:4] = 5000
+        # s4 will be unlinked from the actual data in s
+        # and then modified. All the others should
+        # still contain the original data.
+        assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50]))
+        assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50]))
+        assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50]))
+        assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50]))
+        for ser in [s3, s5]:
+            assert_eq(ser, cudf.Series([10, 20, 30, 40, 50]))
+
+        s5[2:4] = 6000
+        # s5 will be unlinked from the actual data in s
+        # and then modified. All the others should
+        # still contain the original data.
+        assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50]))
+        assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50]))
+        assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50]))
+        assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50]))
+        assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50]))
+        for ser in [s3]:
+            assert_eq(ser, cudf.Series([10, 20, 30, 40, 50]))
+
+        s7 = s5.copy(deep=False)
+        assert_eq(s7, cudf.Series([10, 20, 6000, 6000, 50]))
+        s7[1:3] = 55
+        # Making a copy of s5 (s7) and then modifying it
+        # shouldn't touch data in any of the other series.
+        assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50]))
+
+        assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50]))
+        assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50]))
+        assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50]))
+        assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50]))
+        for ser in [s3]:
+            assert_eq(ser, cudf.Series([10, 20, 30, 40, 50]))
+
+        # Deleting any of the following series objects
+        # shouldn't delete the rest of the weakly
+        # referenced data elsewhere.
+ + del s2 + + assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s4 + del s1 + + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s + del s6 + + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s5 + + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + del s3 + assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) + + +def test_series_zero_copy_cow_on(): + with cudf.option_context("copy_on_write", True): + s = cudf.Series([1, 2, 3, 4, 5]) + s1 = s.copy(deep=False) + cp_array = cp.asarray(s) + + # Ensure all original data & zero-copied + # data is same. + assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) + + cp_array[0:3] = 10 + # Modifying a zero-copied array should only + # modify `s` and will leave rest of the copies + # untouched. + + assert_eq(s.to_numpy(), np.array([10, 10, 10, 4, 5])) + assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) + + s2 = cudf.Series(cp_array) + assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) + + s3 = s2.copy(deep=False) + cp_array[0] = 20 + # Modifying a zero-copied array should modify + # `s2` and `s` only. Because `cp_array` + # is zero-copy shared with `s` & `s2`. + + assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) + assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s3, cudf.Series([10, 10, 10, 4, 5])) + + s4 = cudf.Series([10, 20, 30, 40, 50]) + s5 = cudf.Series(s4) + assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) + s5[0:2] = 1 + # Modifying `s5` should also modify `s4` + # because they are zero-copied. + assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) + assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) + + +def test_series_zero_copy_cow_off(): + is_spill_enabled = get_global_manager() is not None + + with cudf.option_context("copy_on_write", False): + s = cudf.Series([1, 2, 3, 4, 5]) + s1 = s.copy(deep=False) + cp_array = cp.asarray(s) + + # Ensure all original data & zero-copied + # data is same. + assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) + assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) + + cp_array[0:3] = 10 + # When COW is off, modifying a zero-copied array + # will need to modify `s` & `s1` since they are + # shallow copied. + + assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([10, 10, 10, 4, 5])) + assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) + + s2 = cudf.Series(cp_array) + assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) + s3 = s2.copy(deep=False) + cp_array[0] = 20 + + # Modifying `cp_array`, will propagate the changes + # across all Series objects, because they are + # either shallow copied or zero-copied. 
+ + assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s1, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) + if not is_spill_enabled: + # Since spilling might make a copy of the data, we cannot + # expect the two series to be a zero-copy of the cupy array + # when spilling is enabled globally. + assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) + assert_eq(s3, cudf.Series([20, 10, 10, 4, 5])) + + s4 = cudf.Series([10, 20, 30, 40, 50]) + s5 = cudf.Series(s4) + assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) + s5[0:2] = 1 + + # Modifying `s5` should also modify `s4` + # because they are zero-copied. + assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) + assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_str_copy(copy_on_write): + with cudf.option_context("copy_on_write", copy_on_write): + s = cudf.Series(["a", "b", "c", "d", "e"]) + s1 = s.copy(deep=True) + s2 = s.copy(deep=True) + + assert_eq(s, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) + + s[0:3] = "abc" + + assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) + + s2[1:4] = "xyz" + + assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) + assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) + assert_eq(s2, cudf.Series(["a", "xyz", "xyz", "xyz", "e"])) + + +@pytest.mark.parametrize("copy_on_write", [True, False]) +def test_series_cat_copy(copy_on_write): + with cudf.option_context("copy_on_write", copy_on_write): + s = cudf.Series([10, 20, 30, 40, 50], dtype="category") + s1 = s.copy(deep=True) + s2 = s1.copy(deep=True) + s3 = s1.copy(deep=True) + + s[0] = 50 + assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + assert_eq(s2, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + assert_eq(s3, cudf.Series([10, 20, 30, 40, 50], dtype="category")) + + s2[3] = 10 + s3[2:5] = 20 + assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) + assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) + assert_eq(s3, cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) + + +@pytest.mark.parametrize( + "data, dtype, item, to, expect", + [ + # scatter to a single index + ( + ["1", "2", "3"], + cudf.Decimal64Dtype(1, 0), + decimal.Decimal(5), + 1, + ["1", "5", "3"], + ), + ( + ["1.5", "2.5", "3.5"], + cudf.Decimal64Dtype(2, 1), + decimal.Decimal("5.5"), + 1, + ["1.5", "5.5", "3.5"], + ), + ( + ["1.0042", "2.0042", "3.0042"], + cudf.Decimal64Dtype(5, 4), + decimal.Decimal("5.0042"), + 1, + ["1.0042", "5.0042", "3.0042"], + ), + # scatter via boolmask + ( + ["1", "2", "3"], + cudf.Decimal64Dtype(1, 0), + decimal.Decimal(5), + [True, False, True], + ["5", "2", "5"], + ), + ( + ["1.5", "2.5", "3.5"], + cudf.Decimal64Dtype(2, 1), + decimal.Decimal("5.5"), + [True, True, True], + ["5.5", "5.5", "5.5"], + ), + ( + ["1.0042", "2.0042", "3.0042"], + cudf.Decimal64Dtype(5, 4), + decimal.Decimal("5.0042"), + [False, False, True], + ["1.0042", "2.0042", "5.0042"], + ), + # We will allow assigning a decimal with less precision + ( + ["1.00", "2.00", "3.00"], + cudf.Decimal64Dtype(3, 2), + decimal.Decimal(5), + 1, + ["1.00", "5.00", "3.00"], + ), + # But not truncation + ( 
+ ["1", "2", "3"], + cudf.Decimal64Dtype(1, 0), + decimal.Decimal("5.5"), + 1, + pa.ArrowInvalid, + ), + # We will allow for setting scalars into decimal columns + (["1", "2", "3"], cudf.Decimal64Dtype(1, 0), 5, 1, ["1", "5", "3"]), + # But not if it has too many digits to fit the precision + (["1", "2", "3"], cudf.Decimal64Dtype(1, 0), 50, 1, pa.ArrowInvalid), + ], +) +def test_series_setitem_decimal(data, dtype, item, to, expect): + data = cudf.Series([decimal.Decimal(x) for x in data], dtype=dtype) + + if expect is pa.ArrowInvalid: + with pytest.raises(expect): + data[to] = item + return + else: + expect = cudf.Series([decimal.Decimal(x) for x in expect], dtype=dtype) + data[to] = item + assert_eq(data, expect) diff --git a/python/cudf/cudf/tests/series/methods/test_argsort.py b/python/cudf/cudf/tests/series/methods/test_argsort.py new file mode 100644 index 00000000000..377f849a97c --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_argsort.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np + +import cudf + + +def test_series_argsort(numeric_types_as_str, ascending): + sr = cudf.Series([1, 3, 2, 5, 4]).astype(numeric_types_as_str) + res = sr.argsort(ascending=ascending) + + if ascending: + expected = np.argsort(sr.to_numpy(), kind="mergesort") + else: + # -1 multiply works around missing desc sort (may promote to float64) + expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort") + np.testing.assert_array_equal(expected, res.to_numpy()) diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py b/python/cudf/cudf/tests/series/methods/test_astype.py index 2a535478af4..8d51d6226bc 100644 --- a/python/cudf/cudf/tests/series/methods/test_astype.py +++ b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -1,7 +1,7 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
- import datetime import zoneinfo +from decimal import Decimal import cupy as cp import numpy as np @@ -10,8 +10,10 @@ import pytest import cudf +from cudf.core.column.decimal import Decimal32Column, Decimal64Column +from cudf.core.column.numerical import NumericalColumn from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal +from cudf.testing._utils import assert_exceptions_equal, expect_warning_if @pytest.mark.parametrize( @@ -557,6 +559,237 @@ def test_datetime_infer_format(data, timezone, datetime_types_as_str): sr.astype(datetime_types_as_str) +def test_string_astype(all_supported_types_as_str): + if all_supported_types_as_str.startswith( + "int" + ) or all_supported_types_as_str.startswith("uint"): + data = ["1", "2", "3", "4", "5"] + elif all_supported_types_as_str.startswith("float"): + data = [ + "1.0", + "2.0", + "3.0", + "4.0", + None, + "5.0", + "nan", + "-INF", + "NaN", + "inF", + "NAn", + ] + elif all_supported_types_as_str.startswith("bool"): + data = ["True", "False", "True", "False", "False"] + elif all_supported_types_as_str.startswith("datetime64"): + data = [ + "2019-06-04T00:00:00", + "2019-06-04T12:12:12", + "2019-06-03T00:00:00", + "2019-05-04T00:00:00", + "2018-06-04T00:00:00", + "1922-07-21T01:02:03", + ] + elif all_supported_types_as_str.startswith("timedelta64"): + data = [ + "1 days 00:00:00", + "2 days 00:00:00", + "3 days 00:00:00", + ] + elif all_supported_types_as_str in {"str", "category"}: + data = ["ab", "cd", "ef", "gh", "ij"] + ps = pd.Series(data) + gs = cudf.Series(data) + + expect = ps.astype(all_supported_types_as_str) + got = gs.astype(all_supported_types_as_str) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data, scale, precision", + [ + (["1.11", "2.22", "3.33"], 2, 3), + (["111", "222", "33"], 0, 3), + (["111000", "22000", "3000"], -3, 3), + ([None, None, None], 0, 5), + ([None, "-2345", None], 0, 5), + ([], 0, 5), + ], +) +@pytest.mark.parametrize( + "decimal_dtype", + [cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype], +) +def test_string_to_decimal(data, scale, precision, decimal_dtype): + gs = cudf.Series(data, dtype="str") + fp = gs.astype(decimal_dtype(scale=scale, precision=precision)) + got = fp.astype("str") + assert_eq(gs, got) + + +def test_string_empty_to_decimal(): + gs = cudf.Series(["", "-85", ""], dtype="str") + got = gs.astype(cudf.Decimal64Dtype(scale=0, precision=5)) + expected = cudf.Series( + [0, -85, 0], + dtype=cudf.Decimal64Dtype(scale=0, precision=5), + ) + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "data, scale, precision", + [ + (["1.23", "-2.34", "3.45"], 2, 3), + (["123", "-234", "345"], 0, 3), + (["12300", "-400", "5000.0"], -2, 5), + ([None, None, None], 0, 5), + ([None, "-100", None], 0, 5), + ([], 0, 5), + ], +) +@pytest.mark.parametrize( + "decimal_dtype", + [cudf.Decimal128Dtype, cudf.Decimal32Dtype, cudf.Decimal64Dtype], +) +def test_string_from_decimal(data, scale, precision, decimal_dtype): + decimal_data = [] + for d in data: + if d is None: + decimal_data.append(None) + else: + decimal_data.append(Decimal(d)) + fp = cudf.Series( + decimal_data, + dtype=decimal_dtype(scale=scale, precision=precision), + ) + gs = fp.astype("str") + got = gs.astype(decimal_dtype(scale=scale, precision=precision)) + assert_eq(fp, got) + + +def test_string_empty_astype(all_supported_types_as_str): + data = [] + ps = pd.Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") + + expect = ps.astype(all_supported_types_as_str) + got = 
gs.astype(all_supported_types_as_str) + + assert_eq(expect, got) + + +def test_string_numeric_astype(numeric_and_temporal_types_as_str): + if numeric_and_temporal_types_as_str.startswith("timedelta64"): + pytest.skip( + f"Test not applicable for {numeric_and_temporal_types_as_str}" + ) + if numeric_and_temporal_types_as_str.startswith("bool"): + data = [1, 0, 1, 0, 1] + elif numeric_and_temporal_types_as_str.startswith( + "int" + ) or numeric_and_temporal_types_as_str.startswith("uint"): + data = [1, 2, 3, 4, 5] + elif numeric_and_temporal_types_as_str.startswith("float"): + data = [1.0, 2.0, 3.0, 4.0, 5.0] + elif numeric_and_temporal_types_as_str.startswith("datetime64"): + # pandas rounds the output format based on the data + # Use numpy instead + # but fix '2011-01-01T00:00:00' -> '2011-01-01 00:00:00' + data = [1000000001, 2000000001, 3000000001, 4000000001, 5000000001] + ps = np.asarray(data, dtype=numeric_and_temporal_types_as_str).astype( + str + ) + ps = np.array([i.replace("T", " ") for i in ps]) + + if not numeric_and_temporal_types_as_str.startswith("datetime64"): + ps = pd.Series(data, dtype=numeric_and_temporal_types_as_str) + + gs = cudf.Series(data, dtype=numeric_and_temporal_types_as_str) + + expect = pd.Series(ps.astype("str")) + got = gs.astype("str") + + assert_eq(expect, got) + + +def test_string_empty_numeric_astype(numeric_and_temporal_types_as_str): + data = [] + + ps = pd.Series(data, dtype=numeric_and_temporal_types_as_str) + gs = cudf.Series(data, dtype=numeric_and_temporal_types_as_str) + + expect = ps.astype("str") + got = gs.astype("str") + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data,dtype", + [ + (["0.1", "10.2", "10.876"], "float"), + (["-0.1", "10.2", "+10.876"], "float"), + (["1", "10.2", "10.876"], "float32"), + (["+123", "6344556789", "0"], "int"), + (["+123", "6344556789", "0"], "uint64"), + (["+123", "6344556789", "0"], "float"), + (["0.1", "-10.2", "10.876", None], "float"), + ], +) +@pytest.mark.parametrize("obj_type", [None, "str", "category"]) +def test_string_typecast(data, obj_type, dtype): + psr = pd.Series(data, dtype=obj_type) + gsr = cudf.Series(data, dtype=obj_type) + + expect = psr.astype(dtype=dtype) + actual = gsr.astype(dtype=dtype) + assert_eq(expect, actual) + + +@pytest.mark.parametrize( + "data,dtype", + [ + (["0.1", "10.2", "10.876"], "int"), + (["1", "10.2", "+10.876"], "int"), + (["abc", "1", "2", " "], "int"), + (["0.1", "10.2", "10.876"], "uint64"), + (["1", "10.2", "+10.876"], "uint64"), + (["abc", "1", "2", " "], "uint64"), + ([" ", "0.1", "2"], "float"), + ([""], "int"), + ([""], "uint64"), + ([" "], "float"), + (["\n"], "int"), + (["\n"], "uint64"), + (["0.1", "-10.2", "10.876", None], "int"), + (["0.1", "-10.2", "10.876", None], "uint64"), + (["0.1", "-10.2", "10.876", None, "ab"], "float"), + (["+", "-"], "float"), + (["+", "-"], "int"), + (["+", "-"], "uint64"), + (["1++++", "--2"], "float"), + (["1++++", "--2"], "int"), + (["1++++", "--2"], "uint64"), + (["++++1", "--2"], "float"), + (["++++1", "--2"], "int"), + (["++++1", "--2"], "uint64"), + ], +) +@pytest.mark.parametrize("obj_type", [None, "str", "category"]) +def test_string_typecast_error(data, obj_type, dtype): + psr = pd.Series(data, dtype=obj_type) + gsr = cudf.Series(data, dtype=obj_type) + + assert_exceptions_equal( + lfunc=psr.astype, + rfunc=gsr.astype, + lfunc_args_and_kwargs=([dtype],), + rfunc_args_and_kwargs=([dtype],), + ) + + @pytest.mark.parametrize("unit", ["ns", "us"]) def test_astype_aware_to_aware(unit): ser = cudf.Series( @@ 
-859,3 +1092,157 @@ def test_series_astype_null_categorical(): expect = cudf.Series([None, None, None], dtype="int32") got = sr.astype("int32") assert_eq(expect, got) + + +@pytest.mark.parametrize("precision, scale", [(7, 2), (11, 4), (18, 9)]) +def test_typecast_from_float_to_decimal( + request, float_types_as_str, precision, scale +): + to_dtype = cudf.Decimal64Dtype(precision, scale) + data = cudf.Series( + [ + 14.12302, + 97938.2, + np.nan, + 0.0, + -8.302014, + np.nan, + 94.31304, + -112.2314, + 0.3333333, + np.nan, + ] + ) + request.applymarker( + pytest.mark.xfail( + float_types_as_str == "float32" and to_dtype.precision > 12, + reason="https://github.com/rapidsai/cudf/issues/14169", + ) + ) + got = data.astype(float_types_as_str) + + pa_arr = got.to_arrow().cast( + pa.decimal128(to_dtype.precision, to_dtype.scale) + ) + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) + + got = got.astype(to_dtype) + + assert_eq(got, expected) + + +@pytest.mark.parametrize("precision, scale", [(9, 3), (11, 4), (18, 9)]) +def test_typecast_from_int_to_decimal(integer_types_as_str, precision, scale): + to_dtype = cudf.Decimal64Dtype(precision, scale) + data = cudf.Series( + [ + 14.12302, + 38.2, + np.nan, + 0.0, + -8.302014, + np.nan, + 94.31304, + np.nan, + -112.2314, + 0.3333333, + np.nan, + ] + ) + got = data.astype(integer_types_as_str) + + pa_arr = ( + got.to_arrow() + .cast("float64") + .cast(pa.decimal128(to_dtype.precision, to_dtype.scale)) + ) + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) + + got = got.astype(to_dtype) + + assert_eq(got, expected) + + +@pytest.mark.parametrize( + "from_dtype", + [ + cudf.Decimal64Dtype(7, 2), + cudf.Decimal64Dtype(11, 4), + cudf.Decimal64Dtype(18, 10), + cudf.Decimal32Dtype(7, 2), + cudf.Decimal32Dtype(5, 3), + cudf.Decimal32Dtype(9, 5), + ], +) +@pytest.mark.parametrize( + "to_dtype", + [ + cudf.Decimal64Dtype(7, 2), + cudf.Decimal64Dtype(18, 10), + cudf.Decimal64Dtype(11, 4), + cudf.Decimal32Dtype(7, 2), + cudf.Decimal32Dtype(9, 5), + cudf.Decimal32Dtype(5, 3), + ], +) +def test_typecast_to_from_decimal(from_dtype, to_dtype): + data = cudf.Series( + [ + 14.12309, + 2.343942, + np.nan, + 0.0, + -8.302082, + np.nan, + 94.31308, + -112.2364, + -8.029972, + np.nan, + ] + ) + if from_dtype.scale > to_dtype.MAX_PRECISION: + pytest.skip( + "This is supposed to overflow because the representation value in " + "the source exceeds the max representable in destination dtype." 
+ ) + s = data.astype(from_dtype) + + pa_arr = s.to_arrow().cast( + pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False + ) + if isinstance(to_dtype, cudf.Decimal32Dtype): + expected = cudf.Series._from_column(Decimal32Column.from_arrow(pa_arr)) + elif isinstance(to_dtype, cudf.Decimal64Dtype): + expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) + + with expect_warning_if(to_dtype.scale < s.dtype.scale, UserWarning): + got = s.astype(to_dtype) + + assert_eq(got, expected) + + +@pytest.mark.parametrize("precision, scale", [(7, 2), (11, 4), (17, 10)]) +def test_typecast_from_decimal(precision, scale, signed_integer_types_as_str): + from_dtype = cudf.Decimal64Dtype(precision, scale) + data = cudf.Series( + [ + 14.12309, + 2.343942, + np.nan, + 0.0, + -8.302082, + np.nan, + 94.31308, + -112.2364, + -8.029972, + np.nan, + ] + ) + got = data.astype(from_dtype) + pa_arr = got.to_arrow().cast(signed_integer_types_as_str, safe=False) + + got = got.astype(signed_integer_types_as_str) + expected = cudf.Series._from_column(NumericalColumn.from_arrow(pa_arr)) + + assert_eq(got, expected) + assert_eq(got.dtype, expected.dtype) diff --git a/python/cudf/cudf/tests/series/methods/test_nlargest_nsmallest.py b/python/cudf/cudf/tests/series/methods/test_nlargest_nsmallest.py index 329c7f96602..3a60e7abdf9 100644 --- a/python/cudf/cudf/tests/series/methods/test_nlargest_nsmallest.py +++ b/python/cudf/cudf/tests/series/methods/test_nlargest_nsmallest.py @@ -1,8 +1,10 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import pandas as pd import pytest import cudf +from cudf.testing import assert_eq from cudf.testing._utils import ( assert_exceptions_equal, ) @@ -16,3 +18,40 @@ def test_series_nlargest_nsmallest_str_error(attr): assert_exceptions_equal( getattr(gs, attr), getattr(ps, attr), ([], {"n": 1}), ([], {"n": 1}) ) + + +@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) +@pytest.mark.parametrize("n", [-100, -2, 0, 1, 4]) +def test_series_nlargest(data, n): + """Indirectly tests Series.sort_values()""" + sr = cudf.Series(data) + psr = pd.Series(data) + assert_eq(sr.nlargest(n), psr.nlargest(n)) + assert_eq(sr.nlargest(n, keep="last"), psr.nlargest(n, keep="last")) + + assert_exceptions_equal( + lfunc=psr.nlargest, + rfunc=sr.nlargest, + lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), + rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), + ) + + +@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) +@pytest.mark.parametrize("n", [-100, -2, 0, 1, 4]) +def test_series_nsmallest(data, n): + """Indirectly tests Series.sort_values()""" + sr = cudf.Series(data) + psr = pd.Series(data) + assert_eq(sr.nsmallest(n), psr.nsmallest(n)) + assert_eq( + sr.nsmallest(n, keep="last").sort_index(), + psr.nsmallest(n, keep="last").sort_index(), + ) + + assert_exceptions_equal( + lfunc=psr.nsmallest, + rfunc=sr.nsmallest, + lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), + rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), + ) diff --git a/python/cudf/cudf/tests/series/methods/test_repeat.py b/python/cudf/cudf/tests/series/methods/test_repeat.py new file mode 100644 index 00000000000..dcf87331857 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_repeat.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
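test_typecast_from_decimal above builds its expected values with pyarrow's unsafe cast, which discards the fractional part of each decimal instead of refusing the conversion the way a safe cast does. A small sketch of the difference (illustrative; assumes pyarrow's decimal-to-integer cast semantics):

from decimal import Decimal

import pyarrow as pa

arr = pa.array([Decimal("2.34"), Decimal("94.31")], type=pa.decimal128(4, 2))

# The default safe cast refuses to drop the fractional digits...
try:
    arr.cast(pa.int64())
except pa.ArrowInvalid:
    pass

# ...while safe=False truncates them away.
assert arr.cast(pa.int64(), safe=False).to_pylist() == [2, 94]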
+ +import numpy as np +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_repeat(all_supported_types_as_str): + rng = np.random.default_rng(seed=0) + arr = rng.random(10) * 10 + repeats = rng.integers(10, size=10) + psr = pd.Series(arr).astype(all_supported_types_as_str) + gsr = cudf.from_pandas(psr) + + assert_eq(psr.repeat(repeats), gsr.repeat(repeats)) + + +def test_repeat_scalar(numeric_types_as_str): + rng = np.random.default_rng(seed=0) + arr = rng.random(10) * 10 + repeats = 10 + psr = pd.Series(arr).astype(numeric_types_as_str) + gsr = cudf.from_pandas(psr) + + assert_eq(psr.repeat(repeats), gsr.repeat(repeats)) diff --git a/python/cudf/cudf/tests/series/methods/test_sort_values.py b/python/cudf/cudf/tests/series/methods/test_sort_values.py new file mode 100644 index 00000000000..1ce6a55087f --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_sort_values.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020-2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", [["z", "1", "a"], ["c", None, "b"], [None] * 3] +) +def test_string_sort(data, ascending): + ps = pd.Series(data, dtype="str", name="nice name") + gs = cudf.Series(data, dtype="str", name="nice name") + + expect = ps.sort_values(ascending=ascending) + got = gs.sort_values(ascending=ascending) + + assert_eq(expect, got) + + +def test_series_sort_values_ignore_index(ignore_index): + gsr = cudf.Series([1, 3, 5, 2, 4]) + psr = gsr.to_pandas() + + expect = psr.sort_values(ignore_index=ignore_index) + got = gsr.sort_values(ignore_index=ignore_index) + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/series/methods/test_to_pandas.py b/python/cudf/cudf/tests/series/methods/test_to_pandas.py index 9013675c191..7c7ca5c1ff4 100644 --- a/python/cudf/cudf/tests/series/methods/test_to_pandas.py +++ b/python/cudf/cudf/tests/series/methods/test_to_pandas.py @@ -248,3 +248,22 @@ def test_to_from_pandas_nulls(nulls): got = gdf_data.to_pandas() assert_eq(expect, got) + + +@pytest.mark.parametrize("data", [["a"], ["a", None], [None]]) +def test_string_export(data): + ps = pd.Series(data, dtype="str", name="nice name") + gs = cudf.Series(data, dtype="str", name="nice name") + + expect = ps + got = gs.to_pandas() + assert_eq(expect, got) + + expect = np.array(ps) + got = gs.to_numpy() + assert_eq(expect, got) + + expect = pa.Array.from_pandas(ps) + got = gs.to_arrow() + + assert pa.Array.equals(expect, got) diff --git a/python/cudf/cudf/tests/series/methods/test_unique.py b/python/cudf/cudf/tests/series/methods/test_unique.py index 8346388ad78..257c57aa1e7 100644 --- a/python/cudf/cudf/tests/series/methods/test_unique.py +++ b/python/cudf/cudf/tests/series/methods/test_unique.py @@ -84,3 +84,23 @@ def test_series_nunique(request, nan_as_null, dropna): expect = pd_series.nunique(dropna=dropna) got = cudf_series.nunique(dropna=dropna) assert expect == got + + +@pytest.mark.parametrize( + "item", + [ + ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], + ["a", "a", "a", "a", "A"], + ["A"], + ["abc", "xyz", None, "ab", "123"], + [None, None, "abc", None, "abc"], + ], +) +def test_string_unique(item): + ps = pd.Series(item) + gs = cudf.Series(item) + # Pandas `unique` returns a numpy array + pres = pd.Series(ps.unique()) + # cudf returns a cudf.Series + gres = gs.unique() + assert_eq(pres, gres) diff --git a/python/cudf/cudf/tests/series/test_binops.py b/python/cudf/cudf/tests/series/test_binops.py index 
1c288a48e90..d24dab0a012 100644 --- a/python/cudf/cudf/tests/series/test_binops.py +++ b/python/cudf/cudf/tests/series/test_binops.py @@ -1,13 +1,16 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import datetime +import decimal import operator +import cupy as cp +import numpy as np import pandas as pd import pytest import cudf -from cudf.testing._utils import ( - assert_exceptions_equal, -) +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal @pytest.mark.parametrize( @@ -17,19 +20,839 @@ "sr2", [pd.Series([], dtype="float64"), pd.Series(["a", "a", "c", "z", "A"])], ) +def test_series_error_equality(sr1, sr2, comparison_op): + gsr1 = cudf.from_pandas(sr1) + gsr2 = cudf.from_pandas(sr2) + + assert_exceptions_equal( + comparison_op, comparison_op, ([sr1, sr2],), ([gsr1, gsr2],) + ) + + @pytest.mark.parametrize( - "op", + "data,other", [ - operator.eq, - operator.ne, - operator.lt, - operator.gt, - operator.le, - operator.ge, + ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), + ([1000000, 200000, None], [1000000, 200000, None]), + ([], []), + ([None], [None]), + ( + [12, 12, 22, 343, 4353534, 435342], + [12, 12, 22, 343, 4353534, 435342], + ), + ([1000000, 200000, 3000000], [200000, 34543, 3000000]), + ([1000000, 200000, None], [1000000, 200000, 3000000]), + ([None], [1]), + ( + [12, 12, 22, 343, 4353534, 435342], + [None, 1, 220, 3, 34, 4353423287], + ), + (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), + (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), ], ) -def test_series_error_equality(sr1, sr2, op): - gsr1 = cudf.from_pandas(sr1) - gsr2 = cudf.from_pandas(sr2) +def test_timedelta_ops_misc_inputs( + data, other, timedelta_types_as_str, binary_op_method +): + if binary_op_method in {"mul", "rmul", "pow", "rpow"}: + pytest.skip(f"Test not applicable for {binary_op_method}") + gsr = cudf.Series(data, dtype=timedelta_types_as_str) + other_gsr = cudf.Series(other, dtype=timedelta_types_as_str) + + psr = gsr.to_pandas() + other_psr = other_gsr.to_pandas() + + expected = getattr(psr, binary_op_method)(other_psr) + actual = getattr(gsr, binary_op_method)(other_gsr) + if binary_op_method in ("eq", "lt", "gt", "le", "ge"): + actual = actual.fillna(False) + elif binary_op_method == "ne": + actual = actual.fillna(True) + + if binary_op_method == "floordiv": + expected[actual.isna().to_pandas()] = np.nan + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "datetime_data,timedelta_data", + [ + ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), + ([1000000, 200000, None], [1000000, 200000, None]), + ([], []), + ([None], [None]), + ( + [12, 12, 22, 343, 4353534, 435342], + [12, 12, 22, 343, 4353534, 435342], + ), + ([1000000, 200000, 3000000], [200000, 34543, 3000000]), + ([1000000, 200000, None], [1000000, 200000, 3000000]), + ([None], [1]), + ( + [12, 12, 22, 343, 4353534, 435342], + [None, 1, 220, 3, 34, 4353423287], + ), + (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), + (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), + ( + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [11, 1132324, 2322323111, 23341, 2434, 332, 323], + ), + ( + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [11, 1132324, 2322323111, 23341, 2434, 332, 323], + ), + ( + [11, 1132324, 2322323111, 23341, 2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ), + ( + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 
2343.241, 23432.4, 23234], + ), + ], +) +@pytest.mark.parametrize("ops", ["add", "sub"]) +def test_timedelta_ops_datetime_inputs( + datetime_types_as_str, + timedelta_types_as_str, + datetime_data, + timedelta_data, + ops, +): + gsr_datetime = cudf.Series(datetime_data, dtype=datetime_types_as_str) + gsr_timedelta = cudf.Series(timedelta_data, dtype=timedelta_types_as_str) + + psr_datetime = gsr_datetime.to_pandas() + psr_timedelta = gsr_timedelta.to_pandas() + + expected = getattr(psr_datetime, ops)(psr_timedelta) + actual = getattr(gsr_datetime, ops)(gsr_timedelta) + + assert_eq(expected, actual) + + if ops == "add": + expected = getattr(psr_timedelta, ops)(psr_datetime) + actual = getattr(gsr_timedelta, ops)(gsr_datetime) + + assert_eq(expected, actual) + elif ops == "sub": + assert_exceptions_equal( + lfunc=operator.sub, + rfunc=operator.sub, + lfunc_args_and_kwargs=([psr_timedelta, psr_datetime],), + rfunc_args_and_kwargs=([gsr_timedelta, gsr_datetime],), + ) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame( + { + "A": pd.Series(pd.date_range("2012-1-1", periods=3, freq="D")), + "B": pd.Series( + pd.timedelta_range(start="1 day", periods=3, freq="D") + ), + } + ), + pd.DataFrame( + { + "A": pd.Series( + pd.date_range("1994-1-1", periods=10, freq="D") + ), + "B": pd.Series( + pd.timedelta_range(start="1 day", periods=10, freq="D") + ), + } + ), + ], +) +@pytest.mark.parametrize("op", ["add", "sub"]) +def test_timedelta_dataframe_ops(df, op): + pdf = df + gdf = cudf.from_pandas(pdf) + + if op == "add": + pdf["C"] = pdf["A"] + pdf["B"] + gdf["C"] = gdf["A"] + gdf["B"] + elif op == "sub": + pdf["C"] = pdf["A"] - pdf["B"] + gdf["C"] = gdf["A"] - gdf["B"] + + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +@pytest.mark.parametrize( + "other_scalars", + [ + datetime.timedelta(days=768), + datetime.timedelta(seconds=768), + datetime.timedelta(microseconds=7), + datetime.timedelta(minutes=447), + datetime.timedelta(hours=447), + datetime.timedelta(weeks=734), + np.timedelta64(4, "s"), + np.timedelta64(456, "D"), + np.timedelta64(46, "h"), + np.timedelta64("nat"), + np.timedelta64(1, "s"), + np.timedelta64(1, "ms"), + np.timedelta64(1, "us"), + np.timedelta64(1, "ns"), + ], +) +def test_timedelta_series_ops_with_scalars( + data, other_scalars, timedelta_types_as_str, arithmetic_op_method, request +): + if arithmetic_op_method in { + "mul", + "rmul", + "rtruediv", + "pow", + "rpow", + "radd", + "rsub", + "rfloordiv", + "rmod", + }: + pytest.skip(f"Test not applicable for {arithmetic_op_method}") + gsr = cudf.Series(data=data, dtype=timedelta_types_as_str) + psr = gsr.to_pandas() + + if arithmetic_op_method == "add": + expected = psr + other_scalars + actual = gsr + other_scalars + elif arithmetic_op_method == "sub": + expected = psr - other_scalars + actual = gsr - other_scalars + elif arithmetic_op_method == "truediv": + expected = psr / other_scalars + actual = gsr / other_scalars + elif arithmetic_op_method == "floordiv": + expected = psr // other_scalars + actual = gsr // 
other_scalars + elif arithmetic_op_method == "mod": + expected = psr % other_scalars + actual = gsr % other_scalars + + assert_eq(expected, actual) + + if arithmetic_op_method == "add": + expected = other_scalars + psr + actual = other_scalars + gsr + elif arithmetic_op_method == "sub": + expected = other_scalars - psr + actual = other_scalars - gsr + elif arithmetic_op_method == "truediv": + expected = other_scalars / psr + actual = other_scalars / gsr + elif arithmetic_op_method == "floordiv": + expected = other_scalars // psr + actual = other_scalars // gsr + elif arithmetic_op_method == "mod": + expected = other_scalars % psr + actual = other_scalars % gsr + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "reverse", + [ + False, + pytest.param( + True, + marks=pytest.mark.xfail( + strict=True, + reason=( + "timedelta modulo by zero is dubiously defined in " + "both pandas and cuDF " + "(see https://github.com/rapidsai/cudf/issues/5938)" + ), + ), + ), + ], +) +def test_timedelta_series_mod_with_scalar_zero(reverse): + gsr = cudf.Series(data=[0.2434], dtype=np.timedelta64(1, "ns")) + psr = gsr.to_pandas() + scalar = datetime.timedelta(days=768) + if reverse: + expected = scalar % psr + actual = scalar % gsr + else: + expected = psr % scalar + actual = gsr % scalar + assert_eq(expected, actual) + + +def test_timedelta_invalid_ops(): + sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") + psr = sr.to_pandas() + + assert_exceptions_equal( + lfunc=operator.add, + rfunc=operator.add, + lfunc_args_and_kwargs=([psr, 1],), + rfunc_args_and_kwargs=([sr, 1],), + ) + + assert_exceptions_equal( + lfunc=operator.add, + rfunc=operator.add, + lfunc_args_and_kwargs=([psr, "a"],), + rfunc_args_and_kwargs=([sr, "a"],), + ) + + dt_sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + dt_psr = dt_sr.to_pandas() + + assert_exceptions_equal( + lfunc=operator.mod, + rfunc=operator.mod, + lfunc_args_and_kwargs=([psr, dt_psr],), + rfunc_args_and_kwargs=([sr, dt_sr],), + ) + + assert_exceptions_equal( + lfunc=operator.mod, + rfunc=operator.mod, + lfunc_args_and_kwargs=([psr, "a"],), + rfunc_args_and_kwargs=([sr, "a"],), + check_exception_type=False, + ) + + assert_exceptions_equal( + lfunc=operator.gt, + rfunc=operator.gt, + lfunc_args_and_kwargs=([psr, dt_psr],), + rfunc_args_and_kwargs=([sr, dt_sr],), + ) + + assert_exceptions_equal( + lfunc=operator.lt, + rfunc=operator.lt, + lfunc_args_and_kwargs=([psr, dt_psr],), + rfunc_args_and_kwargs=([sr, dt_sr],), + ) + + assert_exceptions_equal( + lfunc=operator.ge, + rfunc=operator.ge, + lfunc_args_and_kwargs=([psr, dt_psr],), + rfunc_args_and_kwargs=([sr, dt_sr],), + ) + + assert_exceptions_equal( + lfunc=operator.le, + rfunc=operator.le, + lfunc_args_and_kwargs=([psr, dt_psr],), + rfunc_args_and_kwargs=([sr, dt_sr],), + ) + + assert_exceptions_equal( + lfunc=operator.truediv, + rfunc=operator.truediv, + lfunc_args_and_kwargs=([psr, dt_psr],), + rfunc_args_and_kwargs=([sr, dt_sr],), + ) + + assert_exceptions_equal( + lfunc=operator.floordiv, + rfunc=operator.floordiv, + lfunc_args_and_kwargs=([psr, dt_psr],), + rfunc_args_and_kwargs=([sr, dt_sr],), + ) + + assert_exceptions_equal( + lfunc=operator.mul, + rfunc=operator.mul, + lfunc_args_and_kwargs=([psr, dt_psr],), + rfunc_args_and_kwargs=([sr, dt_sr],), + ) + + assert_exceptions_equal( + lfunc=operator.mul, + rfunc=operator.mul, + lfunc_args_and_kwargs=([psr, psr],), + rfunc_args_and_kwargs=([sr, sr],), + check_exception_type=False, + ) + + assert_exceptions_equal( + lfunc=operator.xor, + 
rfunc=operator.xor, + lfunc_args_and_kwargs=([psr, psr],), + rfunc_args_and_kwargs=([sr, sr],), + ) + + +@pytest.mark.parametrize("op", [operator.add, operator.sub]) +def test_timdelta_binop_tz_timestamp(op): + s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") + pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") + with pytest.raises(NotImplementedError): + op(s, pd_tz_timestamp) + date_tz_scalar = datetime.datetime.now(datetime.timezone.utc) + with pytest.raises(NotImplementedError): + op(s, date_tz_scalar) + + +def test_timedelta_series_cmpops_pandas_compatibility(comparison_op): + gsr1 = cudf.Series( + data=[123, 456, None, 321, None], dtype="timedelta64[ns]" + ) + psr1 = gsr1.to_pandas() + + gsr2 = cudf.Series( + data=[123, 456, 789, None, None], dtype="timedelta64[ns]" + ) + psr2 = gsr2.to_pandas() + + expect = comparison_op(psr1, psr2) + with cudf.option_context("mode.pandas_compatible", True): + got = comparison_op(gsr1, gsr2) + + assert_eq(expect, got) + + +def test_string_equality(): + data1 = ["b", "c", "d", "a", "c"] + data2 = ["a", None, "c", "a", "c"] + + ps1 = pd.Series(data1) + ps2 = pd.Series(data2) + gs1 = cudf.Series(data1) + gs2 = cudf.Series(data2) + + expect = ps1 == ps2 + got = gs1 == gs2 + + assert_eq(expect, got.fillna(False)) + + expect = ps1 == "m" + got = gs1 == "m" + + assert_eq(expect, got.fillna(False)) + + ps1 = pd.Series(["a"]) + gs1 = cudf.Series(["a"]) + + expect = ps1 == "m" + got = gs1 == "m" + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "lhs", + [ + ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], + ["abc", "xyz", "a", "ab", "123", "097"], + ], +) +@pytest.mark.parametrize( + "rhs", + [ + ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], + ["a", "a", "a", "a", "A", "z"], + ], +) +def test_string_binary_op_add(lhs, rhs): + pds = pd.Series(lhs) + pd.Series(rhs) + gds = cudf.Series(lhs) + cudf.Series(rhs) + + assert_eq(pds, gds) + + +def test_concatenate_rows_of_lists(): + pser = pd.Series([["a", "a"], ["b"], ["c"]]) + gser = cudf.Series([["a", "a"], ["b"], ["c"]]) + + expect = pser + pser + got = gser + gser + + assert_eq(expect, got) + + +def test_concatenate_list_with_nonlist(): + gser1 = cudf.Series([["a", "c"], ["b", "d"], ["c", "d"]]) + gser2 = cudf.Series(["a", "b", "c"]) + with pytest.raises(TypeError): + gser1 + gser2 + + +def test_datetime_series_binops_pandas( + datetime_types_as_str, datetime_types_as_str2 +): + dti = pd.date_range("20010101", "20020215", freq="400h", name="times") + pd_data_1 = pd.Series(dti) + pd_data_2 = pd_data_1 + gdf_data_1 = cudf.Series(pd_data_1).astype(datetime_types_as_str) + gdf_data_2 = cudf.Series(pd_data_2).astype(datetime_types_as_str2) + assert_eq(pd_data_1, gdf_data_1.astype("datetime64[ns]")) + assert_eq(pd_data_2, gdf_data_2.astype("datetime64[ns]")) + assert_eq(pd_data_1 < pd_data_2, gdf_data_1 < gdf_data_2) + assert_eq(pd_data_1 > pd_data_2, gdf_data_1 > gdf_data_2) + assert_eq(pd_data_1 == pd_data_2, gdf_data_1 == gdf_data_2) + assert_eq(pd_data_1 <= pd_data_2, gdf_data_1 <= gdf_data_2) + assert_eq(pd_data_1 >= pd_data_2, gdf_data_1 >= gdf_data_2) + + +def test_datetime_series_binops_numpy( + datetime_types_as_str, datetime_types_as_str2 +): + dti = pd.date_range("20010101", "20020215", freq="400h", name="times") + pd_data_1 = pd.Series(dti) + pd_data_2 = pd_data_1 + gdf_data_1 = cudf.Series(pd_data_1).astype(datetime_types_as_str) + gdf_data_2 = cudf.Series(pd_data_2).astype(datetime_types_as_str2) + np_data_1 = np.array(pd_data_1).astype(datetime_types_as_str) + np_data_2 = 
np.array(pd_data_2).astype(datetime_types_as_str2) + np.testing.assert_equal(np_data_1, gdf_data_1.to_numpy()) + np.testing.assert_equal(np_data_2, gdf_data_2.to_numpy()) + np.testing.assert_equal( + np.less(np_data_1, np_data_2), (gdf_data_1 < gdf_data_2).to_numpy() + ) + np.testing.assert_equal( + np.greater(np_data_1, np_data_2), (gdf_data_1 > gdf_data_2).to_numpy() + ) + np.testing.assert_equal( + np.equal(np_data_1, np_data_2), (gdf_data_1 == gdf_data_2).to_numpy() + ) + np.testing.assert_equal( + np.less_equal(np_data_1, np_data_2), + (gdf_data_1 <= gdf_data_2).to_numpy(), + ) + np.testing.assert_equal( + np.greater_equal(np_data_1, np_data_2), + (gdf_data_1 >= gdf_data_2).to_numpy(), + ) + + +@pytest.mark.parametrize( + "data", + [ + pd.date_range("20010101", "20020215", freq="400h", name="times"), + pd.date_range( + "20010101", freq="243434324423423234ns", name="times", periods=10 + ), + ], +) +def test_dt_ops(data): + pd_data = pd.Series(data) + gdf_data = cudf.Series(data) + + assert_eq(pd_data == pd_data, gdf_data == gdf_data) + assert_eq(pd_data < pd_data, gdf_data < gdf_data) + assert_eq(pd_data > pd_data, gdf_data > gdf_data) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4, 10, 100, 20000], + [None] * 7, + [10, 20, 30, None, 100, 200, None], + [3223.234, 342.2332, 23423.23, 3343.23324, 23432.2323, 242.23, 233], + ], +) +@pytest.mark.parametrize( + "other", + [ + [1, 2, 3, 4, 10, 100, 20000], + [None] * 7, + [10, 20, 30, None, 100, 200, None], + [3223.234, 342.2332, 23423.23, 3343.23324, 23432.2323, 242.23, 233], + datetime.datetime(1993, 6, 22, 13, 30), + datetime.datetime(2005, 1, 22, 10, 00), + np.datetime64("2005-02"), + np.datetime64("2005-02-25"), + np.datetime64("2005-02-25T03:30"), + np.datetime64("nat"), + # TODO: https://github.com/pandas-dev/pandas/issues/52295 + ], +) +def test_datetime_subtract( + data, other, datetime_types_as_str, datetime_types_as_str2 +): + gsr = cudf.Series(data, dtype=datetime_types_as_str) + psr = gsr.to_pandas() + + if isinstance(other, np.datetime64): + gsr_other = other + psr_other = other + else: + gsr_other = cudf.Series(other, dtype=datetime_types_as_str2) + psr_other = gsr_other.to_pandas() + + expected = psr - psr_other + actual = gsr - gsr_other + + assert_eq(expected, actual) + + expected = psr_other - psr + actual = gsr_other - gsr + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + [1000000, 200000, 3000000], + [1000000, 200000, None], + [], + [None], + [None, None, None, None, None], + [12, 12, 22, 343, 4353534, 435342], + np.array([10, 20, 30, None, 100]), + cp.asarray([10, 20, 30, 100]), + [1000000, 200000, 3000000], + [1000000, 200000, None], + [1], + [12, 11, 232, 223432411, 2343241, 234324, 23234], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], + [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], + ], +) +@pytest.mark.parametrize( + "other_scalars", + [ + datetime.timedelta(days=768), + datetime.timedelta(seconds=768), + datetime.timedelta(microseconds=7), + datetime.timedelta(minutes=447), + datetime.timedelta(hours=447), + datetime.timedelta(weeks=734), + np.timedelta64(4, "s"), + np.timedelta64(456, "D"), + np.timedelta64(46, "h"), + np.timedelta64("nat"), + np.timedelta64(1, "s"), + np.timedelta64(1, "ms"), + np.timedelta64(1, "us"), + np.timedelta64(1, "ns"), + ], +) +@pytest.mark.parametrize("op", ["add", "sub"]) +def test_datetime_series_ops_with_scalars( + data, other_scalars, datetime_types_as_str, op +): 
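+    # Illustrative sketch, using values drawn from the parametrization
+    # above: with data=[1] and dtype="datetime64[s]", the "add" branch
+    # expects
+    #     cudf.Series([1], dtype="datetime64[s]") + np.timedelta64(4, "s")
+    # to match pandas and yield 1970-01-01 00:00:05, i.e. the scalar
+    # timedelta is broadcast across the whole datetime column.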
+ gsr = cudf.Series(data=data, dtype=datetime_types_as_str) + psr = gsr.to_pandas() + + if op == "add": + expected = psr + other_scalars + actual = gsr + other_scalars + elif op == "sub": + expected = psr - other_scalars + actual = gsr - other_scalars + + assert_eq(expected, actual) + + if op == "add": + expected = other_scalars + psr + actual = other_scalars + gsr + + assert_eq(expected, actual) + + elif op == "sub": + assert_exceptions_equal( + lfunc=operator.sub, + rfunc=operator.sub, + lfunc_args_and_kwargs=([other_scalars, psr],), + rfunc_args_and_kwargs=([other_scalars, gsr],), + ) + + +@pytest.mark.parametrize("data", ["20110101", "20120101", "20130101"]) +@pytest.mark.parametrize("other_scalars", ["20110101", "20120101", "20130101"]) +def test_datetime_series_cmpops_with_scalars( + data, other_scalars, datetime_types_as_str, comparison_op +): + gsr = cudf.Series(data=data, dtype=datetime_types_as_str) + psr = gsr.to_pandas() + + expect = comparison_op(psr, other_scalars) + got = comparison_op(gsr, other_scalars) + + assert_eq(expect, got) + + +def test_datetime_invalid_ops(): + sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + psr = sr.to_pandas() + + assert_exceptions_equal( + lfunc=operator.add, + rfunc=operator.add, + lfunc_args_and_kwargs=([psr, pd.Timestamp(1513393355.5, unit="s")],), + rfunc_args_and_kwargs=([sr, pd.Timestamp(1513393355.5, unit="s")],), + ) + + assert_exceptions_equal( + lfunc=operator.truediv, + rfunc=operator.truediv, + lfunc_args_and_kwargs=([psr, pd.Timestamp(1513393355.5, unit="s")],), + rfunc_args_and_kwargs=([sr, pd.Timestamp(1513393355.5, unit="s")],), + ) + + assert_exceptions_equal( + lfunc=operator.add, + rfunc=operator.add, + lfunc_args_and_kwargs=([psr, psr],), + rfunc_args_and_kwargs=([sr, sr],), + ) + + assert_exceptions_equal( + lfunc=operator.floordiv, + rfunc=operator.floordiv, + lfunc_args_and_kwargs=([psr, psr],), + rfunc_args_and_kwargs=([sr, sr],), + ) + + assert_exceptions_equal( + lfunc=operator.floordiv, + rfunc=operator.floordiv, + lfunc_args_and_kwargs=([psr, pd.Timestamp(1513393355.5, unit="s")],), + rfunc_args_and_kwargs=([sr, pd.Timestamp(1513393355.5, unit="s")],), + ) + + assert_exceptions_equal( + lfunc=operator.add, + rfunc=operator.add, + lfunc_args_and_kwargs=([psr, 1],), + rfunc_args_and_kwargs=([sr, 1],), + ) + + assert_exceptions_equal( + lfunc=operator.truediv, + rfunc=operator.truediv, + lfunc_args_and_kwargs=([psr, "a"],), + rfunc_args_and_kwargs=([sr, "a"],), + ) + + assert_exceptions_equal( + lfunc=operator.mul, + rfunc=operator.mul, + lfunc_args_and_kwargs=([psr, 1],), + rfunc_args_and_kwargs=([sr, 1],), + ) + + +def test_datetime_binop_tz_timestamp(comparison_op): + s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") + with pytest.raises(NotImplementedError): + comparison_op(s, pd_tz_timestamp) + + date_scalar = datetime.datetime.now(datetime.timezone.utc) + with pytest.raises(NotImplementedError): + comparison_op(s, date_scalar) + + +def test_datetime_series_cmpops_pandas_compatibility(comparison_op): + data1 = ["20110101", "20120101", None, "20140101", None] + data2 = ["20110101", "20120101", "20130101", None, None] + gsr1 = cudf.Series(data=data1, dtype="datetime64[ns]") + psr1 = gsr1.to_pandas() + + gsr2 = cudf.Series(data=data2, dtype="datetime64[ns]") + psr2 = gsr2.to_pandas() + + expect = comparison_op(psr1, psr2) + with cudf.option_context("mode.pandas_compatible", True): + got = comparison_op(gsr1, gsr2) + + assert_eq(expect, 
got) + + +def test_decimal_overflow(): + s = cudf.Series( + [decimal.Decimal("0.0009384233522166997927180531650178250")] + ) + result = s * s + assert_eq(cudf.Decimal128Dtype(precision=38, scale=37), result.dtype) + + s = cudf.Series([1, 2], dtype=cudf.Decimal128Dtype(precision=38, scale=0)) + result = s * decimal.Decimal("1.0") + assert_eq(cudf.Decimal128Dtype(precision=38, scale=1), result.dtype) + + +def test_decimal_binop_upcast_operands(): + ser1 = cudf.Series([0.51, 1.51, 2.51]).astype(cudf.Decimal64Dtype(18, 2)) + ser2 = cudf.Series([0.90, 0.96, 0.99]).astype(cudf.Decimal128Dtype(19, 2)) + result = ser1 + ser2 + expected = cudf.Series([1.41, 2.47, 3.50]).astype( + cudf.Decimal128Dtype(20, 2) + ) + assert_eq(result, expected) + + +def test_categorical_compare_ordered(): + cat1 = pd.Categorical( + ["a", "a", "b", "c", "a"], categories=["a", "b", "c"], ordered=True + ) + pdsr1 = pd.Series(cat1) + sr1 = cudf.Series(cat1) + cat2 = pd.Categorical( + ["a", "b", "a", "c", "b"], categories=["a", "b", "c"], ordered=True + ) + pdsr2 = pd.Series(cat2) + sr2 = cudf.Series(cat2) + + # test equal + out = sr1 == sr1 + assert out.dtype == np.bool_ + assert type(out[0]) is np.bool_ + assert np.all(out.to_numpy()) + assert np.all(pdsr1 == pdsr1) + + # test inequality + out = sr1 != sr1 + assert not np.any(out.to_numpy()) + assert not np.any(pdsr1 != pdsr1) + + assert pdsr1.cat.ordered + assert sr1.cat.ordered + + # test using ordered operators + np.testing.assert_array_equal(pdsr1 < pdsr2, (sr1 < sr2).to_numpy()) + np.testing.assert_array_equal(pdsr1 > pdsr2, (sr1 > sr2).to_numpy()) + + +def test_categorical_binary_add(): + cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) + pdsr = pd.Series(cat) + sr = cudf.Series(cat) - assert_exceptions_equal(op, op, ([sr1, sr2],), ([gsr1, gsr2],)) + assert_exceptions_equal( + lfunc=operator.add, + rfunc=operator.add, + lfunc_args_and_kwargs=([pdsr, pdsr],), + rfunc_args_and_kwargs=([sr, sr],), + ) diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index e9ccee3a3ae..6ad6a1a41cb 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -1361,6 +1361,34 @@ def test_timezone_pyarrow_array(): assert_eq(result, expected) +def test_string_ingest(one_dimensional_array_types): + expect = ["a", "a", "b", "c", "a"] + data = one_dimensional_array_types(expect) + got = cudf.Series(data) + assert got.dtype == np.dtype("object") + assert len(got) == 5 + for idx, val in enumerate(expect): + assert expect[idx] == got[idx] + + +def test_decimal_invalid_precision(): + with pytest.raises(pa.ArrowInvalid): + cudf.Series([10, 20, 30], dtype=cudf.Decimal64Dtype(2, 2)) + + with pytest.raises(pa.ArrowInvalid): + cudf.Series([decimal.Decimal("300")], dtype=cudf.Decimal64Dtype(2, 1)) + + +@pytest.mark.parametrize( + "input_obj", [[decimal.Decimal(1), cudf.NA, decimal.Decimal(3)]] +) +def test_series_construction_decimals_with_nulls(input_obj): + expect = pa.array(input_obj, from_pandas=True) + got = cudf.Series(input_obj).to_arrow() + + assert expect.equals(got) + + @pytest.mark.parametrize( "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"] ) diff --git a/python/cudf/cudf/tests/series/test_np_ufuncs.py b/python/cudf/cudf/tests/series/test_np_ufuncs.py index c43f73b7e1f..450bc06dd46 100644 --- a/python/cudf/cudf/tests/series/test_np_ufuncs.py +++ b/python/cudf/cudf/tests/series/test_np_ufuncs.py @@ -1,5 +1,5 @@ # 
Copyright (c) 2025, NVIDIA CORPORATION.
-
+import datetime
 import operator
 from functools import reduce
 
@@ -208,3 +208,49 @@ def test_ufunc_cudf_series_error_with_out_kwarg():
     cudf_s3 = cudf.Series(data=[0, 0, 0, 0])
     with pytest.raises(TypeError):
         np.add(x1=cudf_s1, x2=cudf_s2, out=cudf_s3)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1000000, 200000, 3000000],
+        [1000000, 200000, None],
+        [],
+        [None],
+        [None, None, None, None, None],
+        [12, 12, 22, 343, 4353534, 435342],
+        np.array([10, 20, 30, None, 100]),
+        cp.asarray([10, 20, 30, 100]),
+        [1000000, 200000, 3000000],
+        [1000000, 200000, None],
+        [1],
+        [12, 11, 232, 223432411, 2343241, 234324, 23234],
+        [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234],
+        [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323],
+        [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234],
+    ],
+)
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        datetime.timedelta(days=768),
+        datetime.timedelta(seconds=768),
+        datetime.timedelta(microseconds=7),
+        np.timedelta64("nat"),
+        np.timedelta64(1, "s"),
+        np.timedelta64(1, "ms"),
+        np.timedelta64(1, "us"),
+        np.timedelta64(1, "ns"),
+    ],
+)
+@pytest.mark.parametrize("op", [np.add, np.subtract])
+def test_datetime_series_ops_with_scalars_misc(
+    data, scalar, datetime_types_as_str, op
+):
+    gsr = cudf.Series(data=data, dtype=datetime_types_as_str)
+    psr = gsr.to_pandas()
+
+    expect = op(psr, scalar)
+    got = op(gsr, scalar)
+
+    assert_eq(expect, got)
diff --git a/python/cudf/cudf/tests/series/test_repr.py b/python/cudf/cudf/tests/series/test_repr.py
index 2e1e9888ff4..1bf01cd94da 100644
--- a/python/cudf/cudf/tests/series/test_repr.py
+++ b/python/cudf/cudf/tests/series/test_repr.py
@@ -519,3 +519,23 @@ def test_empty_series_name():
     gs = cudf.from_pandas(ps)
 
     assert repr(ps) == repr(gs)
+
+
+@pytest.mark.parametrize("item", [0, slice(0, 1)])
+@pytest.mark.parametrize("data", [["a"], ["a", None], [None]])
+def test_string_repr(data, item, request):
+    if data == [None]:
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="Missing value repr should be <NA> instead of None",
+            )
+        )
+    ps = pd.Series(data, dtype="str", name="nice name")
+    gs = cudf.Series(data, dtype="str", name="nice name")
+
+    got_out = gs.iloc[item]
+    expect_out = ps.iloc[item]
+
+    expect = str(expect_out)
+    got = str(got_out)
+    assert expect == got
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index c5e2f05fcd9..bfa21b35bbd 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -111,51 +111,6 @@ def test_categorical_compare_unordered():
     )
 
 
-def test_categorical_compare_ordered():
-    cat1 = pd.Categorical(
-        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"], ordered=True
-    )
-    pdsr1 = pd.Series(cat1)
-    sr1 = cudf.Series(cat1)
-    cat2 = pd.Categorical(
-        ["a", "b", "a", "c", "b"], categories=["a", "b", "c"], ordered=True
-    )
-    pdsr2 = pd.Series(cat2)
-    sr2 = cudf.Series(cat2)
-
-    # test equal
-    out = sr1 == sr1
-    assert out.dtype == np.bool_
-    assert type(out[0]) is np.bool_
-    assert np.all(out.to_numpy())
-    assert np.all(pdsr1 == pdsr1)
-
-    # test inequality
-    out = sr1 != sr1
-    assert not np.any(out.to_numpy())
-    assert not np.any(pdsr1 != pdsr1)
-
-    assert pdsr1.cat.ordered
-    assert sr1.cat.ordered
-
-    # test using ordered operators
-    np.testing.assert_array_equal(pdsr1 < pdsr2, (sr1 < sr2).to_numpy())
-    np.testing.assert_array_equal(pdsr1 > pdsr2, (sr1 > sr2).to_numpy())
-
-
-def test_categorical_binary_add():
-    cat = pd.Categorical(["a", "a",
"b", "c", "a"], categories=["a", "b", "c"]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([pdsr, pdsr],), - rfunc_args_and_kwargs=([sr, sr],), - ) - - def test_categorical_element_indexing(): """ Element indexing to a cat column must give the underlying object diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py deleted file mode 100644 index 7db558335fe..00000000000 --- a/python/cudf/cudf/tests/test_contains.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. - - -import numpy as np -import pytest - -import cudf -from cudf import Series -from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -def test_lists_contains(dtype): - dtype = cudf.dtype(dtype) - inner_data = np.array([1, 2, 3], dtype=dtype) - - data = Series([inner_data]) - - contained_scalar = inner_data.dtype.type(2) - not_contained_scalar = inner_data.dtype.type(42) - - assert data.list.contains(contained_scalar)[0] - assert not data.list.contains(not_contained_scalar)[0] - - -@pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES) -def test_lists_contains_datetime(dtype): - dtype = cudf.dtype(dtype) - inner_data = np.array([1, 2, 3]) - - unit, _ = np.datetime_data(dtype) - - data = Series([inner_data]) - - contained_scalar = inner_data.dtype.type(2) - not_contained_scalar = inner_data.dtype.type(42) - - assert data.list.contains(contained_scalar)[0] - assert not data.list.contains(not_contained_scalar)[0] - - -def test_lists_contains_bool(): - data = Series([[True, True, True]]) - - contained_scalar = True - not_contained_scalar = False - - assert data.list.contains(contained_scalar)[0] - assert not data.list.contains(not_contained_scalar)[0] diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py deleted file mode 100644 index dc19c52715a..00000000000 --- a/python/cudf/cudf/tests/test_copying.py +++ /dev/null @@ -1,439 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
- -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Series -from cudf.core.buffer.spill_manager import get_global_manager -from cudf.testing import assert_eq -from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES - -pytestmark = pytest.mark.spilling - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES) -def test_repeat(dtype): - rng = np.random.default_rng(seed=0) - arr = rng.random(10) * 10 - repeats = rng.integers(10, size=10) - psr = pd.Series(arr).astype(dtype) - gsr = cudf.from_pandas(psr) - - assert_eq(psr.repeat(repeats), gsr.repeat(repeats)) - - -def test_repeat_index(): - rng = np.random.default_rng(seed=0) - arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] - psr = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) - gsr = cudf.from_pandas(psr) - repeats = rng.integers(10, size=4) - - assert_eq(psr.repeat(repeats), gsr.repeat(repeats)) - - -def test_repeat_dataframe(): - rng = np.random.default_rng(seed=0) - psr = pd.DataFrame({"a": [1, 1, 2, 2]}) - gsr = cudf.from_pandas(psr) - repeats = rng.integers(10, size=4) - - # pd.DataFrame doesn't have repeat() so as a workaround, we are - # comparing pd.Series.repeat() with cudf.DataFrame.repeat()['a'] - assert_eq(psr["a"].repeat(repeats), gsr.repeat(repeats)["a"]) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -def test_repeat_scalar(dtype): - rng = np.random.default_rng(seed=0) - arr = rng.random(10) * 10 - repeats = 10 - psr = pd.Series(arr).astype(dtype) - gsr = cudf.from_pandas(psr) - - assert_eq(psr.repeat(repeats), gsr.repeat(repeats)) - - -def test_null_copy(): - col = Series(np.arange(2049)) - col[:] = None - assert len(col) == 2049 - - -def test_series_setitem_cow_on(): - with cudf.option_context("copy_on_write", True): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - actual[1] = 100 - assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) - - -def test_series_setitem_cow_off(): - with cudf.option_context("copy_on_write", False): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - actual[1] = 100 - assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 100, 3, 4, 5])) - - -def test_series_setitem_both_slice_cow_on(): - with cudf.option_context("copy_on_write", True): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - actual[slice(0, 2, 1)] = 100 - assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) - - new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) - - -def test_series_setitem_both_slice_cow_off(): - with cudf.option_context("copy_on_write", False): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - actual[slice(0, 2, 1)] = 100 - assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([100, 100, 3, 4, 5])) - - new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) - assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) - - -def test_series_setitem_partial_slice_cow_on(): - with cudf.option_context("copy_on_write", True): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 
300, 300, 5])) - - new_slice = actual[2:] - assert ( - new_slice._column.base_data.owner == actual._column.base_data.owner - ) - new_slice[0:2] = 10 - assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) - assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) - - -def test_series_setitem_partial_slice_cow_off(): - with cudf.option_context("copy_on_write", False): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([1, 2, 300, 300, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) - - new_slice = actual[2:] - # Since COW is off, a slice should point to the same memory - ptr1 = new_slice._column.base_data.get_ptr(mode="read") - ptr2 = actual._column.base_data.get_ptr(mode="read") - assert ptr1 == ptr2 - - new_slice[0:2] = 10 - assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) - assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) - - -def test_multiple_series_cow(): - with cudf.option_context("copy_on_write", True): - # Verify constructing, modifying, deleting - # multiple copies of a series preserves - # the data appropriately when COW is enabled. - s = cudf.Series([10, 20, 30, 40, 50]) - s1 = s.copy(deep=False) - s2 = s.copy(deep=False) - s3 = s.copy(deep=False) - s4 = s2.copy(deep=False) - s5 = s4.copy(deep=False) - s6 = s3.copy(deep=False) - - s1[0:3] = 10000 - # s1 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - for ser in [s, s2, s3, s4, s5, s6]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s6[0:3] = 3000 - # s6 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s2, s3, s4, s5]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s2[1:4] = 4000 - # s2 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s3, s4, s5]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s4[2:4] = 5000 - # s4 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) - assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s3, s5]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s5[2:4] = 6000 - # s5 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) - assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) - assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s3]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s7 = s5.copy(deep=False) - assert_eq(s7, cudf.Series([10, 20, 6000, 6000, 50])) - s7[1:3] = 55 - # Making a copy of s5, i.e., s7 and modifying shouldn't - # be touching/modifying data in other series. 
- assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) - assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s3]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - # Deleting any of the following series objects - # shouldn't delete rest of the weekly referenced data - # elsewhere. - - del s2 - - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) - assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) - assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - del s4 - del s1 - - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) - assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - del s - del s6 - - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) - assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - del s5 - - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - del s3 - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - -def test_series_zero_copy_cow_on(): - with cudf.option_context("copy_on_write", True): - s = cudf.Series([1, 2, 3, 4, 5]) - s1 = s.copy(deep=False) - cp_array = cp.asarray(s) - - # Ensure all original data & zero-copied - # data is same. - assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) - - cp_array[0:3] = 10 - # Modifying a zero-copied array should only - # modify `s` and will leave rest of the copies - # untouched. - - assert_eq(s.to_numpy(), np.array([10, 10, 10, 4, 5])) - assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) - - s2 = cudf.Series(cp_array) - assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) - - s3 = s2.copy(deep=False) - cp_array[0] = 20 - # Modifying a zero-copied array should modify - # `s2` and `s` only. Because `cp_array` - # is zero-copy shared with `s` & `s2`. - - assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) - assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s3, cudf.Series([10, 10, 10, 4, 5])) - - s4 = cudf.Series([10, 20, 30, 40, 50]) - s5 = cudf.Series(s4) - assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) - s5[0:2] = 1 - # Modifying `s5` should also modify `s4` - # because they are zero-copied. - assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) - assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) - - -def test_series_zero_copy_cow_off(): - is_spill_enabled = get_global_manager() is not None - - with cudf.option_context("copy_on_write", False): - s = cudf.Series([1, 2, 3, 4, 5]) - s1 = s.copy(deep=False) - cp_array = cp.asarray(s) - - # Ensure all original data & zero-copied - # data is same. - assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) - - cp_array[0:3] = 10 - # When COW is off, modifying a zero-copied array - # will need to modify `s` & `s1` since they are - # shallow copied. 
- - assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([10, 10, 10, 4, 5])) - assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) - - s2 = cudf.Series(cp_array) - assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) - s3 = s2.copy(deep=False) - cp_array[0] = 20 - - # Modifying `cp_array`, will propagate the changes - # across all Series objects, because they are - # either shallow copied or zero-copied. - - assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) - if not is_spill_enabled: - # Since spilling might make a copy of the data, we cannot - # expect the two series to be a zero-copy of the cupy array - # when spilling is enabled globally. - assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s3, cudf.Series([20, 10, 10, 4, 5])) - - s4 = cudf.Series([10, 20, 30, 40, 50]) - s5 = cudf.Series(s4) - assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) - s5[0:2] = 1 - - # Modifying `s5` should also modify `s4` - # because they are zero-copied. - assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) - assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) - - -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_series_str_copy(copy_on_write): - with cudf.option_context("copy_on_write", copy_on_write): - s = cudf.Series(["a", "b", "c", "d", "e"]) - s1 = s.copy(deep=True) - s2 = s.copy(deep=True) - - assert_eq(s, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) - - s[0:3] = "abc" - - assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) - assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) - - s2[1:4] = "xyz" - - assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) - assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s2, cudf.Series(["a", "xyz", "xyz", "xyz", "e"])) - - -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_series_cat_copy(copy_on_write): - with cudf.option_context("copy_on_write", copy_on_write): - s = cudf.Series([10, 20, 30, 40, 50], dtype="category") - s1 = s.copy(deep=True) - s2 = s1.copy(deep=True) - s3 = s1.copy(deep=True) - - s[0] = 50 - assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) - assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype="category")) - assert_eq(s2, cudf.Series([10, 20, 30, 40, 50], dtype="category")) - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50], dtype="category")) - - s2[3] = 10 - s3[2:5] = 20 - assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) - assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) - assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) - assert_eq(s3, cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) - - -def test_dataframe_cow_slice_setitem(): - with cudf.option_context("copy_on_write", True): - df = cudf.DataFrame( - {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} - ) - slice_df = df[1:4] - - assert_eq( - slice_df, - cudf.DataFrame( - {"a": [11, 12, 13], "b": [30, 40, 50]}, index=[1, 2, 3] - ), - ) - - slice_df["a"][2] = 1111 - - assert_eq( - slice_df, - cudf.DataFrame( - {"a": [11, 1111, 13], "b": [30, 40, 50]}, index=[1, 2, 3] - ), - ) - assert_eq( - df, - cudf.DataFrame( - {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} - ), - ) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py deleted file mode 100644 index 
b1678889eff..00000000000 --- a/python/cudf/cudf/tests/test_datetime.py +++ /dev/null @@ -1,422 +0,0 @@ -# Copyright (c) 2019-2025, NVIDIA CORPORATION. - -import datetime -import operator - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Series -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_VERSION, -) -from cudf.testing import assert_eq -from cudf.testing._utils import ( - DATETIME_TYPES, - assert_exceptions_equal, -) - - -@pytest.fixture( - params=[ - operator.lt, - operator.gt, - operator.le, - operator.ge, - operator.eq, - operator.ne, - ] -) -def op(request): - return request.param - - -@pytest.fixture( - params=[ - pd.date_range("20010101", "20020215", freq="400h", name="times"), - pd.date_range( - "20010101", freq="243434324423423234ns", name="times", periods=10 - ), - ] -) -def data(request): - return request.param - - -@pytest.mark.parametrize( - "lhs_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -@pytest.mark.parametrize( - "rhs_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_datetime_series_binops_pandas(lhs_dtype, rhs_dtype): - pd_data_1 = pd.Series( - pd.date_range("20010101", "20020215", freq="400h", name="times") - ) - pd_data_2 = pd.Series( - pd.date_range("20010101", "20020215", freq="401h", name="times") - ) - gdf_data_1 = Series(pd_data_1).astype(lhs_dtype) - gdf_data_2 = Series(pd_data_2).astype(rhs_dtype) - assert_eq(pd_data_1, gdf_data_1.astype("datetime64[ns]")) - assert_eq(pd_data_2, gdf_data_2.astype("datetime64[ns]")) - assert_eq(pd_data_1 < pd_data_2, gdf_data_1 < gdf_data_2) - assert_eq(pd_data_1 > pd_data_2, gdf_data_1 > gdf_data_2) - assert_eq(pd_data_1 == pd_data_2, gdf_data_1 == gdf_data_2) - assert_eq(pd_data_1 <= pd_data_2, gdf_data_1 <= gdf_data_2) - assert_eq(pd_data_1 >= pd_data_2, gdf_data_1 >= gdf_data_2) - - -@pytest.mark.parametrize( - "lhs_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -@pytest.mark.parametrize( - "rhs_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_datetime_series_binops_numpy(lhs_dtype, rhs_dtype): - pd_data_1 = pd.Series( - pd.date_range("20010101", "20020215", freq="400h", name="times") - ) - pd_data_2 = pd.Series( - pd.date_range("20010101", "20020215", freq="401h", name="times") - ) - gdf_data_1 = Series(pd_data_1).astype(lhs_dtype) - gdf_data_2 = Series(pd_data_2).astype(rhs_dtype) - np_data_1 = np.array(pd_data_1).astype(lhs_dtype) - np_data_2 = np.array(pd_data_2).astype(rhs_dtype) - np.testing.assert_equal(np_data_1, gdf_data_1.to_numpy()) - np.testing.assert_equal(np_data_2, gdf_data_2.to_numpy()) - np.testing.assert_equal( - np.less(np_data_1, np_data_2), (gdf_data_1 < gdf_data_2).to_numpy() - ) - np.testing.assert_equal( - np.greater(np_data_1, np_data_2), (gdf_data_1 > gdf_data_2).to_numpy() - ) - np.testing.assert_equal( - np.equal(np_data_1, np_data_2), (gdf_data_1 == gdf_data_2).to_numpy() - ) - np.testing.assert_equal( - np.less_equal(np_data_1, np_data_2), - (gdf_data_1 <= gdf_data_2).to_numpy(), - ) - np.testing.assert_equal( - np.greater_equal(np_data_1, np_data_2), - (gdf_data_1 >= gdf_data_2).to_numpy(), - ) - - -def test_dt_ops(data): - pd_data = pd.Series(data) - gdf_data = Series(data) - - assert_eq(pd_data == pd_data, gdf_data == gdf_data) - assert_eq(pd_data < pd_data, gdf_data < gdf_data) - assert_eq(pd_data > pd_data, gdf_data > gdf_data) - - 
-@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas.", -) -def test_datetime_can_cast_safely(): - sr = cudf.Series( - ["1679-01-01", "2000-01-31", "2261-01-01"], dtype="datetime64[ms]" - ) - assert sr._column.can_cast_safely(np.dtype("datetime64[ns]")) - - sr = cudf.Series( - ["1677-01-01", "2000-01-31", "2263-01-01"], dtype="datetime64[ms]" - ) - - assert sr._column.can_cast_safely(np.dtype("datetime64[ns]")) is False - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 10, 100, 20000], - [None] * 7, - [10, 20, 30, None, 100, 200, None], - [3223.234, 342.2332, 23423.23, 3343.23324, 23432.2323, 242.23, 233], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 10, 100, 20000], - [None] * 7, - [10, 20, 30, None, 100, 200, None], - [3223.234, 342.2332, 23423.23, 3343.23324, 23432.2323, 242.23, 233], - datetime.datetime(1993, 6, 22, 13, 30), - datetime.datetime(2005, 1, 22, 10, 00), - np.datetime64("2005-02"), - np.datetime64("2005-02-25"), - np.datetime64("2005-02-25T03:30"), - np.datetime64("nat"), - # TODO: https://github.com/pandas-dev/pandas/issues/52295 - ], -) -@pytest.mark.parametrize("data_dtype", DATETIME_TYPES) -@pytest.mark.parametrize("other_dtype", DATETIME_TYPES) -def test_datetime_subtract(data, other, data_dtype, other_dtype): - gsr = cudf.Series(data, dtype=data_dtype) - psr = gsr.to_pandas() - - if isinstance(other, np.datetime64): - gsr_other = other - psr_other = other - else: - gsr_other = cudf.Series(other, dtype=other_dtype) - psr_other = gsr_other.to_pandas() - - expected = psr - psr_other - actual = gsr - gsr_other - - assert_eq(expected, actual) - - expected = psr_other - psr - actual = gsr_other - gsr - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize( - "other_scalars", - [ - datetime.timedelta(days=768), - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - datetime.timedelta(minutes=447), - datetime.timedelta(hours=447), - datetime.timedelta(weeks=734), - np.timedelta64(4, "s"), - np.timedelta64(456, "D"), - np.timedelta64(46, "h"), - np.timedelta64("nat"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize( - "op", - ["add", "sub"], -) -def test_datetime_series_ops_with_scalars(data, other_scalars, dtype, op): - gsr = cudf.Series(data=data, dtype=dtype) - psr = gsr.to_pandas() - - if op == "add": - expected = psr + other_scalars - actual = gsr + other_scalars - elif op == "sub": - expected = psr - other_scalars - actual = gsr - other_scalars - - assert_eq(expected, actual) - - if op == "add": - expected = other_scalars + psr - actual = other_scalars + gsr - - assert_eq(expected, actual) - - elif op == "sub": - assert_exceptions_equal( - lfunc=operator.sub, - rfunc=operator.sub, - lfunc_args_and_kwargs=([other_scalars, psr],), - rfunc_args_and_kwargs=([other_scalars, 
gsr],), - ) - - -@pytest.mark.parametrize("data", ["20110101", "20120101", "20130101"]) -@pytest.mark.parametrize("other_scalars", ["20110101", "20120101", "20130101"]) -@pytest.mark.parametrize( - "dtype", - ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"], -) -def test_datetime_series_cmpops_with_scalars(data, other_scalars, dtype, op): - gsr = cudf.Series(data=data, dtype=dtype) - psr = gsr.to_pandas() - - expect = op(psr, other_scalars) - got = op(gsr, other_scalars) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize( - "scalar", - [ - datetime.timedelta(days=768), - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - np.timedelta64("nat"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize("op", [np.add, np.subtract]) -def test_datetime_series_ops_with_scalars_misc(data, scalar, dtype, op): - gsr = cudf.Series(data=data, dtype=dtype) - psr = gsr.to_pandas() - - expect = op(psr, scalar) - got = op(gsr, scalar) - - assert_eq(expect, got) - - -def test_datetime_invalid_ops(): - sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - psr = sr.to_pandas() - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, pd.Timestamp(1513393355.5, unit="s")],), - rfunc_args_and_kwargs=([sr, pd.Timestamp(1513393355.5, unit="s")],), - ) - - assert_exceptions_equal( - lfunc=operator.truediv, - rfunc=operator.truediv, - lfunc_args_and_kwargs=([psr, pd.Timestamp(1513393355.5, unit="s")],), - rfunc_args_and_kwargs=([sr, pd.Timestamp(1513393355.5, unit="s")],), - ) - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, psr],), - rfunc_args_and_kwargs=([sr, sr],), - ) - - assert_exceptions_equal( - lfunc=operator.floordiv, - rfunc=operator.floordiv, - lfunc_args_and_kwargs=([psr, psr],), - rfunc_args_and_kwargs=([sr, sr],), - ) - - assert_exceptions_equal( - lfunc=operator.floordiv, - rfunc=operator.floordiv, - lfunc_args_and_kwargs=([psr, pd.Timestamp(1513393355.5, unit="s")],), - rfunc_args_and_kwargs=([sr, pd.Timestamp(1513393355.5, unit="s")],), - ) - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, 1],), - rfunc_args_and_kwargs=([sr, 1],), - ) - - assert_exceptions_equal( - lfunc=operator.truediv, - rfunc=operator.truediv, - lfunc_args_and_kwargs=([psr, "a"],), - rfunc_args_and_kwargs=([sr, "a"],), - ) - - assert_exceptions_equal( - lfunc=operator.mul, - rfunc=operator.mul, - lfunc_args_and_kwargs=([psr, 1],), - rfunc_args_and_kwargs=([sr, 1],), - ) - - -def test_datetime_binop_tz_timestamp(op): - s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") - with pytest.raises(NotImplementedError): - op(s, pd_tz_timestamp) - - date_scalar = 
datetime.datetime.now(datetime.timezone.utc) - with pytest.raises(NotImplementedError): - op(s, date_scalar) - - -def test_datetime_series_cmpops_pandas_compatibility(op): - data1 = ["20110101", "20120101", None, "20140101", None] - data2 = ["20110101", "20120101", "20130101", None, None] - gsr1 = cudf.Series(data=data1, dtype="datetime64[ns]") - psr1 = gsr1.to_pandas() - - gsr2 = cudf.Series(data=data2, dtype="datetime64[ns]") - psr2 = gsr2.to_pandas() - - expect = op(psr1, psr2) - with cudf.option_context("mode.pandas_compatible", True): - got = op(gsr1, gsr2) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "method, kwargs", - [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], -) -def test_dti_reduction(method, kwargs): - pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") - cudf_dti = cudf.from_pandas(pd_dti) - - result = getattr(cudf_dti, method)(**kwargs) - expected = getattr(pd_dti, method)(**kwargs) - assert result == expected diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py deleted file mode 100644 index 2cb16f71011..00000000000 --- a/python/cudf/cudf/tests/test_decimal.py +++ /dev/null @@ -1,373 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. - -import decimal -from decimal import Decimal - -import numpy as np -import pyarrow as pa -import pytest - -import cudf -from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype -from cudf.testing import assert_eq -from cudf.testing._utils import ( - FLOAT_TYPES, - INTEGER_TYPES, - SIGNED_TYPES, - expect_warning_if, -) - - -@pytest.mark.parametrize( - "data_", - [ - [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [1], - [-1], - [1, 2, 3, 4], - [42, 17, 41], - [1, 2, None, 4], - [None, None, None], - [], - ], -) -@pytest.mark.parametrize( - "typ_", - [ - pa.decimal128(precision=4, scale=2), - pa.decimal128(precision=5, scale=3), - pa.decimal128(precision=6, scale=4), - ], -) -@pytest.mark.parametrize("col", [Decimal32Column, Decimal64Column]) -def test_round_trip_decimal_column(data_, typ_, col): - pa_arr = pa.array(data_, type=typ_) - col_32 = col.from_arrow(pa_arr) - assert pa_arr.equals(col_32.to_arrow()) - - -def test_from_arrow_max_precision_decimal64(): - with pytest.raises(ValueError): - Decimal64Column.from_arrow( - pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19)) - ) - - -def test_from_arrow_max_precision_decimal32(): - with pytest.raises(ValueError): - Decimal32Column.from_arrow( - pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=10)) - ) - - -@pytest.mark.parametrize("from_dtype", FLOAT_TYPES) -@pytest.mark.parametrize( - "to_dtype", - [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)], -) -def test_typecast_from_float_to_decimal(request, from_dtype, to_dtype): - data = cudf.Series( - [ - 14.12302, - 97938.2, - np.nan, - 0.0, - -8.302014, - np.nan, - 94.31304, - -112.2314, - 0.3333333, - np.nan, - ] - ) - request.applymarker( - pytest.mark.xfail( - from_dtype == np.dtype("float32") and to_dtype.precision > 12, - reason="https://github.com/rapidsai/cudf/issues/14169", - ) - ) - got = data.astype(from_dtype) - - pa_arr = got.to_arrow().cast( - pa.decimal128(to_dtype.precision, to_dtype.scale) - ) - expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) - - got = got.astype(to_dtype) - - assert_eq(got, expected) - - 
-@pytest.mark.parametrize("from_dtype", INTEGER_TYPES) -@pytest.mark.parametrize( - "to_dtype", - [Decimal64Dtype(9, 3), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)], -) -def test_typecast_from_int_to_decimal(from_dtype, to_dtype): - data = cudf.Series( - [ - 14.12302, - 38.2, - np.nan, - 0.0, - -8.302014, - np.nan, - 94.31304, - np.nan, - -112.2314, - 0.3333333, - np.nan, - ] - ) - got = data.astype(from_dtype) - - pa_arr = ( - got.to_arrow() - .cast("float64") - .cast(pa.decimal128(to_dtype.precision, to_dtype.scale)) - ) - expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) - - got = got.astype(to_dtype) - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "from_dtype", - [ - Decimal64Dtype(7, 2), - Decimal64Dtype(11, 4), - Decimal64Dtype(18, 10), - Decimal32Dtype(7, 2), - Decimal32Dtype(5, 3), - Decimal32Dtype(9, 5), - ], -) -@pytest.mark.parametrize( - "to_dtype", - [ - Decimal64Dtype(7, 2), - Decimal64Dtype(18, 10), - Decimal64Dtype(11, 4), - Decimal32Dtype(7, 2), - Decimal32Dtype(9, 5), - Decimal32Dtype(5, 3), - ], -) -def test_typecast_to_from_decimal(from_dtype, to_dtype): - data = cudf.Series( - [ - 14.12309, - 2.343942, - np.nan, - 0.0, - -8.302082, - np.nan, - 94.31308, - -112.2364, - -8.029972, - np.nan, - ] - ) - if from_dtype.scale > to_dtype.MAX_PRECISION: - pytest.skip( - "This is supposed to overflow because the representation value in " - "the source exceeds the max representable in destination dtype." - ) - s = data.astype(from_dtype) - - pa_arr = s.to_arrow().cast( - pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False - ) - if isinstance(to_dtype, Decimal32Dtype): - expected = cudf.Series._from_column(Decimal32Column.from_arrow(pa_arr)) - elif isinstance(to_dtype, Decimal64Dtype): - expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) - - with expect_warning_if(to_dtype.scale < s.dtype.scale, UserWarning): - got = s.astype(to_dtype) - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "from_dtype", - [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(17, 10)], -) -@pytest.mark.parametrize("to_dtype", SIGNED_TYPES) -def test_typecast_from_decimal(from_dtype, to_dtype): - data = cudf.Series( - [ - 14.12309, - 2.343942, - np.nan, - 0.0, - -8.302082, - np.nan, - 94.31308, - -112.2364, - -8.029972, - np.nan, - ] - ) - got = data.astype(from_dtype) - pa_arr = got.to_arrow().cast(to_dtype, safe=False) - - got = got.astype(to_dtype) - expected = cudf.Series._from_column(NumericalColumn.from_arrow(pa_arr)) - - assert_eq(got, expected) - assert_eq(got.dtype, expected.dtype) - - -@pytest.mark.parametrize( - "data, dtype, item, to, expect", - [ - # scatter to a single index - ( - ["1", "2", "3"], - Decimal64Dtype(1, 0), - Decimal(5), - 1, - ["1", "5", "3"], - ), - ( - ["1.5", "2.5", "3.5"], - Decimal64Dtype(2, 1), - Decimal("5.5"), - 1, - ["1.5", "5.5", "3.5"], - ), - ( - ["1.0042", "2.0042", "3.0042"], - Decimal64Dtype(5, 4), - Decimal("5.0042"), - 1, - ["1.0042", "5.0042", "3.0042"], - ), - # scatter via boolmask - ( - ["1", "2", "3"], - Decimal64Dtype(1, 0), - Decimal(5), - [True, False, True], - ["5", "2", "5"], - ), - ( - ["1.5", "2.5", "3.5"], - Decimal64Dtype(2, 1), - Decimal("5.5"), - [True, True, True], - ["5.5", "5.5", "5.5"], - ), - ( - ["1.0042", "2.0042", "3.0042"], - Decimal64Dtype(5, 4), - Decimal("5.0042"), - [False, False, True], - ["1.0042", "2.0042", "5.0042"], - ), - # We will allow assigning a decimal with less precision - ( - ["1.00", "2.00", "3.00"], - Decimal64Dtype(3, 2), - 
Decimal(5), - 1, - ["1.00", "5.00", "3.00"], - ), - # But not truncation - ( - ["1", "2", "3"], - Decimal64Dtype(1, 0), - Decimal("5.5"), - 1, - pa.lib.ArrowInvalid, - ), - # We will allow for setting scalars into decimal columns - (["1", "2", "3"], Decimal64Dtype(1, 0), 5, 1, ["1", "5", "3"]), - # But not if it has too many digits to fit the precision - (["1", "2", "3"], Decimal64Dtype(1, 0), 50, 1, pa.lib.ArrowInvalid), - ], -) -def test_series_setitem_decimal(data, dtype, item, to, expect): - data = cudf.Series([Decimal(x) for x in data], dtype=dtype) - - if expect is pa.lib.ArrowInvalid: - with pytest.raises(expect): - data[to] = item - return - else: - expect = cudf.Series([Decimal(x) for x in expect], dtype=dtype) - data[to] = item - assert_eq(data, expect) - - -@pytest.mark.parametrize( - "input_obj", [[decimal.Decimal(1), cudf.NA, decimal.Decimal(3)]] -) -def test_series_construction_with_nulls(input_obj): - expect = pa.array(input_obj, from_pandas=True) - got = cudf.Series(input_obj).to_arrow() - - assert expect == got - - -@pytest.mark.parametrize( - "data", - [ - [(["1", "2", "3"], cudf.Decimal64Dtype(1, 0))], - [ - (["1", "2", "3"], cudf.Decimal64Dtype(1, 0)), - (["1.0", "2.0", "3.0"], cudf.Decimal64Dtype(2, 1)), - (["10.1", "20.2", "30.3"], cudf.Decimal64Dtype(3, 1)), - ], - [ - (["1", None, "3"], cudf.Decimal64Dtype(1, 0)), - (["1.0", "2.0", None], cudf.Decimal64Dtype(2, 1)), - ([None, "20.2", "30.3"], cudf.Decimal64Dtype(3, 1)), - ], - ], -) -def test_serialize_decimal_columns(data): - df = cudf.DataFrame( - { - str(i): cudf.Series( - [Decimal(x) if x is not None else x for x in values], - dtype=dtype, - ) - for i, (values, dtype) in enumerate(data) - } - ) - recreated = df.__class__.deserialize(*df.serialize()) - assert_eq(recreated, df) - - -def test_decimal_invalid_precision(): - with pytest.raises(pa.ArrowInvalid): - _ = cudf.Series([10, 20, 30], dtype=cudf.Decimal64Dtype(2, 2)) - - with pytest.raises(pa.ArrowInvalid): - _ = cudf.Series([Decimal("300")], dtype=cudf.Decimal64Dtype(2, 1)) - - -def test_decimal_overflow(): - s = cudf.Series([Decimal("0.0009384233522166997927180531650178250")]) - result = s * s - assert_eq(cudf.Decimal128Dtype(precision=38, scale=37), result.dtype) - - s = cudf.Series([1, 2], dtype=cudf.Decimal128Dtype(precision=38, scale=0)) - result = s * Decimal("1.0") - assert_eq(cudf.Decimal128Dtype(precision=38, scale=1), result.dtype) - - -def test_decimal_binop_upcast_operands(): - ser1 = cudf.Series([0.51, 1.51, 2.51]).astype(cudf.Decimal64Dtype(18, 2)) - ser2 = cudf.Series([0.90, 0.96, 0.99]).astype(cudf.Decimal128Dtype(19, 2)) - result = ser1 + ser2 - expected = cudf.Series([1.41, 2.47, 3.50]).astype( - cudf.Decimal128Dtype(20, 2) - ) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py deleted file mode 100644 index 039bb7cbf9c..00000000000 --- a/python/cudf/cudf/tests/test_list.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
- - -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -def test_concatenate_rows_of_lists(): - pdf = pd.DataFrame({"val": [["a", "a"], ["b"], ["c"]]}) - gdf = cudf.from_pandas(pdf) - - expect = pdf["val"] + pdf["val"] - got = gdf["val"] + gdf["val"] - - assert_eq(expect, got) - - -def test_concatenate_list_with_nonlist(): - with pytest.raises(TypeError): - gdf1 = cudf.DataFrame({"A": [["a", "c"], ["b", "d"], ["c", "d"]]}) - gdf2 = cudf.DataFrame({"A": ["a", "b", "c"]}) - gdf1["A"] + gdf2["A"] diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 1c508307e32..59609158728 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -1,5 +1,5 @@ # Copyright (c) 2018-2025, NVIDIA CORPORATION. - +import decimal import itertools import pickle @@ -440,3 +440,33 @@ def test_serialize_column_types_preserved(columns): expected = cudf.DataFrame([[10, 11]], columns=columns()) result = cudf.DataFrame.deserialize(*expected.serialize()) assert_eq(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [(["1", "2", "3"], cudf.Decimal64Dtype(1, 0))], + [ + (["1", "2", "3"], cudf.Decimal64Dtype(1, 0)), + (["1.0", "2.0", "3.0"], cudf.Decimal64Dtype(2, 1)), + (["10.1", "20.2", "30.3"], cudf.Decimal64Dtype(3, 1)), + ], + [ + (["1", None, "3"], cudf.Decimal64Dtype(1, 0)), + (["1.0", "2.0", None], cudf.Decimal64Dtype(2, 1)), + ([None, "20.2", "30.3"], cudf.Decimal64Dtype(3, 1)), + ], + ], +) +def test_serialize_decimal_columns(data): + df = cudf.DataFrame( + { + str(i): cudf.Series( + [decimal.Decimal(x) if x is not None else x for x in values], + dtype=dtype, + ) + for i, (values, dtype) in enumerate(data) + } + ) + recreated = df.__class__.deserialize(*df.serialize()) + assert_eq(recreated, df) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py deleted file mode 100644 index 2ee6904ef4d..00000000000 --- a/python/cudf/cudf/tests/test_sorting.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
- - -import numpy as np -import pandas as pd -import pytest - -from cudf import Series -from cudf.testing import assert_eq -from cudf.testing._utils import ( - assert_exceptions_equal, -) - - -@pytest.fixture(params=[2, 257]) -def nelem(request): - return request.param - - -@pytest.fixture( - params=[ - np.int32, - np.int64, - np.uint32, - np.uint64, - np.float32, - np.float64, - ] -) -def dtype(request): - return request.param - - -@pytest.fixture(params=[slice(1, None), slice(None, -1), slice(1, -1)]) -def sliceobj(request): - return request.param - - -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_sort_values_ignore_index(ignore_index): - gsr = Series([1, 3, 5, 2, 4]) - psr = gsr.to_pandas() - - expect = psr.sort_values(ignore_index=ignore_index) - got = gsr.sort_values(ignore_index=ignore_index) - assert_eq(expect, got) - - -@pytest.mark.parametrize("asc", [True, False]) -def test_series_argsort(nelem, dtype, asc): - rng = np.random.default_rng(seed=0) - sr = Series((100 * rng.random(nelem)).astype(dtype)) - res = sr.argsort(ascending=asc) - - if asc: - expected = np.argsort(sr.to_numpy(), kind="mergesort") - else: - # -1 multiply works around missing desc sort (may promote to float64) - expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort") - np.testing.assert_array_equal(expected, res.to_numpy()) - - -@pytest.mark.parametrize("asc", [True, False]) -def test_series_sort_index(nelem, asc): - rng = np.random.default_rng(seed=0) - sr = Series(100 * rng.random(nelem)) - psr = sr.to_pandas() - - expected = psr.sort_index(ascending=asc) - got = sr.sort_index(ascending=asc) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) -@pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 7]) -def test_series_nlargest(data, n): - """Indirectly tests Series.sort_values()""" - sr = Series(data) - psr = pd.Series(data) - assert_eq(sr.nlargest(n), psr.nlargest(n)) - assert_eq(sr.nlargest(n, keep="last"), psr.nlargest(n, keep="last")) - - assert_exceptions_equal( - lfunc=psr.nlargest, - rfunc=sr.nlargest, - lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), - rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), - ) - - -@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) -@pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 9]) -def test_series_nsmallest(data, n): - """Indirectly tests Series.sort_values()""" - sr = Series(data) - psr = pd.Series(data) - assert_eq(sr.nsmallest(n), psr.nsmallest(n)) - assert_eq( - sr.nsmallest(n, keep="last").sort_index(), - psr.nsmallest(n, keep="last").sort_index(), - ) - - assert_exceptions_equal( - lfunc=psr.nsmallest, - rfunc=sr.nsmallest, - lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), - rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), - ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py deleted file mode 100644 index f04c636757f..00000000000 --- a/python/cudf/cudf/tests/test_string.py +++ /dev/null @@ -1,644 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
- -from decimal import Decimal -from sys import getsizeof - -import cupy -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import rmm - -import cudf -from cudf.core.buffer import as_buffer -from cudf.core.column.string import StringColumn -from cudf.core.index import Index -from cudf.testing import assert_eq -from cudf.testing._utils import ( - DATETIME_TYPES, - NUMERIC_TYPES, - assert_exceptions_equal, -) -from cudf.utils import dtypes as dtypeutils - - -@pytest.fixture( - params=[ - ["AbC", "de", "FGHI", "j", "kLm"], - ["nOPq", None, "RsT", None, "uVw"], - [None, None, None, None, None], - ], - ids=["no_nulls", "some_nulls", "all_nulls"], -) -def data(request): - return request.param - - -@pytest.fixture( - params=[None, [10, 11, 12, 13, 14]], ids=["None_index", "Set_index"] -) -def index(request): - return request.param - - -@pytest.fixture -def ps_gs(data, index): - ps = pd.Series(data, index=index, dtype="str", name="nice name") - gs = cudf.Series(data, index=index, dtype="str", name="nice name") - return (ps, gs) - - -@pytest.mark.parametrize("construct", [list, np.array, pd.Series, pa.array]) -def test_string_ingest(construct): - expect = ["a", "a", "b", "c", "a"] - data = construct(expect) - got = cudf.Series(data) - assert got.dtype == np.dtype("object") - assert len(got) == 5 - for idx, val in enumerate(expect): - assert expect[idx] == got[idx] - - -def test_string_export(ps_gs): - ps, gs = ps_gs - - expect = ps - got = gs.to_pandas() - assert_eq(expect, got) - - expect = np.array(ps) - got = gs.to_numpy() - assert_eq(expect, got) - - expect = pa.Array.from_pandas(ps) - got = gs.to_arrow() - - assert pa.Array.equals(expect, got) - - -@pytest.mark.parametrize( - "item", - [ - 0, - 2, - 4, - slice(1, 3), - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [0, 1, 2, 3, 4, 4, 3, 2, 1, 0], - np.array([0, 1, 2, 3, 4]), - cupy.asarray(np.array([0, 1, 2, 3, 4])), - ], -) -def test_string_get_item(ps_gs, item): - ps, gs = ps_gs - - got = gs.iloc[item] - if isinstance(got, cudf.Series): - got = got.to_arrow() - - if isinstance(item, cupy.ndarray): - item = cupy.asnumpy(item) - - expect = ps.iloc[item] - if isinstance(expect, pd.Series): - expect = pa.Array.from_pandas(expect) - pa.Array.equals(expect, got) - else: - if got is cudf.NA and expect is None: - return - assert expect == got - - -@pytest.mark.parametrize( - "item", - [ - [True] * 5, - [False] * 5, - np.array([True] * 5), - np.array([False] * 5), - cupy.asarray(np.array([True] * 5)), - cupy.asarray(np.array([False] * 5)), - np.random.default_rng(seed=0) - .integers(0, 2, 5) - .astype("bool") - .tolist(), - np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool"), - cupy.asarray( - np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool") - ), - ], -) -def test_string_bool_mask(ps_gs, item): - ps, gs = ps_gs - - got = gs.iloc[item] - if isinstance(got, cudf.Series): - got = got.to_arrow() - - if isinstance(item, cupy.ndarray): - item = cupy.asnumpy(item) - - expect = ps[item] - if isinstance(expect, pd.Series): - expect = pa.Array.from_pandas(expect) - pa.Array.equals(expect, got) - else: - assert expect == got - - -@pytest.mark.parametrize("item", [0, slice(1, 3), slice(5)]) -def test_string_repr(ps_gs, item): - ps, gs = ps_gs - - got_out = gs.iloc[item] - expect_out = ps.iloc[item] - - expect = str(expect_out) - got = str(got_out) - - if got_out is not cudf.NA and len(got_out) > 1: - expect = expect.replace("None", "") - - assert expect == got or (expect == "None" and got == "") - - 
-@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"] -) -def test_string_astype(dtype): - if ( - dtype.startswith("int") - or dtype.startswith("uint") - or dtype.startswith("long") - ): - data = ["1", "2", "3", "4", "5"] - elif dtype.startswith("float"): - data = [ - "1.0", - "2.0", - "3.0", - "4.0", - None, - "5.0", - "nan", - "-INF", - "NaN", - "inF", - "NAn", - ] - elif dtype.startswith("bool"): - data = ["True", "False", "True", "False", "False"] - elif dtype.startswith("datetime64"): - data = [ - "2019-06-04T00:00:00", - "2019-06-04T12:12:12", - "2019-06-03T00:00:00", - "2019-05-04T00:00:00", - "2018-06-04T00:00:00", - "1922-07-21T01:02:03", - ] - elif dtype == "str" or dtype == "object": - data = ["ab", "cd", "ef", "gh", "ij"] - ps = pd.Series(data) - gs = cudf.Series(data) - - expect = ps.astype(dtype) - got = gs.astype(dtype) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data, scale, precision", - [ - (["1.11", "2.22", "3.33"], 2, 3), - (["111", "222", "33"], 0, 3), - (["111000", "22000", "3000"], -3, 3), - ([None, None, None], 0, 5), - ([None, "-2345", None], 0, 5), - ([], 0, 5), - ], -) -@pytest.mark.parametrize( - "decimal_dtype", - [cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype], -) -def test_string_to_decimal(data, scale, precision, decimal_dtype): - gs = cudf.Series(data, dtype="str") - fp = gs.astype(decimal_dtype(scale=scale, precision=precision)) - got = fp.astype("str") - assert_eq(gs, got) - - -def test_string_empty_to_decimal(): - gs = cudf.Series(["", "-85", ""], dtype="str") - got = gs.astype(cudf.Decimal64Dtype(scale=0, precision=5)) - expected = cudf.Series( - [0, -85, 0], - dtype=cudf.Decimal64Dtype(scale=0, precision=5), - ) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data, scale, precision", - [ - (["1.23", "-2.34", "3.45"], 2, 3), - (["123", "-234", "345"], 0, 3), - (["12300", "-400", "5000.0"], -2, 5), - ([None, None, None], 0, 5), - ([None, "-100", None], 0, 5), - ([], 0, 5), - ], -) -@pytest.mark.parametrize( - "decimal_dtype", - [cudf.Decimal128Dtype, cudf.Decimal32Dtype, cudf.Decimal64Dtype], -) -def test_string_from_decimal(data, scale, precision, decimal_dtype): - decimal_data = [] - for d in data: - if d is None: - decimal_data.append(None) - else: - decimal_data.append(Decimal(d)) - fp = cudf.Series( - decimal_data, - dtype=decimal_dtype(scale=scale, precision=precision), - ) - gs = fp.astype("str") - got = gs.astype(decimal_dtype(scale=scale, precision=precision)) - assert_eq(fp, got) - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"] -) -def test_string_empty_astype(dtype): - data = [] - ps = pd.Series(data, dtype="str") - gs = cudf.Series(data, dtype="str") - - expect = ps.astype(dtype) - got = gs.astype(dtype) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool"]) -def test_string_numeric_astype(dtype): - if dtype.startswith("bool"): - data = [1, 0, 1, 0, 1] - elif ( - dtype.startswith("int") - or dtype.startswith("uint") - or dtype.startswith("long") - ): - data = [1, 2, 3, 4, 5] - elif dtype.startswith("float"): - data = [1.0, 2.0, 3.0, 4.0, 5.0] - elif dtype.startswith("datetime64"): - # pandas rounds the output format based on the data - # Use numpy instead - # but fix '2011-01-01T00:00:00' -> '2011-01-01 00:00:00' - data = [1000000001, 2000000001, 3000000001, 4000000001, 5000000001] - ps = np.asarray(data, dtype=dtype).astype(str) - ps = 
np.array([i.replace("T", " ") for i in ps]) - - if not dtype.startswith("datetime64"): - ps = pd.Series(data, dtype=dtype) - - gs = cudf.Series(data, dtype=dtype) - - expect = pd.Series(ps.astype("str")) - got = gs.astype("str") - - assert_eq(expect, got) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool"]) -def test_string_empty_numeric_astype(dtype): - data = [] - - if dtype.startswith("datetime64"): - ps = pd.Series(data, dtype="datetime64[ns]") - else: - ps = pd.Series(data, dtype=dtype) - gs = cudf.Series(data, dtype=dtype) - - expect = ps.astype("str") - got = gs.astype("str") - - assert_eq(expect, got) - - -@pytest.mark.parametrize("ascending", [True, False]) -def test_string_sort(ps_gs, ascending): - ps, gs = ps_gs - - expect = ps.sort_values(ascending=ascending) - got = gs.sort_values(ascending=ascending) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] -) -@pytest.mark.parametrize("num_keys", [1, 2, 3]) -def test_string_groupby_key(str_data, num_keys): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_keys): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - - expect = pdf.groupby(list(range(num_keys)), as_index=False).count() - got = gdf.groupby(list(range(num_keys)), as_index=False).count() - - expect = expect.sort_values([0]).reset_index(drop=True) - got = got.sort_values([0]).reset_index(drop=True) - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] -) -@pytest.mark.parametrize("num_cols", [1, 2, 3]) -@pytest.mark.parametrize("agg", ["count", "max", "min"]) -def test_string_groupby_non_key(str_data, num_cols, agg): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_cols): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - - expect = getattr(pdf.groupby("a", as_index=False), agg)() - got = getattr(gdf.groupby("a", as_index=False), agg)() - - expect = expect.sort_values(["a"]).reset_index(drop=True) - got = got.sort_values(["a"]).reset_index(drop=True) - - if agg in ["min", "max"] and len(expect) == 0 and len(got) == 0: - for i in range(num_cols): - expect[i] = expect[i].astype("str") - - assert_eq(expect, got, check_dtype=False) - - -def test_string_groupby_key_index(): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["a"] = pd.Series(str_data, dtype="str") - gdf["a"] = cudf.Series(str_data, dtype="str") - pdf["b"] = other_data - gdf["b"] = other_data - - expect = pdf.groupby("a", sort=True).count() - got = gdf.groupby("a", sort=True).count() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("scalar", ["a", None]) -def test_string_set_scalar(scalar): - pdf = pd.DataFrame() - pdf["a"] = [1, 2, 3, 4, 5] - gdf = cudf.DataFrame.from_pandas(pdf) - - pdf["b"] = "a" - gdf["b"] = "a" - - assert_eq(pdf["b"], gdf["b"]) - assert_eq(pdf, gdf) - - -def test_string_index(): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame(rng.random(size=(5, 5))) - gdf = cudf.DataFrame.from_pandas(pdf) - stringIndex = ["a", "b", "c", "d", "e"] - pdf.index = stringIndex - 
gdf.index = stringIndex - assert_eq(pdf, gdf) - stringIndex = np.array(["a", "b", "c", "d", "e"]) - pdf.index = stringIndex - gdf.index = stringIndex - assert_eq(pdf, gdf) - stringIndex = Index(["a", "b", "c", "d", "e"], name="name") - pdf.index = stringIndex.to_pandas() - gdf.index = stringIndex - assert_eq(pdf, gdf) - stringIndex = cudf.Index._from_column( - cudf.core.column.as_column(["a", "b", "c", "d", "e"]), name="name" - ) - pdf.index = stringIndex.to_pandas() - gdf.index = stringIndex - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "item", - [ - ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], - ["a", "a", "a", "a", "A"], - ["A"], - ["abc", "xyz", None, "ab", "123"], - [None, None, "abc", None, "abc"], - ], -) -def test_string_unique(item): - ps = pd.Series(item) - gs = cudf.Series(item) - # Pandas `unique` returns a numpy array - pres = pd.Series(ps.unique()) - # cudf returns a cudf.Series - gres = gs.unique() - assert_eq(pres, gres) - - -def test_string_equality(): - data1 = ["b", "c", "d", "a", "c"] - data2 = ["a", None, "c", "a", "c"] - - ps1 = pd.Series(data1) - ps2 = pd.Series(data2) - gs1 = cudf.Series(data1) - gs2 = cudf.Series(data2) - - expect = ps1 == ps2 - got = gs1 == gs2 - - assert_eq(expect, got.fillna(False)) - - expect = ps1 == "m" - got = gs1 == "m" - - assert_eq(expect, got.fillna(False)) - - ps1 = pd.Series(["a"]) - gs1 = cudf.Series(["a"]) - - expect = ps1 == "m" - got = gs1 == "m" - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "lhs", - [ - ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], - ["abc", "xyz", "a", "ab", "123", "097"], - ], -) -@pytest.mark.parametrize( - "rhs", - [ - ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], - ["a", "a", "a", "a", "A", "z"], - ], -) -def test_string_binary_op_add(lhs, rhs): - pds = pd.Series(lhs) + pd.Series(rhs) - gds = cudf.Series(lhs) + cudf.Series(rhs) - - assert_eq(pds, gds) - - -def test_string_no_children_properties(): - empty_col = StringColumn( - as_buffer(rmm.DeviceBuffer(size=0)), - size=0, - dtype=np.dtype("object"), - children=(), - ) - assert empty_col.base_children == () - assert empty_col.base_size == 0 - - assert empty_col.children == () - assert empty_col.size == 0 - - assert getsizeof(empty_col) >= 0 # Accounts for Python GC overhead - - -def test_string_table_view_creation(): - data = ["hi"] * 25 + [None] * 2027 - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) - - expect = psr[:1] - got = gsr[:1] - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data,dtype", - [ - (["0.1", "10.2", "10.876"], "float"), - (["-0.1", "10.2", "+10.876"], "float"), - (["1", "10.2", "10.876"], "float32"), - (["+123", "6344556789", "0"], "int"), - (["+123", "6344556789", "0"], "uint64"), - (["+123", "6344556789", "0"], "float"), - (["0.1", "-10.2", "10.876", None], "float"), - ], -) -@pytest.mark.parametrize("obj_type", [None, "str", "category"]) -def test_string_typecast(data, obj_type, dtype): - psr = pd.Series(data, dtype=obj_type) - gsr = cudf.Series(data, dtype=obj_type) - - expect = psr.astype(dtype=dtype) - actual = gsr.astype(dtype=dtype) - assert_eq(expect, actual) - - -@pytest.mark.parametrize( - "data,dtype", - [ - (["0.1", "10.2", "10.876"], "int"), - (["1", "10.2", "+10.876"], "int"), - (["abc", "1", "2", " "], "int"), - (["0.1", "10.2", "10.876"], "uint64"), - (["1", "10.2", "+10.876"], "uint64"), - (["abc", "1", "2", " "], "uint64"), - ([" ", "0.1", "2"], "float"), - ([""], "int"), - ([""], "uint64"), - ([" "], "float"), - (["\n"], "int"), - (["\n"], "uint64"), - (["0.1", "-10.2", "10.876", 
None], "int"), - (["0.1", "-10.2", "10.876", None], "uint64"), - (["0.1", "-10.2", "10.876", None, "ab"], "float"), - (["+", "-"], "float"), - (["+", "-"], "int"), - (["+", "-"], "uint64"), - (["1++++", "--2"], "float"), - (["1++++", "--2"], "int"), - (["1++++", "--2"], "uint64"), - (["++++1", "--2"], "float"), - (["++++1", "--2"], "int"), - (["++++1", "--2"], "uint64"), - ], -) -@pytest.mark.parametrize("obj_type", [None, "str", "category"]) -def test_string_typecast_error(data, obj_type, dtype): - psr = pd.Series(data, dtype=obj_type) - gsr = cudf.Series(data, dtype=obj_type) - - assert_exceptions_equal( - lfunc=psr.astype, - rfunc=gsr.astype, - lfunc_args_and_kwargs=([dtype],), - rfunc_args_and_kwargs=([dtype],), - ) - - -def test_string_int_to_ipv4(): - gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype( - "uint32" - ) - expected = cudf.Series( - ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] - ) - - got = cudf.Series._from_column(gsr._column.int2ip()) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"uint32"})) -) -def test_string_int_to_ipv4_dtype_fail(dtype): - gsr = cudf.Series([1, 2, 3, 4, 5]).astype(dtype) - with pytest.raises(TypeError): - gsr._column.int2ip() - - -def test_string_slice_with_mask(): - actual = cudf.Series(["hi", "hello", None]) - expected = actual[0:3] - - assert actual._column.base_size == 3 - assert_eq(actual._column.base_size, expected._column.base_size) - assert_eq(actual._column.null_count, expected._column.null_count) - - assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py deleted file mode 100644 index 833035858be..00000000000 --- a/python/cudf/cudf/tests/test_timedelta.py +++ /dev/null @@ -1,644 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
- -import datetime -import operator - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import _utils as utils, assert_eq -from cudf.testing._utils import assert_exceptions_equal - - -@pytest.fixture( - params=[ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ] -) -def data_non_overflow(request): - return request.param - - -@pytest.fixture(params=utils.TIMEDELTA_TYPES) -def timedelta_dtype(request): - return request.param - - -@pytest.mark.parametrize( - "data,other", - [ - ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), - ([1000000, 200000, None], [1000000, 200000, None]), - ([], []), - ([None], [None]), - ([None, None, None, None, None], [None, None, None, None, None]), - ( - [12, 12, 22, 343, 4353534, 435342], - [12, 12, 22, 343, 4353534, 435342], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ([1000000, 200000, 3000000], [200000, 34543, 3000000]), - ([1000000, 200000, None], [1000000, 200000, 3000000]), - ([None], [1]), - ( - [12, 12, 22, 343, 4353534, 435342], - [None, 1, 220, 3, 34, 4353423287], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ], -) -@pytest.mark.parametrize( - "ops", - [ - "eq", - "ne", - "lt", - "gt", - "le", - "ge", - "add", - "radd", - "sub", - "rsub", - "floordiv", - "truediv", - "mod", - ], -) -def test_timedelta_ops_misc_inputs(data, other, timedelta_dtype, ops): - gsr = cudf.Series(data, dtype=timedelta_dtype) - other_gsr = cudf.Series(other, dtype=timedelta_dtype) - - psr = gsr.to_pandas() - other_psr = other_gsr.to_pandas() - - expected = getattr(psr, ops)(other_psr) - actual = getattr(gsr, ops)(other_gsr) - if ops in ("eq", "lt", "gt", "le", "ge"): - actual = actual.fillna(False) - elif ops == "ne": - actual = actual.fillna(True) - - if ops == "floordiv": - expected[actual.isna().to_pandas()] = np.nan - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "datetime_data,timedelta_data", - [ - ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), - ([1000000, 200000, None], [1000000, 200000, None]), - ([], []), - ([None], [None]), - ([None, None, None, None, None], [None, None, None, None, None]), - ( - [12, 12, 22, 343, 4353534, 435342], - [12, 12, 22, 343, 4353534, 435342], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ([1000000, 200000, 3000000], [200000, 34543, 3000000]), - ([1000000, 200000, None], [1000000, 200000, 3000000]), - ([None], [1]), - ( - [12, 12, 22, 343, 4353534, 435342], - [None, 1, 220, 3, 34, 4353423287], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ( - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - ), - ( - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 
23234], - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - ), - ( - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ), - ( - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ), - ], -) -@pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) -@pytest.mark.parametrize( - "ops", - ["add", "sub"], -) -def test_timedelta_ops_datetime_inputs( - datetime_data, timedelta_data, datetime_dtype, timedelta_dtype, ops -): - gsr_datetime = cudf.Series(datetime_data, dtype=datetime_dtype) - gsr_timedelta = cudf.Series(timedelta_data, dtype=timedelta_dtype) - - psr_datetime = gsr_datetime.to_pandas() - psr_timedelta = gsr_timedelta.to_pandas() - - expected = getattr(psr_datetime, ops)(psr_timedelta) - actual = getattr(gsr_datetime, ops)(gsr_timedelta) - - assert_eq(expected, actual) - - if ops == "add": - expected = getattr(psr_timedelta, ops)(psr_datetime) - actual = getattr(gsr_timedelta, ops)(gsr_datetime) - - assert_eq(expected, actual) - elif ops == "sub": - assert_exceptions_equal( - lfunc=operator.sub, - rfunc=operator.sub, - lfunc_args_and_kwargs=([psr_timedelta, psr_datetime],), - rfunc_args_and_kwargs=([gsr_timedelta, gsr_datetime],), - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame( - { - "A": pd.Series(pd.date_range("2012-1-1", periods=3, freq="D")), - "B": pd.Series([pd.Timedelta(days=i) for i in range(3)]), - } - ), - pd.DataFrame( - { - "A": pd.Series( - pd.date_range("1994-1-1", periods=50, freq="D") - ), - "B": pd.Series([pd.Timedelta(days=i) for i in range(50)]), - } - ), - ], -) -@pytest.mark.parametrize("op", ["add", "sub"]) -def test_timedelta_dataframe_ops(df, op): - pdf = df - gdf = cudf.from_pandas(pdf) - - if op == "add": - pdf["C"] = pdf["A"] + pdf["B"] - gdf["C"] = gdf["A"] + gdf["B"] - elif op == "sub": - pdf["C"] = pdf["A"] - pdf["B"] - gdf["C"] = gdf["A"] - gdf["B"] - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize( - "other_scalars", - [ - datetime.timedelta(days=768), - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - datetime.timedelta(minutes=447), - datetime.timedelta(hours=447), - datetime.timedelta(weeks=734), - np.timedelta64(4, "s"), - np.timedelta64(456, "D"), - np.timedelta64(46, "h"), - np.timedelta64("nat"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize( - "op", - [ - "add", - "sub", - "truediv", - "mod", - "floordiv", - ], -) -def test_timedelta_series_ops_with_scalars( - data, other_scalars, timedelta_dtype, op -): - gsr = cudf.Series(data=data, dtype=timedelta_dtype) - psr = gsr.to_pandas() - - if op == "add": - expected = psr + other_scalars - actual = gsr + other_scalars - elif op == "sub": - expected = psr - other_scalars - actual = gsr - other_scalars - elif op == "truediv": - expected = psr / other_scalars - actual = gsr / other_scalars - elif op == 
"floordiv": - expected = psr // other_scalars - actual = gsr // other_scalars - elif op == "mod": - expected = psr % other_scalars - actual = gsr % other_scalars - - assert_eq(expected, actual) - - if op == "add": - expected = other_scalars + psr - actual = other_scalars + gsr - elif op == "sub": - expected = other_scalars - psr - actual = other_scalars - gsr - elif op == "truediv": - expected = other_scalars / psr - actual = other_scalars / gsr - elif op == "floordiv": - expected = other_scalars // psr - actual = other_scalars // gsr - elif op == "mod": - expected = other_scalars % psr - actual = other_scalars % gsr - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "reverse", - [ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - strict=True, - reason=( - "timedelta modulo by zero is dubiously defined in " - "both pandas and cuDF " - "(see https://github.com/rapidsai/cudf/issues/5938)" - ), - ), - ), - ], -) -def test_timedelta_series_mod_with_scalar_zero(reverse): - gsr = cudf.Series(data=[0.2434], dtype=np.timedelta64(1, "ns")) - psr = gsr.to_pandas() - scalar = datetime.timedelta(days=768) - if reverse: - expected = scalar % psr - actual = scalar % gsr - else: - expected = psr % scalar - actual = gsr % scalar - assert_eq(expected, actual) - - -@pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) -def test_timedelta_index_datetime_index_ops( - data_non_overflow, datetime_dtype, timedelta_dtype -): - gdt = cudf.Index(data_non_overflow, dtype=datetime_dtype) - gtd = cudf.Index(data_non_overflow, dtype=timedelta_dtype) - - pdt = gdt.to_pandas() - ptd = gtd.to_pandas() - - assert_eq(gdt - gtd, pdt - ptd) - assert_eq(gdt + gtd, pdt + ptd) - - -@pytest.mark.parametrize( - "datetime_data,timedelta_data", - [ - ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), - ([1000000, 200000, None], [1000000, 200000, None]), - ([], []), - ([None], [None]), - ([None, None, None, None, None], [None, None, None, None, None]), - ( - [12, 12, 22, 343, 4353534, 435342], - [12, 12, 22, 343, 4353534, 435342], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ([1000000, 200000, 3000000], [200000, 34543, 3000000]), - ([1000000, 200000, None], [1000000, 200000, 3000000]), - ([None], [1]), - ( - [12, 12, 22, 343, 4353534, 435342], - [None, 1, 220, 3, 34, 4353423287], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ( - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - ), - ( - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - ), - ( - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ), - ( - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ), - ], -) -@pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) -def test_timedelta_datetime_index_ops_misc( - datetime_data, timedelta_data, datetime_dtype, timedelta_dtype -): - gdt = cudf.Index(datetime_data, dtype=datetime_dtype) - gtd = cudf.Index(timedelta_data, dtype=timedelta_dtype) - - pdt = gdt.to_pandas() - ptd = gtd.to_pandas() - - assert_eq(gdt - gtd, pdt - ptd) - assert_eq(gdt + gtd, pdt + ptd) - - -@pytest.mark.parametrize( - "other_scalars", - [ - pd.Timedelta(1513393355.5, unit="s"), 
- pd.Timedelta(34765, unit="D"), - datetime.timedelta(days=768), - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - datetime.timedelta(minutes=447), - datetime.timedelta(hours=447), - datetime.timedelta(weeks=734), - np.timedelta64(4, "s"), - np.timedelta64(456, "D"), - np.timedelta64(46, "h"), - np.timedelta64("nat"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize( - "op", - [ - "add", - "sub", - "truediv", - "floordiv", - ], -) -@pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning:pandas") -def test_timedelta_index_ops_with_scalars( - request, data_non_overflow, other_scalars, timedelta_dtype, op -): - gtdi = cudf.Index(data=data_non_overflow, dtype=timedelta_dtype) - ptdi = gtdi.to_pandas() - - if op == "add": - expected = ptdi + other_scalars - actual = gtdi + other_scalars - elif op == "sub": - expected = ptdi - other_scalars - actual = gtdi - other_scalars - elif op == "truediv": - expected = ptdi / other_scalars - actual = gtdi / other_scalars - elif op == "floordiv": - expected = ptdi // other_scalars - actual = gtdi // other_scalars - - assert_eq(expected, actual) - - if op == "add": - expected = other_scalars + ptdi - actual = other_scalars + gtdi - elif op == "sub": - expected = other_scalars - ptdi - actual = other_scalars - gtdi - elif op == "truediv": - expected = other_scalars / ptdi - actual = other_scalars / gtdi - elif op == "floordiv": - expected = other_scalars // ptdi - actual = other_scalars // gtdi - - # Division by zero for datetime or timedelta is - # dubiously defined in both pandas (Any // 0 -> 0 in - # pandas) and cuDF (undefined behaviour) - request.applymarker( - pytest.mark.xfail( - condition=( - op == "floordiv" - and 0 in ptdi.astype("int") - and np.timedelta64(other_scalars).item() is not None - ), - reason="Related to https://github.com/rapidsai/cudf/issues/5938", - ) - ) - assert_eq(expected, actual) - - -def test_timedelta_invalid_ops(): - sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") - psr = sr.to_pandas() - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, 1],), - rfunc_args_and_kwargs=([sr, 1],), - ) - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, "a"],), - rfunc_args_and_kwargs=([sr, "a"],), - ) - - dt_sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - dt_psr = dt_sr.to_pandas() - - assert_exceptions_equal( - lfunc=operator.mod, - rfunc=operator.mod, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.mod, - rfunc=operator.mod, - lfunc_args_and_kwargs=([psr, "a"],), - rfunc_args_and_kwargs=([sr, "a"],), - check_exception_type=False, - ) - - assert_exceptions_equal( - lfunc=operator.gt, - rfunc=operator.gt, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.lt, - rfunc=operator.lt, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.ge, - rfunc=operator.ge, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.le, - rfunc=operator.le, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.truediv, - 
rfunc=operator.truediv, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.floordiv, - rfunc=operator.floordiv, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.mul, - rfunc=operator.mul, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.mul, - rfunc=operator.mul, - lfunc_args_and_kwargs=([psr, psr],), - rfunc_args_and_kwargs=([sr, sr],), - check_exception_type=False, - ) - - assert_exceptions_equal( - lfunc=operator.xor, - rfunc=operator.xor, - lfunc_args_and_kwargs=([psr, psr],), - rfunc_args_and_kwargs=([sr, sr],), - ) - - -@pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_timdelta_binop_tz_timestamp(op): - s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") - pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") - with pytest.raises(NotImplementedError): - op(s, pd_tz_timestamp) - date_tz_scalar = datetime.datetime.now(datetime.timezone.utc) - with pytest.raises(NotImplementedError): - op(s, date_tz_scalar) - - -@pytest.mark.parametrize( - "op", - [ - operator.lt, - operator.gt, - operator.le, - operator.ge, - operator.eq, - operator.ne, - ], -) -def test_timedelta_series_cmpops_pandas_compatibility(op): - gsr1 = cudf.Series( - data=[123, 456, None, 321, None], dtype="timedelta64[ns]" - ) - psr1 = gsr1.to_pandas() - - gsr2 = cudf.Series( - data=[123, 456, 789, None, None], dtype="timedelta64[ns]" - ) - psr2 = gsr2.to_pandas() - - expect = op(psr1, psr2) - with cudf.option_context("mode.pandas_compatible", True): - got = op(gsr1, gsr2) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "method, kwargs", - [ - ["sum", {}], - ["mean", {}], - ["median", {}], - ["std", {}], - ["std", {"ddof": 0}], - ], -) -def test_tdi_reductions(method, kwargs): - pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) - cudf_tdi = cudf.from_pandas(pd_tdi) - - result = getattr(pd_tdi, method)(**kwargs) - expected = getattr(cudf_tdi, method)(**kwargs) - assert result == expected From 57f59474a04f98dafc63f7ffa40bab39d9141829 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Sep 2025 18:35:25 -0700 Subject: [PATCH 244/366] Avoid more direct construction of cuDF classic columns (#19858) Precursor to https://github.com/rapidsai/cudf/issues/18726. 
We'll want to minimize direct construction of cuDF classic column via their attributes and instead use a pylibcudf in the future Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) URL: https://github.com/rapidsai/cudf/pull/19858 --- python/cudf/cudf/core/column/categorical.py | 132 ++++-------------- python/cudf/cudf/core/column/column.py | 4 +- python/cudf/cudf/core/column/datetime.py | 8 +- python/cudf/cudf/core/column/interval.py | 30 +--- python/cudf/cudf/core/column/struct.py | 10 +- python/cudf/cudf/core/series.py | 6 +- .../cudf/pandas/scripts/conftest-patch.py | 7 - 7 files changed, 45 insertions(+), 152 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 464d63f75f0..3b23265d2c3 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -231,13 +231,7 @@ def __setitem__(self, key, value): value = value.codes codes = self.codes codes[key] = value - out = type(self)( - data=self.data, - size=codes.size, - dtype=self.dtype, - mask=codes.base_mask, - children=(codes,), - ) + out = codes._with_type_metadata(self.dtype) self._mimic_inplace(out, inplace=True) def _fill( @@ -261,14 +255,8 @@ def _fill( return result def slice(self, start: int, stop: int, stride: int | None = None) -> Self: - codes = self.codes.slice(start, stop, stride) - return type(self)( - data=self.data, # type: ignore[arg-type] - size=codes.size, - dtype=self.dtype, - mask=codes.base_mask, - offset=codes.offset, - children=(codes,), + return self.codes.slice(start, stop, stride)._with_type_metadata( # type: ignore[return-value] + self.dtype ) def _reduce( @@ -328,23 +316,12 @@ def _normalize_binop_operand( codes = column.as_column( self._encode(other), length=len(self), dtype=self.codes.dtype ) - return type(self)( - data=None, - size=self.size, - dtype=self.dtype, - mask=self.base_mask, - children=(codes,), # type: ignore[arg-type] - ) + return codes._with_type_metadata(self.dtype) def sort_values(self, ascending: bool = True, na_position="last") -> Self: - codes = self.codes.sort_values(ascending, na_position) - return type(self)( - data=self.data, # type: ignore[arg-type] - size=codes.size, - dtype=self.dtype, - mask=codes.base_mask, - children=(codes,), - ) + return self.codes.sort_values( # type: ignore[return-value] + ascending, na_position + )._with_type_metadata(self.dtype) def element_indexing(self, index: int) -> ScalarLike: val = self.codes.element_indexing(index) @@ -439,15 +416,7 @@ def data_array_view( return self.codes.data_array_view(mode=mode) def unique(self) -> Self: - codes = self.codes.unique() - return type(self)( - data=self.data, # type: ignore[arg-type] - size=codes.size, - dtype=self.dtype, - mask=codes.base_mask, - offset=codes.offset, - children=(codes,), - ) + return self.codes.unique()._with_type_metadata(self.dtype) # type: ignore[return-value] def _cast_self_and_other_for_where( self, other: ScalarLike | ColumnBase, inplace: bool @@ -595,18 +564,11 @@ def find_and_replace( replacement_col = catmap._data["index"].astype(replaced.codes.dtype) replaced_codes = column.as_column(replaced.codes) - output = replaced_codes.replace(to_replace_col, replacement_col) - codes = as_unsigned_codes(len(new_cats["cats"]), output) # type: ignore[arg-type] - - result = type(self)( - data=self.data, # type: ignore[arg-type] - size=codes.size, - dtype=CategoricalDtype( + new_codes = 
replaced_codes.replace(to_replace_col, replacement_col) + result = new_codes._with_type_metadata( + CategoricalDtype( categories=new_cats["cats"], ordered=self.dtype.ordered - ), - mask=codes.base_mask, - offset=codes.offset, - children=(codes,), + ) ) if result.dtype != self.dtype: warnings.warn( @@ -617,7 +579,7 @@ def find_and_replace( "instead.", FutureWarning, ) - return result + return result # type: ignore[return-value] def isnull(self) -> ColumnBase: """ @@ -700,7 +662,7 @@ def as_categorical_column(self, dtype: CategoricalDtype) -> Self: if isinstance( self.categories.dtype, cudf.StructDtype ) and isinstance(dtype.categories.dtype, cudf.IntervalDtype): - codes = self.codes + return self._with_type_metadata(dtype) else: # Otherwise if both categories are of different Column types, # return a column full of nulls. @@ -712,15 +674,7 @@ def as_categorical_column(self, dtype: CategoricalDtype) -> Self: dtype=self.codes.dtype, ), ) - codes = as_unsigned_codes(len(dtype.categories), codes) - return type(self)( - data=self.data, # type: ignore[arg-type] - size=self.size, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - children=(codes,), - ) + return codes._with_type_metadata(dtype) # type: ignore[return-value] return self.set_categories( new_categories=dtype.categories, ordered=bool(dtype.ordered) @@ -813,12 +767,12 @@ def _with_type_metadata(self: Self, dtype: Dtype) -> Self: if isinstance(dtype, CategoricalDtype): return type(self)( data=self.data, # type: ignore[arg-type] - size=self.codes.size, + size=self.size, dtype=dtype, - mask=self.codes.base_mask, - offset=self.codes.offset, - null_count=self.codes.null_count, - children=(self.codes,), + mask=self.base_mask, + offset=self.offset, + null_count=self.null_count, + children=self.base_children, # type: ignore[arg-type] ) return self @@ -847,15 +801,8 @@ def set_categories( "new_categories must have the same " "number of items as old categories" ) - out_col = type(self)( - data=self.data, # type: ignore[arg-type] - size=self.size, - dtype=CategoricalDtype( - categories=new_categories, ordered=ordered - ), - mask=self.base_mask, - offset=self.offset, - children=(self.codes,), + return self._with_type_metadata( + CategoricalDtype(categories=new_categories, ordered=ordered) ) else: out_col = self @@ -870,16 +817,10 @@ def set_categories( dtype=self.codes.dtype, ), ) - new_codes = as_unsigned_codes(len(new_categories), new_codes) - out_col = type(self)( - data=self.data, # type: ignore[arg-type] - size=self.size, - dtype=CategoricalDtype( + out_col = new_codes._with_type_metadata( # type: ignore[assignment] + CategoricalDtype( categories=new_categories, ordered=ordered - ), - mask=self.base_mask, - offset=self.offset, - children=(new_codes,), + ) ) elif ( not out_col._categories_equal(new_categories, ordered=True) @@ -967,16 +908,8 @@ def _set_categories( new_codes = cast( cudf.core.column.numerical.NumericalColumn, df._data["new_codes"] ) - - # codes can't have masks, so take mask out before moving in - new_codes = as_unsigned_codes(len(new_cats), new_codes) - return type(self)( - data=self.data, # type: ignore[arg-type] - size=new_codes.size, - dtype=CategoricalDtype(categories=new_cats, ordered=ordered), - mask=new_codes.base_mask, - offset=new_codes.offset, - children=(new_codes,), + return new_codes._with_type_metadata( # type: ignore[return-value] + CategoricalDtype(categories=new_cats, ordered=ordered) ) def add_categories(self, new_categories: Any) -> Self: @@ -1056,13 +989,6 @@ def remove_unused_categories(self) -> 
Self: def as_ordered(self, ordered: bool) -> Self: if self.dtype.ordered == ordered: return self - return type(self)( - data=self.data, # type: ignore[arg-type] - size=self.size, - dtype=CategoricalDtype( - categories=self.categories, ordered=ordered - ), - mask=self.base_mask, - offset=self.offset, - children=self.children, + return self._with_type_metadata( + CategoricalDtype(categories=self.categories, ordered=ordered) ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 161556391eb..091e6a81c32 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2325,9 +2325,7 @@ def reduce(self, reduction_op: str, **kwargs) -> ScalarLike: new_dtype = type(col_dtype)(precision, scale) result_col = result_col.astype(new_dtype) elif isinstance(col_dtype, IntervalDtype): - result_col = type(self).from_struct_column( # type: ignore[attr-defined] - result_col, closed=col_dtype.closed - ) + result_col = result_col._with_type_metadata(col_dtype) return result_col.element_indexing(0) @acquire_spill_lock() diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 8a0e5e10a15..0cc1402c141 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -908,10 +908,6 @@ def tz_convert(self, tz: str | None) -> DatetimeColumn: elif tz == str(self.dtype.tz): return self.copy() utc_time = self._utc_time - return type(self)( - data=utc_time.base_data, # type: ignore[arg-type] - dtype=pd.DatetimeTZDtype(self.time_unit, tz), - mask=utc_time.base_mask, - size=utc_time.size, - offset=utc_time.offset, + return utc_time._with_type_metadata( + pd.DatetimeTZDtype(self.time_unit, tz) ) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 1aab299e7ad..4fae8d77d88 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -71,36 +71,8 @@ def to_arrow(self) -> pa.Array: struct_arrow = pa.array([], typ.storage_type) return pa.ExtensionArray.from_storage(typ, struct_arrow) - @classmethod - def from_struct_column( - cls, - struct_column: StructColumn, - closed: Literal["left", "right", "both", "neither"] = "right", - ) -> Self: - first_field_name = next(iter(struct_column.dtype.fields.keys())) - return cls( - data=None, - size=struct_column.size, - dtype=IntervalDtype( - struct_column.dtype.fields[first_field_name], closed - ), - mask=struct_column.base_mask, - offset=struct_column.offset, - null_count=struct_column.null_count, - children=struct_column.base_children, # type: ignore[arg-type] - ) - def copy(self, deep: bool = True) -> Self: - struct_copy = super().copy(deep=deep) - return IntervalColumn( # type: ignore[return-value] - data=None, - size=struct_copy.size, - dtype=IntervalDtype(self.dtype.subtype, self.dtype.closed), - mask=struct_copy.base_mask, - offset=struct_copy.offset, - null_count=struct_copy.null_count, - children=struct_copy.base_children, # type: ignore[arg-type] - ) + return super().copy(deep=deep)._with_type_metadata(self.dtype) # type: ignore[return-value] @functools.cached_property def is_empty(self) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index d0a9f0389f6..0815af9c223 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -220,7 +220,15 @@ def _with_type_metadata(self: StructColumn, dtype: Dtype) -> StructColumn: # Check 
IntervalDtype first because it's a subclass of StructDtype if isinstance(dtype, IntervalDtype): - return IntervalColumn.from_struct_column(self, closed=dtype.closed) + return IntervalColumn( + data=None, + size=self.size, + dtype=dtype, + mask=self.base_mask, + offset=self.offset, + null_count=self.null_count, + children=self.base_children, # type: ignore[arg-type] + ) elif isinstance(dtype, StructDtype): return StructColumn( data=None, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f2adf407d2c..d07a8f76205 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -35,7 +35,6 @@ ) from cudf.core.column import ( ColumnBase, - IntervalColumn, as_column, ) from cudf.core.column.column import concat_columns @@ -3112,8 +3111,9 @@ def value_counts( # Pandas returns an IntervalIndex as the index of res # this condition makes sure we do too if bins is given if bins is not None and len(res) == len(res.index.categories): - interval_col = IntervalColumn.from_struct_column( - res.index._column._get_decategorized_column() + struct_col = res.index._column._get_decategorized_column() + interval_col = struct_col._with_type_metadata( + res.index.dtype.categories.dtype ) res.index = cudf.IntervalIndex._from_column( interval_col, name=res.index.name diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index cf12fae4337..c8b645b6d26 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -3777,12 +3777,8 @@ def pytest_unconfigure(config): "tests/extension/test_categorical.py::TestCategorical::test_series_constructor_scalar_na_with_index", "tests/extension/test_categorical.py::TestCategorical::test_setitem_frame_2d_values", "tests/extension/test_categorical.py::TestCategorical::test_to_numpy", - "tests/extension/test_categorical.py::TestCategorical::test_unstack[frame-index0]", - "tests/extension/test_categorical.py::TestCategorical::test_unstack[frame-index1]", "tests/extension/test_categorical.py::TestCategorical::test_unstack[frame-index2]", "tests/extension/test_categorical.py::TestCategorical::test_unstack[frame-index3]", - "tests/extension/test_categorical.py::TestCategorical::test_unstack[series-index0]", - "tests/extension/test_categorical.py::TestCategorical::test_unstack[series-index1]", "tests/extension/test_categorical.py::TestCategorical::test_unstack[series-index2]", "tests/extension/test_categorical.py::TestCategorical::test_unstack[series-index3]", "tests/extension/test_common.py::test_ellipsis_index", @@ -6021,8 +6017,6 @@ def pytest_unconfigure(config): "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_stack_unstack[False]", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_stack_unstack[True]", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_long_index", - "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_mixed_extension_types[0]", - "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_mixed_extension_types[1]", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_multi_level_cols", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_multi_level_rows_and_cols", "tests/frame/test_stack_unstack.py::TestDataFrameReshape::test_unstack_nan_index2", @@ -6058,7 +6052,6 @@ def pytest_unconfigure(config): 
"tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_unstack_preserve_names[True]", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_unstack_wrong_level_name[False-unstack]", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_stack_unstack_wrong_level_name[True-unstack]", - "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_categorical_columns", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_preserve_types", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_with_level_has_nan", "tests/frame/test_stack_unstack.py::TestStackUnstackMultiLevel::test_unstack_with_missing_int_cast_to_float", From 161f21d88a4e3b57cafa49585d2572603a0d01e7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 2 Sep 2025 21:06:45 -0500 Subject: [PATCH 245/366] Bump pandas supported version to `2.3.2` (#19856) This PR updates latest supported pandas version in `cudf`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19856 --- dependencies.yaml | 2 +- python/cudf/cudf/core/_compat.py | 2 +- python/pylibcudf/tests/common/utils.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index de0115a9e6d..73d6900d8f1 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -790,7 +790,7 @@ dependencies: - numba-cuda==0.19.1 - matrix: {dependencies: "latest"} packages: - - pandas==2.3.1 + - pandas==2.3.2 - matrix: packages: - output_types: conda diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index d17ea599f71..9b6f3c6ad67 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -3,7 +3,7 @@ import pandas as pd from packaging import version -PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.3.1") +PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.3.2") PANDAS_VERSION = version.parse(pd.__version__) diff --git a/python/pylibcudf/tests/common/utils.py b/python/pylibcudf/tests/common/utils.py index 1ec0afed6cb..6302d59563b 100644 --- a/python/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/tests/common/utils.py @@ -338,11 +338,11 @@ def sink_to_str(sink): for comparison """ if isinstance(sink, (str, os.PathLike)): - with open(sink, "r") as f: + with open(sink, "r", encoding="utf-8") as f: str_result = f.read() elif isinstance(sink, io.BytesIO): sink.seek(0) - str_result = sink.read().decode() + str_result = sink.read().decode("utf-8") else: sink.seek(0) str_result = sink.read() From 839e03952de15eb52d366f554d166c210ad906d2 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 3 Sep 2025 11:10:49 -0700 Subject: [PATCH 246/366] Migrate mixed join to use multiset (#19660) This PR migrates mixed join to use the new `cuco::static_multiset` data structure and fixes several small issues in the mixed semi join code. This is the last PR to close #12261. 
Authors: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/19660 --- cpp/include/cudf/join/mixed_join.hpp | 127 ++--- cpp/src/join/hash_join.cu | 15 +- cpp/src/join/join_common_utils.cuh | 77 ++- cpp/src/join/join_common_utils.hpp | 12 +- cpp/src/join/mixed_join.cu | 459 +++++++++--------- cpp/src/join/mixed_join_common_utils.cuh | 97 ++-- cpp/src/join/mixed_join_kernel.cu | 29 +- cpp/src/join/mixed_join_kernel.cuh | 296 ++++++++--- cpp/src/join/mixed_join_kernel.hpp | 79 ++- cpp/src/join/mixed_join_kernel_nulls.cu | 29 +- cpp/src/join/mixed_join_kernels_semi.cu | 5 +- cpp/src/join/mixed_join_semi.cu | 2 +- cpp/src/join/mixed_join_size_kernel.cu | 21 +- cpp/src/join/mixed_join_size_kernel.cuh | 181 ++++--- cpp/src/join/mixed_join_size_kernel.hpp | 63 ++- cpp/src/join/mixed_join_size_kernel_nulls.cu | 22 +- cpp/tests/join/mixed_join_tests.cu | 65 ++- .../java/ai/rapids/cudf/MixedJoinSize.java | 43 -- java/src/main/java/ai/rapids/cudf/Table.java | 114 ++--- java/src/main/native/src/TableJni.cpp | 157 +++--- .../test/java/ai/rapids/cudf/TableTest.java | 40 +- python/pylibcudf/pylibcudf/join.pyx | 106 +++- python/pylibcudf/pylibcudf/libcudf/join.pxd | 27 +- 23 files changed, 1121 insertions(+), 945 deletions(-) delete mode 100644 java/src/main/java/ai/rapids/cudf/MixedJoinSize.java diff --git a/cpp/include/cudf/join/mixed_join.hpp b/cpp/include/cudf/join/mixed_join.hpp index 1d7261eebb3..8f17b0ebbc7 100644 --- a/cpp/include/cudf/join/mixed_join.hpp +++ b/cpp/include/cudf/join/mixed_join.hpp @@ -53,7 +53,7 @@ namespace CUDF_EXPORT cudf { * responsibility to choose a suitable compare_nulls value AND use appropriate * null-safe operators in the expression. * - * If the provided output size or per-row counts are incorrect, behavior is undefined. + * If the specified output size is less than the actual output size, the behavior is undefined. * * @code{.pseudo} * left_equality: {{0, 1, 2}} @@ -76,8 +76,7 @@ namespace CUDF_EXPORT cudf { * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not - * @param output_size_data An optional pair of values indicating the exact output size and the - * number of matches for each row in the larger of the two input tables, left or right (may be + * @param output_size An optional value indicating the exact output size (may be * precomputed using the corresponding mixed_inner_join_size API). 
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned table and columns' device memory
@@ -87,16 +86,15 @@ namespace CUDF_EXPORT cudf {
 */
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-mixed_inner_join(
-  table_view const& left_equality,
-  table_view const& right_equality,
-  table_view const& left_conditional,
-  table_view const& right_conditional,
-  ast::expression const& binary_predicate,
-  null_equality compare_nulls = null_equality::EQUAL,
-  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+mixed_inner_join(table_view const& left_equality,
+                 table_view const& right_equality,
+                 table_view const& left_conditional,
+                 table_view const& right_conditional,
+                 ast::expression const& binary_predicate,
+                 null_equality compare_nulls            = null_equality::EQUAL,
+                 std::optional<std::size_t> output_size = {},
+                 rmm::cuda_stream_view stream           = cudf::get_default_stream(),
+                 rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs of
@@ -115,7 +113,7 @@ mixed_inner_join(
 * responsibility to choose a suitable compare_nulls value AND use appropriate
 * null-safe operators in the expression.
 *
- * If the provided output size or per-row counts are incorrect, behavior is undefined.
+ * If the specified output size is less than the actual output size, the behavior is undefined.
 *
 * @code{.pseudo}
 * left_equality: {{0, 1, 2}}
@@ -138,8 +136,7 @@ mixed_inner_join(
 * @param right_conditional The right table used for the conditional join
 * @param binary_predicate The condition on which to join
 * @param compare_nulls Whether or not null values join to each other or not
- * @param output_size_data An optional pair of values indicating the exact output size and the
- * number of matches for each row in the larger of the two input tables, left or right (may be
+ * @param output_size An optional value indicating the exact output size (may be
 * precomputed using the corresponding mixed_left_join_size API).
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned table and columns' device memory
@@ -149,16 +146,15 @@ mixed_inner_join(
 */
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-mixed_left_join(
-  table_view const& left_equality,
-  table_view const& right_equality,
-  table_view const& left_conditional,
-  table_view const& right_conditional,
-  ast::expression const& binary_predicate,
-  null_equality compare_nulls = null_equality::EQUAL,
-  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+mixed_left_join(table_view const& left_equality,
+                table_view const& right_equality,
+                table_view const& left_conditional,
+                table_view const& right_conditional,
+                ast::expression const& binary_predicate,
+                null_equality compare_nulls            = null_equality::EQUAL,
+                std::optional<std::size_t> output_size = {},
+                rmm::cuda_stream_view stream           = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs of
@@ -177,7 +173,7 @@ mixed_left_join(
 * responsibility to choose a suitable compare_nulls value AND use appropriate
 * null-safe operators in the expression.
 *
- * If the provided output size or per-row counts are incorrect, behavior is undefined.
+ * If the specified output size is less than the actual output size, the behavior is undefined.
 *
 * @code{.pseudo}
 * left_equality: {{0, 1, 2}}
@@ -200,9 +196,7 @@ mixed_left_join(
 * @param right_conditional The right table used for the conditional join
 * @param binary_predicate The condition on which to join
 * @param compare_nulls Whether or not null values join to each other or not
- * @param output_size_data An optional pair of values indicating the exact output size and the
- * number of matches for each row in the larger of the two input tables, left or right (may be
- * precomputed using the corresponding mixed_full_join_size API).
+ * @param output_size An optional value indicating the exact output size
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned table and columns' device memory
 *
@@ -211,16 +205,15 @@ mixed_left_join(
 */
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-mixed_full_join(
-  table_view const& left_equality,
-  table_view const& right_equality,
-  table_view const& left_conditional,
-  table_view const& right_conditional,
-  ast::expression const& binary_predicate,
-  null_equality compare_nulls = null_equality::EQUAL,
-  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+mixed_full_join(table_view const& left_equality,
+                table_view const& right_equality,
+                table_view const& left_conditional,
+                table_view const& right_conditional,
+                ast::expression const& binary_predicate,
+                null_equality compare_nulls            = null_equality::EQUAL,
+                std::optional<std::size_t> output_size = {},
+                rmm::cuda_stream_view stream           = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns an index vector corresponding to all rows in the left tables
 *
@@ -232,7 +225,6 @@ mixed_full_join(
 * choose a suitable compare_nulls value AND use appropriate null-safe
 * operators in the expression.
 *
- * If the provided output size or per-row counts are incorrect, behavior is undefined.
 *
 * @code{.pseudo}
 * left_equality: {{0, 1, 2}}
@@ -258,8 +250,7 @@ mixed_full_join(
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned table and columns' device memory
 *
- * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
- * the result of performing a mixed full join between the four input tables.
+ * @return A vector of indices from the left table that have matches in the right table.
 */
 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
   table_view const& left_equality,
@@ -282,7 +273,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
 * choose a suitable compare_nulls value AND use appropriate null-safe
 * operators in the expression.
 *
- * If the provided output size or per-row counts are incorrect, behavior is undefined.
 *
 * @code{.pseudo}
 * left_equality: {{0, 1, 2}}
@@ -308,8 +298,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned table and columns' device memory
 *
- * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
- * the result of performing a mixed full join between the four input tables.
+ * @return A vector of indices from the left table that do not have matches in the right table.
 */
 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
   table_view const& left_equality,
   table_view const& right_equality,
   table_view const& left_conditional,
   table_view const& right_conditional,
@@ -345,23 +334,16 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
 * @param binary_predicate The condition on which to join
 * @param compare_nulls Whether or not null values join to each other or not
 * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned table and columns' device memory
 *
- * @return A pair containing the size that would result from performing the
- * requested join and the number of matches for each row in one of the two
- * tables. Which of the two tables is an implementation detail and should not
- * be relied upon, simply passed to the corresponding `mixed_inner_join` API as
- * is.
+ * @return The size that would result from performing the requested join.
 */
-std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_inner_join_size(
-  table_view const& left_equality,
-  table_view const& right_equality,
-  table_view const& left_conditional,
-  table_view const& right_conditional,
-  ast::expression const& binary_predicate,
-  null_equality compare_nulls = null_equality::EQUAL,
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+std::size_t mixed_inner_join_size(table_view const& left_equality,
+                                  table_view const& right_equality,
+                                  table_view const& left_conditional,
+                                  table_view const& right_conditional,
+                                  ast::expression const& binary_predicate,
+                                  null_equality compare_nulls  = null_equality::EQUAL,
+                                  rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -387,23 +369,16 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_in
 * @param binary_predicate The condition on which to join
 * @param compare_nulls Whether or not null values join to each other or not
 * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned table and columns' device memory
 *
- * @return A pair containing the size that would result from performing the
- * requested join and the number of matches for each row in one of the two
- * tables. Which of the two tables is an implementation detail and should not
- * be relied upon, simply passed to the corresponding `mixed_left_join` API as
- * is.
+ * @return The size that would result from performing the requested join.
 */
-std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_join_size(
-  table_view const& left_equality,
-  table_view const& right_equality,
-  table_view const& left_conditional,
-  table_view const& right_conditional,
-  ast::expression const& binary_predicate,
-  null_equality compare_nulls = null_equality::EQUAL,
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+std::size_t mixed_left_join_size(table_view const& left_equality,
+                                 table_view const& right_equality,
+                                 table_view const& left_conditional,
+                                 table_view const& right_conditional,
+                                 ast::expression const& binary_predicate,
+                                 null_equality compare_nulls  = null_equality::EQUAL,
+                                 rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /** @} */  // end of group
 
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index 06220e299de..5a211b91466 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -52,20 +52,7 @@ namespace {
 using hash_table_t = cudf::hash_join::impl_type::hash_table_t;
 
 // Multimap type used for mixed joins. TODO: This is a temporary alias used
-// TODO: `pair_equal` and `pair_fn` to be moved to common utils during
-// mixed-join migration
-template <typename Hasher>
-struct pair_fn {
-  pair_fn(Hasher hash) : _hash{std::move(hash)} {}
-
-  __device__ cuco::pair<hash_value_type, size_type> operator()(size_type i) const noexcept
-  {
-    return cuco::pair{_hash(i), i};
-  }
-
- private:
-  Hasher _hash;
-};
+// TODO: `pair_equal` to be moved to common utils during mixed-join migration
 template
 class pair_equal {
 
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 91dab3e67f4..11210e11362 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -33,43 +33,17 @@
 #include
 
 namespace cudf::detail {
-/**
- * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value.
- *
- * @param hash The hash value to potentially remap
- * @param sentinel The reserved value
- */
-template <typename H, typename S>
-constexpr auto remap_sentinel_hash(H hash, S sentinel)
-{
-  // Arbitrarily choose hash - 1
-  return (hash == sentinel) ? (hash - 1) : hash;
-}
+template <typename Hasher>
+struct pair_fn {
+  CUDF_HOST_DEVICE pair_fn(Hasher hash) : _hash{std::move(hash)} {}
 
-/**
- * @brief Device functor to create a pair of {hash_value, row_index} for a given row.
- *
- * @tparam T Type of row index, must be convertible to `size_type`.
- * @tparam Hasher The type of internal hasher to compute row hash.
- */
-template <typename T, typename Hasher>
-class make_pair_function {
- public:
-  CUDF_HOST_DEVICE make_pair_function(Hasher const& hash, hash_value_type const empty_key_sentinel)
-    : _hash{hash}, _empty_key_sentinel{empty_key_sentinel}
-  {
-  }
-
-  __device__ __forceinline__ auto operator()(size_type i) const noexcept
+  __device__ cuco::pair<hash_value_type, size_type> operator()(size_type i) const noexcept
   {
-    // Compute the hash value of row `i`
-    auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel);
-    return cuco::make_pair(row_hash_value, T{i});
+    return cuco::pair{_hash(i), i};
   }
 
  private:
  Hasher _hash;
-  hash_value_type const _empty_key_sentinel;
 };
 
 /**
@@ -165,37 +139,39 @@ get_trivial_left_join_indices(table_view const& left,
 * @param bitmask Bitmask to denote whether a row is valid.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
-template <typename MultimapType>
+template <typename HashTable>
 void build_join_hash_table(
   cudf::table_view const& build,
   std::shared_ptr<experimental::row::equality::preprocessed_table> const& preprocessed_build,
-  MultimapType& hash_table,
-  bool has_nulls,
+  HashTable& hash_table,
+  bool has_nested_nulls,
   null_equality nulls_equal,
   [[maybe_unused]] bitmask_type const* bitmask,
   rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(0 != build.num_columns(), "Selected build dataset is empty");
-  CUDF_EXPECTS(0 != build.num_rows(), "Build side table has no rows");
+  CUDF_EXPECTS(0 != build.num_columns(), "Selected build dataset is empty", std::invalid_argument);
+  CUDF_EXPECTS(0 != build.num_rows(), "Build side table has no rows", std::invalid_argument);
 
-  auto const row_hash   = experimental::row::hash::row_hasher{preprocessed_build};
-  auto const hash_build = row_hash.device_hasher(nullate::DYNAMIC{has_nulls});
+  auto insert_rows = [&](auto const& build, auto const& d_hasher) {
+    auto const iter = cudf::detail::make_counting_transform_iterator(0, pair_fn{d_hasher});
 
-  auto const empty_key_sentinel = hash_table.get_empty_key_sentinel();
-  make_pair_function pair_func{hash_build, empty_key_sentinel};
+    if (nulls_equal == cudf::null_equality::EQUAL or not nullable(build)) {
+      hash_table.insert_async(iter, iter + build.num_rows(), stream.value());
+    } else {
+      auto const stencil = thrust::counting_iterator<size_type>{0};
+      auto const pred    = row_is_valid{bitmask};
 
-  auto const iter = cudf::detail::make_counting_transform_iterator(0, pair_func);
+      // insert valid rows
+      hash_table.insert_if_async(iter, iter + build.num_rows(), stencil, pred, stream.value());
+    }
+  };
 
-  size_type const build_table_num_rows{build.num_rows()};
-  if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) {
-    hash_table.insert(iter, iter + build_table_num_rows, stream.value());
-  } else {
-    thrust::counting_iterator<size_type> stencil(0);
-    row_is_valid pred{bitmask};
+  auto const nulls = nullate::DYNAMIC{has_nested_nulls};
 
-    // insert valid rows
-    hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value());
-  }
+  auto const row_hash =
+    experimental::row::hash::row_hasher{preprocessed_build};
+  auto const d_hasher = row_hash.device_hasher(nulls);
+
+  insert_rows(build, d_hasher);
 }
 
 // Convenient alias for a pair of unique pointers to device uvectors.
@@ -262,4 +238,5 @@ struct valid_range {
     return ((index >= start) && (index < stop));
   }
 };
+
 }  // namespace cudf::detail
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index a78b07e6be4..27d3c3afe45 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -20,7 +20,7 @@
 #include
 #include
 
-#include
+#include
 #include
 
 namespace cudf::detail {
@@ -31,15 +31,5 @@
 using pair_type = cuco::pair<hash_value_type, size_type>;
 
 using hash_type = cuco::murmurhash3_32<hash_value_type>;
 
-// Multimap type used for mixed joins. TODO: This is a temporary alias used
-// until the mixed joins are converted to using CGs properly. Right now it's
-// using a cooperative group of size 1.
-using mixed_multimap_type =
-  cuco::static_multimap<hash_value_type,
-                        size_type,
-                        cuda::thread_scope_device,
-                        cudf::detail::cuco_allocator<char>,
-                        cuco::legacy::double_hashing<1, hash_type, hash_type>>;
-
 bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type);
 
 }  // namespace cudf::detail
diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 2109dc8951d..d48c2fe7873 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -16,6 +16,7 @@
 
 #include "join_common_utils.cuh"
 #include "join_common_utils.hpp"
+#include "mixed_join_common_utils.cuh"
 #include "mixed_join_kernel.hpp"
 #include "mixed_join_size_kernel.hpp"
@@ -37,8 +38,9 @@
 #include
 #include
 
-#include
-#include
+#include
+#include
+#include
 
 #include
 #include
@@ -46,19 +48,86 @@
 
 namespace cudf {
 namespace detail {
+namespace {
+
+/**
+ * @brief Precompute input pairs and hash indices for mixed join operations
+ *
+ * Precomputes input pairs and hash indices in a single pass to reduce code duplication
+ * between mixed_join and compute_mixed_join_output_size functions.
+ *
+ * Precomputation reduces register pressure in probing kernels by avoiding expensive
+ * on-the-fly calculations of iterator transforms and hash table indices.
+ *
+ * @tparam HashProbe Type of the device hasher for computing probe keys
+ * @param hash_table Hash table for probing
+ * @param hash_probe Device hasher for probe keys
+ * @param probe_table_num_rows Number of rows in probe table
+ * @param stream CUDA stream
+ * @param mr Memory resource
+ * @return Pair of device vectors: precomputed input pairs and hash indices
+ */
+template <typename HashProbe>
+std::pair<rmm::device_uvector<cuco::pair<hash_value_type, size_type>>,
+          rmm::device_uvector<cuda::std::pair<size_type, size_type>>>
+precompute_mixed_join_data(mixed_join_hash_table_t const& hash_table,
+                           HashProbe const& hash_probe,
+                           size_type probe_table_num_rows,
+                           rmm::cuda_stream_view stream,
+                           rmm::device_async_resource_ref mr)
+{
+  auto input_pairs =
+    rmm::device_uvector<cuco::pair<hash_value_type, size_type>>(probe_table_num_rows, stream, mr);
+  auto hash_indices =
+    rmm::device_uvector<cuda::std::pair<size_type, size_type>>(probe_table_num_rows, stream, mr);
+
+  auto const extent                        = hash_table.capacity();
+  auto const probe_hash_fn                 = hash_table.hash_function();
+  static constexpr std::size_t bucket_size = mixed_join_hash_table_t::bucket_size;
+
+  // Functor to pre-compute both input pairs and initial slots and step sizes for double hashing.
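+  //
+  // With buckets of `bucket_size` slots, the first hash picks the initial
+  // bucket-aligned slot index and the second hash yields a stride that is a
+  // nonzero multiple of the bucket size, so successive probes always move to
+  // a different bucket instead of re-reading the same slots.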
+ auto precompute_fn = [=] __device__(size_type i) { + auto const probe_key = cuco::pair{hash_probe(i), i}; + + // Use the probing scheme's hash functions for proper double hashing + auto const hash1_val = cuda::std::get<0>(probe_hash_fn)(probe_key); + auto const hash2_val = cuda::std::get<1>(probe_hash_fn)(probe_key); + + // Double hashing logic: initial position and step size + auto const init_idx = (hash1_val % (extent / bucket_size)) * bucket_size; + auto const step_val = + ((hash2_val % (extent / bucket_size - std::size_t{1})) + std::size_t{1}) * bucket_size; + + return cuda::std::pair{ + probe_key, + cuda::std::pair{static_cast(init_idx), static_cast(step_val)}}; + }; + + // Single transform to fill both arrays using zip iterator + thrust::transform( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(probe_table_num_rows), + thrust::make_zip_iterator(thrust::make_tuple(input_pairs.begin(), hash_indices.begin())), + precompute_fn); + + return std::make_pair(std::move(input_pairs), std::move(hash_indices)); +} + +} // anonymous namespace + std::pair>, std::unique_ptr>> -mixed_join( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - join_kind join_type, - std::optional>> const& output_size_data, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +mixed_join(table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind join_type, + std::optional const& output_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(left_conditional.num_rows() == left_equality.num_rows(), "The left conditional and equality tables must have the same number of rows."); @@ -72,10 +141,10 @@ mixed_join( auto const left_num_rows{left_conditional.num_rows()}; auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); - // The "outer" table is the larger of the two tables. The kernels are - // launched with one thread per row of the outer table, which also means that - // it is the probe table for the hash - auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; + // The "probe" table is the table we iterate over during the join operation. + // For performance optimization, we choose the larger table as the probe table. + // The kernels are launched with one thread per row of the probe table. + auto const probe_table_num_rows{swap_tables ? right_num_rows : left_num_rows}; // We can immediately filter out cases where the right table is empty. In // some cases, we return all the rows of the left table with a corresponding @@ -129,13 +198,17 @@ mixed_join( auto probe_view = table_device_view::create(probe, stream); auto build_view = table_device_view::create(build, stream); - // Don't use multimap_type because we want a CG size of 1. 
- mixed_multimap_type hash_table{ - compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - stream.value(), - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}; + mixed_join_hash_table_t hash_table{ + cuco::extent{static_cast(build.num_rows())}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, + cuco::empty_key{ + cuco::pair{std::numeric_limits::max(), cudf::detail::JoinNoneValue}}, + {}, + {}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream.value()}, + stream.value()}; // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we @@ -151,24 +224,21 @@ mixed_join( compare_nulls, static_cast(row_bitmask.data()), stream); - auto hash_table_view = hash_table.get_device_view(); + auto hash_table_storage = cudf::device_span>{ + hash_table.data(), hash_table.capacity()}; auto left_conditional_view = table_device_view::create(left_conditional, stream); auto right_conditional_view = table_device_view::create(right_conditional, stream); // For inner joins we support optimizing the join by launching one thread for // whichever table is larger rather than always using the left table. - detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); + detail::grid_1d const config(probe_table_num_rows, DEFAULT_JOIN_BLOCK_SIZE); auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; join_kind const kernel_join_type = join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type; - // If the join size data was not provided as an input, compute it here. + // If the join size was not provided as an input, compute it here. std::size_t join_size; - // Using an optional because we only need to allocate a new vector if one was - // not passed as input, and rmm::device_uvector is not default constructible - std::optional> matches_per_row{}; - device_span matches_per_row_span{}; auto const preprocessed_probe = experimental::row::equality::preprocessed_table::create(probe, stream); @@ -178,50 +248,38 @@ mixed_join( cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - if (output_size_data.has_value()) { - join_size = output_size_data->first; - matches_per_row_span = output_size_data->second; + auto [input_pairs, hash_indices] = + precompute_mixed_join_data(hash_table, hash_probe, probe_table_num_rows, stream, mr); + + if (output_size.has_value()) { + join_size = output_size.value(); } else { - matches_per_row = - rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; - // Note that the view goes out of scope after this else statement, but the - // data owned by matches_per_row stays alive so the data pointer is valid. 
- auto mutable_matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; if (has_nulls) { join_size = launch_compute_mixed_join_output_size(*left_conditional_view, *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, kernel_join_type, - hash_table_view, + equality_probe, + hash_table_storage, + input_pairs.data(), + hash_indices.data(), parser.device_expression_data, swap_tables, - mutable_matches_per_row_span, config, shmem_size_per_block, - stream, - mr); + stream); } else { join_size = launch_compute_mixed_join_output_size(*left_conditional_view, *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, kernel_join_type, - hash_table_view, + equality_probe, + hash_table_storage, + input_pairs.data(), + hash_indices.data(), parser.device_expression_data, swap_tables, - mutable_matches_per_row_span, config, shmem_size_per_block, - stream, - mr); + stream); } } @@ -236,14 +294,6 @@ mixed_join( std::make_unique>(0, stream, mr)); } - // Given the number of matches per row, we need to compute the offsets for insertion. - auto join_result_offsets = - rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; - thrust::exclusive_scan(rmm::exec_policy{stream}, - matches_per_row_span.begin(), - matches_per_row_span.end(), - join_result_offsets.begin()); - auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); @@ -253,16 +303,14 @@ mixed_join( if (has_nulls) { launch_mixed_join(*left_conditional_view, *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, kernel_join_type, - hash_table_view, + equality_probe, + hash_table_storage, + input_pairs.data(), + hash_indices.data(), join_output_l, join_output_r, parser.device_expression_data, - join_result_offsets.data(), swap_tables, config, shmem_size_per_block, @@ -270,16 +318,14 @@ mixed_join( } else { launch_mixed_join(*left_conditional_view, *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, kernel_join_type, - hash_table_view, + equality_probe, + hash_table_storage, + input_pairs.data(), + hash_indices.data(), join_output_l, join_output_r, parser.device_expression_data, - join_result_offsets.data(), swap_tables, config, shmem_size_per_block, @@ -298,16 +344,14 @@ mixed_join( return join_indices; } -std::pair>> -compute_mixed_join_output_size(table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - join_kind join_type, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::size_t compute_mixed_join_output_size(table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind join_type, + rmm::cuda_stream_view stream) { // Until we add logic to handle the number of non-matches in the right table, // full joins are not supported in this function. 
Note that this does not @@ -329,52 +373,37 @@ compute_mixed_join_output_size(table_view const& left_equality, auto const left_num_rows{left_conditional.num_rows()}; auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); - // The "outer" table is the larger of the two tables. The kernels are - // launched with one thread per row of the outer table, which also means that - // it is the probe table for the hash - auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; + // The "probe" table is the table we iterate over during the join operation. + // For performance optimization, we choose the larger table as the probe table. + // The kernels are launched with one thread per row of the probe table. + auto const probe_table_num_rows{swap_tables ? right_num_rows : left_num_rows}; - auto matches_per_row = std::make_unique>( - static_cast(outer_num_rows), stream, mr); - auto matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - - // We can immediately filter out cases where one table is empty. In - // some cases, we return all the rows of the other table with a corresponding - // null index for the empty table; in others, we return an empty output. + // We can immediately filter out cases where one table is empty. if (right_num_rows == 0) { switch (join_type) { - // Left, left anti, and full all return all the row indices from left - // with a corresponding NULL from the right. - case join_kind::LEFT_JOIN: - case join_kind::FULL_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 1); - return {left_num_rows, std::move(matches_per_row)}; + // Left joins return all the row indices from left with a corresponding NULL from the right. + case join_kind::LEFT_JOIN: { + return left_num_rows; } - // Inner and left semi joins return empty output because no matches can exist. + // Inner joins return empty output because no matches can exist. case join_kind::INNER_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 0); - return {0, std::move(matches_per_row)}; + return 0; } default: CUDF_FAIL("Invalid join kind."); break; } } else if (left_num_rows == 0) { switch (join_type) { - // Left, left anti, left semi, and inner joins all return empty sets. + // Left and inner joins all return empty sets. case join_kind::LEFT_JOIN: case join_kind::INNER_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 0); - return {0, std::move(matches_per_row)}; - } - // Full joins need to return the trivial complement. - case join_kind::FULL_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 1); - return {right_num_rows, std::move(matches_per_row)}; + return 0; } default: CUDF_FAIL("Invalid join kind."); break; } } + auto mr = cudf::get_current_device_resource_ref(); + // If evaluating the expression may produce null outputs we create a nullable // output column and follow the null-supporting expression evaluation code // path. @@ -396,19 +425,22 @@ compute_mixed_join_output_size(table_view const& left_equality, auto probe_view = table_device_view::create(probe, stream); auto build_view = table_device_view::create(build, stream); - // Don't use multimap_type because we want a CG size of 1. 
- mixed_multimap_type hash_table{ - compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - stream.value(), - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}; + mixed_join_hash_table_t hash_table{ + cuco::extent{static_cast(build.num_rows())}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, + cuco::empty_key{ + cuco::pair{std::numeric_limits::max(), cudf::detail::JoinNoneValue}}, + {}, + {}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream.value()}, + stream.value()}; // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. - auto const row_bitmask = - cudf::detail::bitmask_and(build, stream, cudf::get_current_device_resource_ref()).first; + auto const row_bitmask = cudf::detail::bitmask_and(build, stream, mr).first; auto const preprocessed_build = experimental::row::equality::preprocessed_table::create(build, stream); build_join_hash_table(build, @@ -418,14 +450,15 @@ compute_mixed_join_output_size(table_view const& left_equality, compare_nulls, static_cast(row_bitmask.data()), stream); - auto hash_table_view = hash_table.get_device_view(); + auto hash_table_storage = cudf::device_span>{ + hash_table.data(), hash_table.capacity()}; auto left_conditional_view = table_device_view::create(left_conditional, stream); auto right_conditional_view = table_device_view::create(right_conditional, stream); // For inner joins we support optimizing the join by launching one thread for // whichever table is larger rather than always using the left table. - detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); + detail::grid_1d const config(probe_table_num_rows, DEFAULT_JOIN_BLOCK_SIZE); auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; auto const preprocessed_probe = @@ -436,60 +469,58 @@ compute_mixed_join_output_size(table_view const& left_equality, cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); + // Precompute input pairs and hash indices using common utility function + auto [input_pairs, hash_indices] = + precompute_mixed_join_data(hash_table, hash_probe, probe_table_num_rows, stream, mr); + // Determine number of output rows without actually building the output to simply // find what the size of the output will be. 
- std::size_t size = 0; - if (has_nulls) { - size = launch_compute_mixed_join_output_size(*left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - matches_per_row_span, - config, - shmem_size_per_block, - stream, - mr); - } else { - size = launch_compute_mixed_join_output_size(*left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - matches_per_row_span, - config, - shmem_size_per_block, - stream, - mr); - } + std::size_t const size = [&]() { + if (has_nulls) { + return launch_compute_mixed_join_output_size(*left_conditional_view, + *right_conditional_view, + join_type, + equality_probe, + hash_table_storage, + input_pairs.data(), + hash_indices.data(), + parser.device_expression_data, + swap_tables, + config, + shmem_size_per_block, + stream); + } else { + return launch_compute_mixed_join_output_size(*left_conditional_view, + *right_conditional_view, + join_type, + equality_probe, + hash_table_storage, + input_pairs.data(), + hash_indices.data(), + parser.device_expression_data, + swap_tables, + config, + shmem_size_per_block, + stream); + } + }(); - return {size, std::move(matches_per_row)}; + return size; } } // namespace detail std::pair>, std::unique_ptr>> -mixed_inner_join( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - std::optional>> const output_size_data, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +mixed_inner_join(table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + std::optional const output_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::mixed_join(left_equality, @@ -499,20 +530,18 @@ mixed_inner_join( binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - output_size_data, + output_size, stream, mr); } -std::pair>> mixed_inner_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::size_t mixed_inner_join_size(table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); return detail::compute_mixed_join_output_size(left_equality, @@ -522,22 +551,20 @@ std::pair>> mixed_in binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - stream, - mr); + stream); } std::pair>, std::unique_ptr>> -mixed_left_join( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - std::optional>> const output_size_data, - rmm::cuda_stream_view stream, - 
rmm::device_async_resource_ref mr) +mixed_left_join(table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + std::optional const output_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::mixed_join(left_equality, @@ -547,20 +574,18 @@ mixed_left_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - output_size_data, + output_size, stream, mr); } -std::pair>> mixed_left_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::size_t mixed_left_join_size(table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); return detail::compute_mixed_join_output_size(left_equality, @@ -570,22 +595,20 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - stream, - mr); + stream); } std::pair>, std::unique_ptr>> -mixed_full_join( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - std::optional>> const output_size_data, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +mixed_full_join(table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls, + std::optional const output_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::mixed_join(left_equality, @@ -595,7 +618,7 @@ mixed_full_join( binary_predicate, compare_nulls, detail::join_kind::FULL_JOIN, - output_size_data, + output_size, stream, mr); } diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh index 4a52cfe098a..cc3babf7e0d 100644 --- a/cpp/src/join/mixed_join_common_utils.cuh +++ b/cpp/src/join/mixed_join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include #include namespace cudf { @@ -38,6 +39,56 @@ using row_hash = using row_equality = cudf::experimental::row::equality::strong_index_comparator_adapter< cudf::experimental::row::equality::device_row_comparator>; +// Comparator that always returns false to ensure all values are inserted (like hash_join) +struct mixed_join_always_not_equal { + __device__ constexpr bool operator()(cuco::pair const&, + cuco::pair const&) const noexcept + { + // multiset always insert + return false; + } +}; + +// hasher1 and hasher2 used for double hashing. The first hash is used to determine the initial slot +// and the second hash is used to determine the step size. 
+//
+// For the first hash, we use the row hash value directly so there is no need to hash it again.
+//
+// For the second hash, we hash the row hash value again to determine the step size.
+struct mixed_join_hasher1 {
+  __device__ constexpr hash_value_type operator()(
+    cuco::pair<hash_value_type, size_type> const& key) const noexcept
+  {
+    return key.first;
+  }
+};
+
+struct mixed_join_hasher2 {
+  mixed_join_hasher2(hash_value_type seed) : _hash{seed} {}
+
+  __device__ constexpr hash_value_type operator()(
+    cuco::pair<hash_value_type, size_type> const& key) const noexcept
+  {
+    return _hash(key.first);
+  }
+
+ private:
+  using hash_type = cuco::murmurhash3_32<hash_value_type>;
+  hash_type _hash;
+};
+
+// Hash table type used for mixed joins
+using mixed_join_hash_table_t =
+  cuco::static_multiset<cuco::pair<hash_value_type, size_type>,
+                        cuco::extent<std::size_t>,
+                        cuda::thread_scope_device,
+                        mixed_join_always_not_equal,
+                        cuco::double_hashing<1, mixed_join_hasher1, mixed_join_hasher2>,
+                        cudf::detail::cuco_allocator<char>,
+                        cuco::storage<2>>;
+template <typename... Operators>
+using mixed_join_hash_table_ref_t = mixed_join_hash_table_t::ref_type<Operators...>;
 
 /**
  * @brief Equality comparator for use with cuco map methods that require expression evaluation.
 *
@@ -77,17 +128,8 @@ template <bool has_nulls>
 struct single_expression_equality : expression_equality<has_nulls> {
   using expression_equality<has_nulls>::expression_equality;
 
-  // The parameters are build/probe rather than left/right because the operator
-  // is called by cuco's kernels with parameters in this order (note that this
-  // is an implementation detail that we should eventually stop relying on by
-  // defining operators with suitable heterogeneous typing). Rather than
-  // converting to left/right semantics, we can operate directly on build/probe
-  // until we get to the expression evaluator, which needs to convert back to
-  // left/right semantics because the conditional expression need not be
-  // commutative.
-  // TODO: The input types should really be size_type.
-  __device__ __forceinline__ bool operator()(hash_value_type const build_row_index,
-                                             hash_value_type const probe_row_index) const noexcept
+  __device__ __forceinline__ bool operator()(size_type const left_index,
+                                             size_type const right_index) const noexcept
   {
     using cudf::experimental::row::lhs_index_type;
     using cudf::experimental::row::rhs_index_type;
@@ -97,12 +139,13 @@ struct single_expression_equality : expression_equality<has_nulls> {
     // 1. The contents of the columns involved in the equality condition are equal.
     // 2. The predicate evaluated on the relevant columns (already encoded in the evaluator)
     // evaluates to true.
-    if (this->equality_probe(lhs_index_type{probe_row_index}, rhs_index_type{build_row_index})) {
-      auto const lrow_idx = this->swap_tables ? build_row_index : probe_row_index;
-      auto const rrow_idx = this->swap_tables ? probe_row_index : build_row_index;
+    if (this->equality_probe(lhs_index_type{left_index}, rhs_index_type{right_index})) {
+      // For the AST evaluator, we need to map back to left/right table semantics
+      auto const left_table_idx  = this->swap_tables ? right_index : left_index;
+      auto const right_table_idx = this->swap_tables ?
left_index : right_index; this->evaluator.evaluate(output_dest, - static_cast(lrow_idx), - static_cast(rrow_idx), + static_cast(left_table_idx), + static_cast(right_table_idx), 0, this->thread_intermediate_storage); return (output_dest.is_valid() && output_dest.value()); @@ -129,16 +172,8 @@ template struct pair_expression_equality : public expression_equality { using expression_equality::expression_equality; - // The parameters are build/probe rather than left/right because the operator - // is called by cuco's kernels with parameters in this order (note that this - // is an implementation detail that we should eventually stop relying on by - // defining operators with suitable heterogeneous typing). Rather than - // converting to left/right semantics, we can operate directly on build/probe - // until we get to the expression evaluator, which needs to convert back to - // left/right semantics because the conditional expression need not be - // commutative. - __device__ __forceinline__ bool operator()(pair_type const& build_row, - pair_type const& probe_row) const noexcept + __device__ __forceinline__ bool operator()(pair_type const& left_row, + pair_type const& right_row) const noexcept { using cudf::experimental::row::lhs_index_type; using cudf::experimental::row::rhs_index_type; @@ -149,10 +184,10 @@ struct pair_expression_equality : public expression_equality { // 2. The contents of the columns involved in the equality condition are equal. // 3. The predicate evaluated on the relevant columns (already encoded in the evaluator) // evaluates to true. - if ((probe_row.first == build_row.first) && - this->equality_probe(lhs_index_type{probe_row.second}, rhs_index_type{build_row.second})) { - auto const lrow_idx = this->swap_tables ? build_row.second : probe_row.second; - auto const rrow_idx = this->swap_tables ? probe_row.second : build_row.second; + if ((left_row.first == right_row.first) && + this->equality_probe(lhs_index_type{left_row.second}, rhs_index_type{right_row.second})) { + auto const lrow_idx = this->swap_tables ? right_row.second : left_row.second; + auto const rrow_idx = this->swap_tables ? left_row.second : right_row.second; this->evaluator.evaluate( output_dest, lrow_idx, rrow_idx, 0, this->thread_intermediate_storage); return (output_dest.is_valid() && output_dest.value()); diff --git a/cpp/src/join/mixed_join_kernel.cu b/cpp/src/join/mixed_join_kernel.cu index cd4016837cc..307a7cb5035 100644 --- a/cpp/src/join/mixed_join_kernel.cu +++ b/cpp/src/join/mixed_join_kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,27 +17,22 @@ #include "mixed_join_kernel.cuh" #include "mixed_join_kernel.hpp" -namespace cudf { -namespace detail { +namespace cudf::detail { template void launch_mixed_join( table_device_view left_table, table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, + cudf::size_type* join_output_l, + cudf::size_type* join_output_r, cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables, - detail::grid_1d const config, + bool swap_tables, + detail::grid_1d const& config, int64_t shmem_size_per_block, rmm::cuda_stream_view stream); -} // namespace detail - -} // namespace cudf +} // namespace cudf::detail diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 830c783f2e1..52659b316c5 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -26,40 +26,185 @@ #include #include #include +#include #include #include -#include -#include -#include +#include -namespace cudf { -namespace detail { +namespace cudf::detail { -namespace cg = cooperative_groups; +/** + * @brief Optimized standalone retrieve implementation for hash table probing + * + * This implementation uses precomputed hash indices and storage references + * for efficient mixed join operations with minimal overhead. + * + * @tparam is_outer Boolean flag indicating whether outer join semantics should be used + */ +template +__device__ __forceinline__ void retrieve( + cooperative_groups::thread_block const& block, + cudf::device_span> hash_table_storage, + pair_expression_equality const& key_equal, + cuco::pair const* input_probe_begin, + cuco::pair const* input_probe_end, + cuda::std::pair const* input_hash_begin, + cudf::size_type* output_probe, + cudf::size_type* output_match, + cuda::atomic& atomic_counter) noexcept +{ + static constexpr auto bucket_size = 2; + static constexpr auto block_size = DEFAULT_JOIN_BLOCK_SIZE; + namespace cg = cooperative_groups; + + auto const n = cuda::std::distance(input_probe_begin, input_probe_end); + + // Use warps to efficiently flush shared memory buffers when they get full + // Each warp manages its own buffer segment to avoid conflicts + auto constexpr num_warps = block_size / warp_size; + auto constexpr max_matches_per_step = warp_size * bucket_size; + auto constexpr buffer_size = max_matches_per_step + warp_size; + + auto const warp = cg::tiled_partition(block); + auto const warp_id = warp.meta_group_rank(); + auto const stride = block_size; + auto idx = threadIdx.x; + + __shared__ cudf::size_type probe_output_buffer[num_warps][buffer_size]; + __shared__ cudf::size_type match_output_buffer[num_warps][buffer_size]; + __shared__ cudf::size_type warp_counter[num_warps]; + + if (warp.thread_rank() == 0) { + cuda::atomic_ref init_counter_ref{ + warp_counter[warp_id]}; + init_counter_ref.store(0, cuda::memory_order_relaxed); + } + warp.sync(); + + auto flush_output_buffer = [&](auto const& warp_group) { + size_type offset = 0; + auto const count = warp_counter[warp_id]; + auto const rank = warp_group.thread_rank(); + if (rank == 0) { offset = 
atomic_counter.fetch_add(count, cuda::memory_order_relaxed); } + offset = warp_group.shfl(offset, 0); + + for (auto i = rank; i < count; i += warp_group.size()) { + *(output_probe + offset + i) = probe_output_buffer[warp_id][i]; + *(output_match + offset + i) = match_output_buffer[warp_id][i]; + } + }; + + while (warp.any(idx < n)) { + bool active_flag = idx < n; + auto const active_warp = cg::binary_partition(warp, active_flag); + + if (active_flag) { + auto const& probe_key = *(input_probe_begin + idx); + auto const& hash_idx = *(input_hash_begin + idx); + + auto const extent = hash_table_storage.size(); + auto current_slot_idx = static_cast(hash_idx.first); + auto const step = static_cast(hash_idx.second); + + bool running = true; + [[maybe_unused]] bool found_match = false; + + while (active_warp.any(running)) { + if (running) { + auto const bucket_slots = *reinterpret_cast< + cuda::std::array, 2> const*>( + hash_table_storage.data() + current_slot_idx); + + auto const first_slot_is_empty = bucket_slots[0].second == cudf::detail::JoinNoneValue; + auto const second_slot_is_empty = bucket_slots[1].second == cudf::detail::JoinNoneValue; + auto const first_equals = + (not first_slot_is_empty and key_equal(probe_key, bucket_slots[0])); + auto const second_equals = + (not second_slot_is_empty and key_equal(probe_key, bucket_slots[1])); -template -CUDF_KERNEL void __launch_bounds__(block_size) + if (first_equals or second_equals) { + if constexpr (is_outer) { found_match = true; } + + cudf::size_type num_matches = (first_equals ? 1 : 0) + (second_equals ? 1 : 0); + cuda::atomic_ref counter_ref{ + warp_counter[warp_id]}; + cudf::size_type output_idx = + counter_ref.fetch_add(num_matches, cuda::memory_order_relaxed); + + auto const probe_row_index = probe_key.second; + + if (first_equals) { + probe_output_buffer[warp_id][output_idx] = probe_row_index; + match_output_buffer[warp_id][output_idx] = bucket_slots[0].second; + if (second_equals) { + probe_output_buffer[warp_id][output_idx + 1] = probe_row_index; + match_output_buffer[warp_id][output_idx + 1] = bucket_slots[1].second; + } + } else if (second_equals) { + probe_output_buffer[warp_id][output_idx] = probe_row_index; + match_output_buffer[warp_id][output_idx] = bucket_slots[1].second; + } + } + + if (first_slot_is_empty or second_slot_is_empty) { + running = false; + + if constexpr (is_outer) { + if (not found_match) { + cuda::atomic_ref counter_ref{ + warp_counter[warp_id]}; + auto const output_idx = counter_ref.fetch_add(1, cuda::memory_order_relaxed); + auto const probe_row_index = probe_key.second; + probe_output_buffer[warp_id][output_idx] = probe_row_index; + match_output_buffer[warp_id][output_idx] = cudf::detail::JoinNoneValue; + } + } + } + } + + active_warp.sync(); + // Check if warp's shared memory buffer is getting full and needs flushing + if (warp_counter[warp_id] > (buffer_size - max_matches_per_step)) { + flush_output_buffer(active_warp); + active_warp.sync(); + + if (active_warp.thread_rank() == 0) { warp_counter[warp_id] = 0; } + active_warp.sync(); + } + + current_slot_idx = (current_slot_idx + step) % extent; + if (current_slot_idx == static_cast(hash_idx.first)) { running = false; } + } + } + + warp.sync(); + idx += stride; + } + + warp.sync(); + // Final flush: ensure any remaining buffered matches are written to global memory + cuda::atomic_ref final_counter_ref{ + warp_counter[warp_id]}; + if (final_counter_ref.load(cuda::memory_order_relaxed) > 0) { flush_output_buffer(warp); } +} + +template +CUDF_KERNEL void 
__launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) mixed_join(table_device_view left_table, table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, size_type* join_output_l, size_type* join_output_r, cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables) + bool swap_tables) { - // Normally the casting of a shared memory array is used to create multiple - // arrays of different types from the shared memory buffer, but here it is - // used to circumvent conflicts between arrays of different types between - // different template instantiations due to the extern specifier. extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = + auto intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; @@ -68,84 +213,75 @@ CUDF_KERNEL void __launch_bounds__(block_size) cudf::size_type const right_num_rows = right_table.num_rows(); auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); - cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; + auto const evaluator = cudf::ast::detail::expression_evaluator{ + left_table, right_table, device_expression_data}; + auto const equality = pair_expression_equality{ + evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); + namespace cg = cooperative_groups; - auto const empty_key_sentinel = hash_table_view.get_empty_key_sentinel(); - make_pair_function pair_func{hash_probe, empty_key_sentinel}; + auto const block = cg::this_thread_block(); + cuda::atomic counter_ref{0}; - if (outer_row_index < outer_num_rows) { - // Figure out the number of elements for this key. - cg::thread_block_tile<1> this_thread = cg::this_thread(); - // Figure out the number of elements for this key. - auto query_pair = pair_func(outer_row_index); - auto equality = pair_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - - auto probe_key_begin = thrust::make_discard_iterator(); - auto probe_value_begin = swap_tables ? join_output_r + join_result_offsets[outer_row_index] - : join_output_l + join_result_offsets[outer_row_index]; - auto contained_key_begin = thrust::make_discard_iterator(); - auto contained_value_begin = swap_tables ? 
join_output_l + join_result_offsets[outer_row_index] - : join_output_r + join_result_offsets[outer_row_index]; + auto const block_begin_offset = block.group_index().x * DEFAULT_JOIN_BLOCK_SIZE; + auto const block_end_offset = cuda::std::min( + outer_num_rows, static_cast(block_begin_offset + DEFAULT_JOIN_BLOCK_SIZE)); + if (block_begin_offset < block_end_offset) { if (join_type == join_kind::LEFT_JOIN || join_type == join_kind::FULL_JOIN) { - hash_table_view.pair_retrieve_outer(this_thread, - query_pair, - probe_key_begin, - probe_value_begin, - contained_key_begin, - contained_value_begin, - equality); + retrieve(block, + hash_table_storage, + equality, + input_pairs + block_begin_offset, + input_pairs + block_end_offset, + hash_indices + block_begin_offset, + swap_tables ? join_output_r : join_output_l, + swap_tables ? join_output_l : join_output_r, + counter_ref); } else { - hash_table_view.pair_retrieve(this_thread, - query_pair, - probe_key_begin, - probe_value_begin, - contained_key_begin, - contained_value_begin, - equality); + retrieve(block, + hash_table_storage, + equality, + input_pairs + block_begin_offset, + input_pairs + block_end_offset, + hash_indices + block_begin_offset, + swap_tables ? join_output_r : join_output_l, + swap_tables ? join_output_l : join_output_r, + counter_ref); } } } template -void launch_mixed_join(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables, - detail::grid_1d const config, - int64_t shmem_size_per_block, - rmm::cuda_stream_view stream) +void launch_mixed_join( + table_device_view left_table, + table_device_view right_table, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, + size_type* join_output_l, + size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + bool swap_tables, + detail::grid_1d const& config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream) { - mixed_join + mixed_join <<>>( left_table, right_table, - probe, - build, - hash_probe, - equality_probe, join_type, - hash_table_view, + equality_probe, + hash_table_storage, + input_pairs, + hash_indices, join_output_l, join_output_r, device_expression_data, - join_result_offsets, swap_tables); } -} // namespace detail - -} // namespace cudf +} // namespace cudf::detail diff --git a/cpp/src/join/mixed_join_kernel.hpp b/cpp/src/join/mixed_join_kernel.hpp index 8187ad03aca..e72d9370db9 100644 --- a/cpp/src/join/mixed_join_kernel.hpp +++ b/cpp/src/join/mixed_join_kernel.hpp @@ -16,66 +16,65 @@ #pragma once -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_common_utils.cuh" #include #include #include #include +#include + +#include + namespace CUDF_EXPORT cudf { namespace detail { /** - * @brief Performs a join using the combination of a hash lookup to identify - * equal rows between one pair of tables and the evaluation of an expression - * containing an arbitrary expression. 
+ * @brief Performs a mixed join using hash lookup and expression evaluation. * * This method probes the hash table with each row in the probe table using a * custom equality comparator that also checks that the conditional expression * evaluates to true between the left/right tables when a match is found * between probe and build rows. * - * @tparam block_size The number of threads per block for this kernel * @tparam has_nulls Whether or not the inputs may contain nulls. * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[out] join_output_l The left result of the join operation - * @param[out] join_output_r The right result of the join operation - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. - * @param[in] join_result_offsets The starting indices in join_output[l|r] - * where the matches for each row begin. Equivalent to a prefix sum of - * matches_per_row. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. + * @param left_table The left table + * @param right_table The right table + * @param join_type The type of join to be performed + * @param equality_probe The equality comparator used when probing the hash table + * @param hash_table_storage The hash table storage for probing operations + * @param input_pairs Array of hash-value/row-index pairs for probing + * @param hash_indices Array of hash index pairs for efficient lookup + * @param join_output_l The left result of the join operation + * @param join_output_r The right result of the join operation + * @param device_expression_data Container of device data required to evaluate the desired + * expression + * @param swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. 
Otherwise, loop over right rows + * @param config Grid configuration for kernel launch + * @param shmem_size_per_block Shared memory size per block in bytes + * @param stream CUDA stream used for device memory operations and kernel launches */ template -void launch_mixed_join(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables, - detail::grid_1d const config, - int64_t shmem_size_per_block, - rmm::cuda_stream_view stream); +void launch_mixed_join( + cudf::table_device_view left_table, + cudf::table_device_view right_table, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, + cudf::size_type* join_output_l, + cudf::size_type* join_output_r, + cudf::ast::detail::expression_device_view device_expression_data, + bool swap_tables, + detail::grid_1d const& config, + int64_t shmem_size_per_block, + rmm::cuda_stream_view stream); } // namespace detail - } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/join/mixed_join_kernel_nulls.cu b/cpp/src/join/mixed_join_kernel_nulls.cu index 185aa133f2d..a45781f2588 100644 --- a/cpp/src/join/mixed_join_kernel_nulls.cu +++ b/cpp/src/join/mixed_join_kernel_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
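[Editor's note] To make the `input_pairs`/`hash_indices` contract documented above concrete, here is a minimal host-side sketch of the probe walk the new kernels perform: start at the precomputed initial slot index, scan a two-slot bucket, advance by the precomputed step modulo the table extent, and stop at the first empty slot or after a full cycle. All names here (`slot`, `count_matches`, the toy `main`) are illustrative stand-ins, not the patch's actual device code, which additionally evaluates the AST predicate inside its key-equality functor.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Illustrative stand-ins; assumptions for this sketch, not cudf's real types.
using size_type = int32_t;
constexpr size_type JoinNoneValue = -1;       // empty-slot sentinel (row-index side)
using slot = std::pair<uint32_t, size_type>;  // (hash value, build-table row index)
constexpr int bucket_size = 2;                // mirrors the kernel's two-slot buckets

// Count matches for one probe key from its precomputed (initial index, step) pair.
// A plain hash comparison stands in for the kernel's expression-aware key_equal.
size_type count_matches(std::vector<slot> const& storage,
                        slot const& probe_key,
                        std::pair<size_type, size_type> const& hash_idx)
{
  auto const extent = static_cast<size_type>(storage.size());
  auto idx          = hash_idx.first;
  size_type count   = 0;
  while (true) {
    bool saw_empty = false;
    for (int i = 0; i < bucket_size; ++i) {
      auto const& s = storage[(idx + i) % extent];
      if (s.second == JoinNoneValue) {
        saw_empty = true;
      } else if (s.first == probe_key.first) {
        ++count;
      }
    }
    if (saw_empty) { return count; }              // an empty slot ends the probe chain
    idx = (idx + hash_idx.second) % extent;       // advance by the precomputed step
    if (idx == hash_idx.first) { return count; }  // wrapped a full cycle: table exhausted
  }
}

int main()
{
  std::vector<slot> table(8, {0u, JoinNoneValue});  // 8 slots, all empty
  table[4] = {42u, 3};                              // one entry: hash 42 -> build row 3
  std::printf("matches: %d\n", count_matches(table, {42u, 7}, {4, 2}));  // prints 1
  return 0;
}

The outer-join variants layer one rule on top of this walk, as `standalone_count` below shows: a probe row that ends the walk with zero matches still contributes one output row.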
@@ -17,27 +17,22 @@ #include "mixed_join_kernel.cuh" #include "mixed_join_kernel.hpp" -namespace cudf { -namespace detail { +namespace cudf::detail { template void launch_mixed_join( table_device_view left_table, table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - size_type* join_output_l, - size_type* join_output_r, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, + cudf::size_type* join_output_l, + cudf::size_type* join_output_r, cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables, - detail::grid_1d const config, + bool swap_tables, + detail::grid_1d const& config, int64_t shmem_size_per_block, rmm::cuda_stream_view stream); -} // namespace detail - -} // namespace cudf +} // namespace cudf::detail diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 3cf081e5ded..69bf8fa1870 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -51,8 +51,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto const evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); - // Make sure to swap_tables here as hash_set will use probe table as the left one - auto constexpr swap_tables = true; + // The cuco API passes parameters in the same (left, right) order we use here, + // so no swapping needed + auto constexpr swap_tables = false; auto const equality = single_expression_equality{ evaluator, thread_intermediate_storage, swap_tables, equality_probe}; diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 4926c6b2792..3a19f534f3b 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -123,7 +123,7 @@ std::unique_ptr> mixed_join_semi( auto const preprocessed_probe = cudf::experimental::row::equality::preprocessed_table::create(probe, stream); auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe}; + cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); // Create hash table containing all keys found in right table diff --git a/cpp/src/join/mixed_join_size_kernel.cu b/cpp/src/join/mixed_join_size_kernel.cu index 4882c8769e6..bce24655bae 100644 --- a/cpp/src/join/mixed_join_size_kernel.cu +++ b/cpp/src/join/mixed_join_size_kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
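[Editor's note] The one-line `two_table_comparator` fix above is easy to misread: the comparator is asymmetric, binding the left index of `equal_to` to the first table it was constructed with and the right index to the second, so constructing it as `{build, probe}` silently compared the wrong rows. A minimal self-contained sketch of that failure mode follows; `two_table_equal` is a toy comparator invented for illustration, not cudf's actual row-equality machinery.

#include <cassert>
#include <vector>

// Toy asymmetric two-table comparator: lhs_row indexes the first table it was
// constructed with, rhs_row the second (assumed to mirror cudf's convention).
struct two_table_equal {
  std::vector<int> const& lhs_table;
  std::vector<int> const& rhs_table;
  bool operator()(int lhs_row, int rhs_row) const
  {
    return lhs_table[lhs_row] == rhs_table[rhs_row];
  }
};

int main()
{
  std::vector<int> probe = {1, 2};
  std::vector<int> build = {2, 3};

  two_table_equal probe_first{probe, build};  // the ordering the fix installs
  two_table_equal build_first{build, probe};  // the previous, incorrect ordering

  // The same (lhs_row, rhs_row) index pair selects different physical rows
  // depending on construction order, so the results diverge:
  assert(!probe_first(0, 1));  // probe[0]=1 vs build[1]=3 -> no match
  assert(build_first(0, 1));   // build[0]=2 vs probe[1]=2 -> spurious match
  return 0;
}

The same reasoning explains the `swap_tables = false` change in `mixed_join_kernels_semi.cu`: once the comparator and the cuco probe both take arguments in (left, right) order, an extra swap layer only reintroduces the asymmetry.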
@@ -23,19 +23,16 @@ namespace detail { template std::size_t launch_compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - cudf::device_span matches_per_row, - detail::grid_1d const config, + bool swap_tables, + detail::grid_1d const& config, int64_t shmem_size_per_block, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream); } // namespace detail } // namespace cudf diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 07d9bee4aff..51426baed2e 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -29,79 +29,127 @@ #include #include -#include -#include -#include - -namespace cudf { -namespace detail { -namespace cg = cooperative_groups; - -template -CUDF_KERNEL void __launch_bounds__(block_size) - compute_mixed_join_output_size(table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +#include + +namespace cudf::detail { + +/** + * @brief Standalone count implementation using precomputed hash indices + * + * This implementation provides essential count functionality for mixed joins + * using precomputed probe indices and step sizes. 
+ */ +template +__device__ __forceinline__ auto standalone_count( + pair_expression_equality const& key_equal, + cudf::device_span> hash_table_storage, + cuco::pair const& probe_key, + cuda::std::pair const& hash_idx, + join_kind join_type) noexcept +{ + cudf::size_type count = 0; + auto const extent = hash_table_storage.size(); + auto const* data = hash_table_storage.data(); + auto probe_idx = static_cast(hash_idx.first); // initial probe index + auto const step = static_cast(hash_idx.second); // step size + + while (true) { + auto const bucket_slots = + *reinterpret_cast, 2> const*>( + data + probe_idx); + + // Check for empty slots and key equality + auto const first_slot_is_empty = bucket_slots[0].second == cudf::detail::JoinNoneValue; + auto const second_slot_is_empty = bucket_slots[1].second == cudf::detail::JoinNoneValue; + auto const first_slot_equals = + (not first_slot_is_empty and key_equal(probe_key, bucket_slots[0])); + auto const second_slot_equals = + (not second_slot_is_empty and key_equal(probe_key, bucket_slots[1])); + + count += (first_slot_equals + second_slot_equals); + + // Exit if we find an empty slot + if (first_slot_is_empty or second_slot_is_empty) { + // Handle outer join logic: non-matching rows are counted as 1 match + if ((join_type == join_kind::LEFT_JOIN || join_type == join_kind::FULL_JOIN) && count == 0) { + return 1; + } + return count; + } + + // Move to next bucket using precomputed step + probe_idx = (probe_idx + step) % extent; + + // Detect full cycle completion + if (probe_idx == static_cast(hash_idx.first)) { + // Handle outer join logic: non-matching rows are counted as 1 match + if ((join_type == join_kind::LEFT_JOIN || join_type == join_kind::FULL_JOIN) && count == 0) { + return 1; + } + return count; + } + } +} + +template +CUDF_KERNEL void __launch_bounds__(DEFAULT_JOIN_BLOCK_SIZE) compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, + ast::detail::expression_device_view device_expression_data, + bool swap_tables, + size_t* d_total_count) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. The easiest // workaround is to declare an arbitrary (here char) array type then cast it // after the fact to the appropriate type. extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = + auto intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); auto thread_intermediate_storage = intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates); - std::size_t thread_counter{0}; + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + auto const start_idx = cudf::detail::grid_1d::global_thread_id(); auto const stride = cudf::detail::grid_1d::grid_stride(); cudf::size_type const left_num_rows = left_table.num_rows(); cudf::size_type const right_num_rows = right_table.num_rows(); auto const outer_num_rows = (swap_tables ? 
right_num_rows : left_num_rows); - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); - - auto const empty_key_sentinel = hash_table_view.get_empty_key_sentinel(); - make_pair_function pair_func{hash_probe, empty_key_sentinel}; + auto const evaluator = cudf::ast::detail::expression_evaluator{ + left_table, right_table, device_expression_data}; // Figure out the number of elements for this key. - cg::thread_block_tile<1> this_thread = cg::this_thread(); // TODO: Address asymmetry in operator. auto count_equality = pair_expression_equality{ evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + // Thread-local count + size_t per_thread_count = 0; + for (auto outer_row_index = start_idx; outer_row_index < outer_num_rows; outer_row_index += stride) { - auto query_pair = pair_func(outer_row_index); - if (join_type == join_kind::LEFT_JOIN || join_type == join_kind::FULL_JOIN) { - matches_per_row[outer_row_index] = - hash_table_view.pair_count_outer(this_thread, query_pair, count_equality); - } else { - matches_per_row[outer_row_index] = - hash_table_view.pair_count(this_thread, query_pair, count_equality); - } - thread_counter += matches_per_row[outer_row_index]; + auto const& probe_key = input_pairs[outer_row_index]; + auto const& hash_idx = hash_indices[outer_row_index]; + + auto match_count = + standalone_count(count_equality, hash_table_storage, probe_key, hash_idx, join_type); + + per_thread_count += match_count; } - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); + size_t per_block_count = BlockReduce(temp_storage).Sum(per_thread_count); - // Add block counter to global counter if (threadIdx.x == 0) { - cuda::atomic_ref ref{*output_size}; - ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); + cuda::atomic_ref counter_ref(*d_total_count); + counter_ref.fetch_add(per_block_count, cuda::memory_order_relaxed); } } @@ -109,39 +157,34 @@ template std::size_t launch_compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - cudf::device_span matches_per_row, - detail::grid_1d const config, + bool swap_tables, + detail::grid_1d const& config, int64_t shmem_size_per_block, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { - // Allocate storage for the counter used to get the size of the join output - cudf::detail::device_scalar size(0, stream, mr); + cudf::detail::device_scalar d_total_count{ + 0, stream, cudf::get_current_device_resource_ref()}; - compute_mixed_join_output_size + compute_mixed_join_output_size <<>>( left_table, right_table, - probe, - build, - hash_probe, - equality_probe, join_type, - hash_table_view, + equality_probe, + hash_table_storage, + input_pairs, + hash_indices, device_expression_data, swap_tables, - size.data(), - matches_per_row); - return size.value(stream); + d_total_count.data()); + + return 
d_total_count.value(stream); } -} // namespace detail -} // namespace cudf +} // namespace cudf::detail diff --git a/cpp/src/join/mixed_join_size_kernel.hpp b/cpp/src/join/mixed_join_size_kernel.hpp index 776229f11c1..d48239a3e13 100644 --- a/cpp/src/join/mixed_join_size_kernel.hpp +++ b/cpp/src/join/mixed_join_size_kernel.hpp @@ -46,46 +46,41 @@ namespace detail { * evaluates to true between the left/right tables when a match is found * between probe and build rows. * - * @tparam block_size The number of threads per block for this kernel + * Uses the current device memory resource for internal allocations. + * * @tparam has_nulls Whether or not the inputs may contain nulls. * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - * @param[out] output_size The resulting output size - * @param[out] matches_per_row The number of matches in one pair of - * equality/conditional tables for each row in the other pair of tables. If - * swap_tables is true, matches_per_row corresponds to the right_table, - * otherwise it corresponds to the left_table. Note that corresponding swap of - * left/right tables to determine which is the build table and which is the - * probe table has already happened on the host. + * @param left_table The left table + * @param right_table The right table + * @param join_type The type of join to be performed + * @param equality_probe The equality comparator used when probing the hash table + * @param hash_table_storage The hash table storage for probing operations + * @param input_pairs Array of hash-value/row-index pairs for probing + * @param hash_indices Array of hash index pairs for efficient lookup + * @param device_expression_data Container of device data required to evaluate the desired + * expression + * @param swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. 
Otherwise, loop over right rows + * @param config Grid configuration for kernel launch + * @param shmem_size_per_block Shared memory size per block in bytes + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return The resulting output size */ - template std::size_t launch_compute_mixed_join_output_size( - cudf::table_device_view left_table, - cudf::table_device_view right_table, - cudf::table_device_view probe, - cudf::table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, + table_device_view left_table, + table_device_view right_table, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - cudf::device_span matches_per_row, - detail::grid_1d const config, + bool swap_tables, + detail::grid_1d const& config, int64_t shmem_size_per_block, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream); + } // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/join/mixed_join_size_kernel_nulls.cu b/cpp/src/join/mixed_join_size_kernel_nulls.cu index 11f9103da4d..66b0439cd42 100644 --- a/cpp/src/join/mixed_join_size_kernel_nulls.cu +++ b/cpp/src/join/mixed_join_size_kernel_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,18 +22,16 @@ namespace detail { template std::size_t launch_compute_mixed_join_output_size( table_device_view left_table, table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, + join_kind join_type, + row_equality equality_probe, + cudf::device_span> hash_table_storage, + cuco::pair const* input_pairs, + cuda::std::pair const* hash_indices, ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - cudf::device_span matches_per_row, - detail::grid_1d const config, + bool swap_tables, + detail::grid_1d const& config, int64_t shmem_size_per_block, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream); + } // namespace detail } // namespace cudf diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index ef32d365425..f334aff89e9 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -210,19 +210,12 @@ struct MixedJoinPairReturnTest : public MixedJoinTest { std::vector> expected_outputs, cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) { - auto [result_size, actual_counts] = this->join_size( + auto result_size = this->join_size( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); EXPECT_TRUE(result_size == expected_outputs.size()); - cudf::test::fixed_width_column_wrapper expected_counts_cw( - expected_counts.begin(), expected_counts.end()); - auto const actual_counts_view = - cudf::column_view(cudf::data_type{cudf::type_to_id()}, - actual_counts->size(), 
- actual_counts->data(), - nullptr, - 0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_counts_cw, actual_counts_view); + // Since we no longer get per-row counts, we can't verify them. + // Instead just verify that the total count matches auto result = this->join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); @@ -325,13 +318,12 @@ struct MixedJoinPairReturnTest : public MixedJoinTest { * It should be a simply forwarding of arguments to the appropriate cudf * mixed join size computation API. */ - virtual std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; + virtual std::size_t join_size(cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; }; /** @@ -350,13 +342,12 @@ struct MixedInnerJoinTest : public MixedJoinPairReturnTest { left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); } - std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override + std::size_t join_size(cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override { return cudf::mixed_inner_join_size( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); @@ -541,13 +532,12 @@ struct MixedLeftJoinTest : public MixedJoinPairReturnTest { left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); } - std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override + std::size_t join_size(cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override { return cudf::mixed_left_join_size( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); @@ -608,13 +598,12 @@ struct MixedFullJoinTest : public MixedJoinPairReturnTest { left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); } - std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override + std::size_t join_size(cudf::table_view left_equality, + cudf::table_view right_equality, + cudf::table_view left_conditional, + cudf::table_view right_conditional, + cudf::ast::operation predicate, + cudf::null_equality compare_nulls = 
cudf::null_equality::EQUAL) override { // Full joins don't actually support size calculations, and there's no easy way to spoof it. CUDF_FAIL("Size calculation not supported for full joins."); diff --git a/java/src/main/java/ai/rapids/cudf/MixedJoinSize.java b/java/src/main/java/ai/rapids/cudf/MixedJoinSize.java deleted file mode 100644 index 811f0b9a0b0..00000000000 --- a/java/src/main/java/ai/rapids/cudf/MixedJoinSize.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package ai.rapids.cudf; - -/** This class tracks size information associated with a mixed table join. */ -public final class MixedJoinSize implements AutoCloseable { - private final long outputRowCount; - // This is in flux, avoid exposing publicly until the dust settles. - private ColumnVector matches; - - MixedJoinSize(long outputRowCount, ColumnVector matches) { - this.outputRowCount = outputRowCount; - this.matches = matches; - } - - /** Return the number of output rows that would be generated from the mixed join */ - public long getOutputRowCount() { - return outputRowCount; - } - - ColumnVector getMatches() { - return matches; - } - - @Override - public synchronized void close() { - matches.close(); - } -} diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 422989143c7..229e04754c5 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -736,31 +736,31 @@ private static native long[] conditionalLeftAntiJoinGatherMapWithCount(long left long condition, long rowCount) throws CudfException; - private static native long[] mixedLeftJoinSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual); + private static native long mixedLeftJoinRowCount(long leftKeysTable, long rightKeysTable, + long leftConditionTable, long rightConditionTable, + long condition, boolean compareNullsEqual); private static native long[] mixedLeftJoinGatherMaps(long leftKeysTable, long rightKeysTable, long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftJoinGatherMapsWithSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual, - long outputRowCount, long matchesColumnView); + private static native long[] mixedLeftJoinGatherMapsWithCount(long leftKeysTable, long rightKeysTable, + long leftConditionTable, long rightConditionTable, + long condition, boolean compareNullsEqual, + long outputRowCount); - private static native long[] mixedInnerJoinSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual); + private static native long mixedInnerJoinRowCount(long leftKeysTable, long rightKeysTable, + long 
leftConditionTable, long rightConditionTable, + long condition, boolean compareNullsEqual); private static native long[] mixedInnerJoinGatherMaps(long leftKeysTable, long rightKeysTable, long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedInnerJoinGatherMapsWithSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual, - long outputRowCount, long matchesColumnView); + private static native long[] mixedInnerJoinGatherMapsWithCount(long leftKeysTable, long rightKeysTable, + long leftConditionTable, long rightConditionTable, + long condition, boolean compareNullsEqual, + long outputRowCount); private static native long[] mixedFullJoinGatherMaps(long leftKeysTable, long rightKeysTable, long leftConditionTable, long rightConditionTable, @@ -3094,31 +3094,26 @@ public GatherMap[] conditionalLeftJoinGatherMaps(Table rightTable, } /** - * Computes output size information for a left join between two tables using a mix of equality - * and inequality conditions. The entire join condition is assumed to be a logical AND of the - * equality condition and inequality condition. - * NOTE: It is the responsibility of the caller to close the resulting size information object - * or native resources can be leaked! + * Computes the number of rows resulting from a left join between two tables using a mix of + * equality and inequality conditions. The entire join condition is assumed to be a logical AND + * of the equality condition and inequality condition. + * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition * @param rightConditional the right table's columns needed to evaluate the inequality condition * @param condition the inequality condition of the join * @param nullEquality whether nulls should compare as equal - * @return size information for the join + * @return row count of the join result */ - public static MixedJoinSize mixedLeftJoinSize(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality) { - long[] mixedSizeInfo = mixedLeftJoinSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); - assert mixedSizeInfo.length == 2; - long outputRowCount = mixedSizeInfo[0]; - long matchesColumnHandle = mixedSizeInfo[1]; - return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); + public static long mixedLeftJoinRowCount(Table leftKeys, Table rightKeys, + Table leftConditional, Table rightConditional, + CompiledExpression condition, + NullEquality nullEquality) { + return mixedLeftJoinRowCount( + leftKeys.getNativeView(), rightKeys.getNativeView(), + leftConditional.getNativeView(), rightConditional.getNativeView(), + condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); } /** @@ -3159,8 +3154,8 @@ public static GatherMap[] mixedLeftJoinGatherMaps(Table leftKeys, Table rightKey * * It is the responsibility of the caller to close the resulting gather map instances. 
* - * This interface allows passing the size result from - * {@link #mixedLeftJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} + * This interface allows passing the row count from + * {@link #mixedLeftJoinRowCount(Table, Table, Table, Table, CompiledExpression, NullEquality)} * when the output size was computed previously. * * @param leftKeys the left table's key columns for the equality condition @@ -3169,20 +3164,20 @@ public static GatherMap[] mixedLeftJoinGatherMaps(Table leftKeys, Table rightKey * @param rightConditional the right table's columns needed to evaluate the inequality condition * @param condition the inequality condition of the join * @param nullEquality whether nulls should compare as equal - * @param joinSize mixed join size result + * @param outputRowCount number of output rows in the join result * @return left and right table gather maps */ public static GatherMap[] mixedLeftJoinGatherMaps(Table leftKeys, Table rightKeys, Table leftConditional, Table rightConditional, CompiledExpression condition, NullEquality nullEquality, - MixedJoinSize joinSize) { - long[] gatherMapData = mixedLeftJoinGatherMapsWithSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), - nullEquality == NullEquality.EQUAL, - joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView()); + long outputRowCount) { + long[] gatherMapData = mixedLeftJoinGatherMapsWithCount( + leftKeys.getNativeView(), rightKeys.getNativeView(), + leftConditional.getNativeView(), rightConditional.getNativeView(), + condition.getNativeHandle(), + nullEquality == NullEquality.EQUAL, + outputRowCount); return buildJoinGatherMaps(gatherMapData); } @@ -3361,31 +3356,26 @@ public GatherMap[] conditionalInnerJoinGatherMaps(Table rightTable, } /** - * Computes output size information for an inner join between two tables using a mix of equality - * and inequality conditions. The entire join condition is assumed to be a logical AND of the - * equality condition and inequality condition. - * NOTE: It is the responsibility of the caller to close the resulting size information object - * or native resources can be leaked! + * Computes the number of rows resulting from an inner join between two tables using a mix of + * equality and inequality conditions. The entire join condition is assumed to be a logical AND + * of the equality condition and inequality condition. 
+ * * @param leftKeys the left table's key columns for the equality condition * @param rightKeys the right table's key columns for the equality condition * @param leftConditional the left table's columns needed to evaluate the inequality condition * @param rightConditional the right table's columns needed to evaluate the inequality condition * @param condition the inequality condition of the join * @param nullEquality whether nulls should compare as equal - * @return size information for the join + * @return row count of the join result */ - public static MixedJoinSize mixedInnerJoinSize(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality) { - long[] mixedSizeInfo = mixedInnerJoinSize( + public static long mixedInnerJoinRowCount(Table leftKeys, Table rightKeys, + Table leftConditional, Table rightConditional, + CompiledExpression condition, + NullEquality nullEquality) { + return mixedInnerJoinRowCount( leftKeys.getNativeView(), rightKeys.getNativeView(), leftConditional.getNativeView(), rightConditional.getNativeView(), condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); - assert mixedSizeInfo.length == 2; - long outputRowCount = mixedSizeInfo[0]; - long matchesColumnHandle = mixedSizeInfo[1]; - return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); } /** @@ -3427,7 +3417,7 @@ public static GatherMap[] mixedInnerJoinGatherMaps(Table leftKeys, Table rightKe * It is the responsibility of the caller to close the resulting gather map instances. * * This interface allows passing the size result from - * {@link #mixedInnerJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} + * {@link #mixedInnerJoinRowCount(Table, Table, Table, Table, CompiledExpression, NullEquality)} * when the output size was computed previously. 
* * @param leftKeys the left table's key columns for the equality condition @@ -3436,20 +3426,20 @@ public static GatherMap[] mixedInnerJoinGatherMaps(Table leftKeys, Table rightKe * @param rightConditional the right table's columns needed to evaluate the inequality condition * @param condition the inequality condition of the join * @param nullEquality whether nulls should compare as equal - * @param joinSize mixed join size result + * @param outputRowCount number of output rows in the join result * @return left and right table gather maps */ public static GatherMap[] mixedInnerJoinGatherMaps(Table leftKeys, Table rightKeys, Table leftConditional, Table rightConditional, CompiledExpression condition, NullEquality nullEquality, - MixedJoinSize joinSize) { - long[] gatherMapData = mixedInnerJoinGatherMapsWithSize( + long outputRowCount) { + long[] gatherMapData = mixedInnerJoinGatherMapsWithCount( leftKeys.getNativeView(), rightKeys.getNativeView(), leftConditional.getNativeView(), rightConditional.getNativeView(), condition.getNativeHandle(), nullEquality == NullEquality.EQUAL, - joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView()); + outputRowCount); return buildJoinGatherMaps(gatherMapData); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0efc76fc6e9..1dc29a0af2c 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -851,14 +851,14 @@ jlongArray cond_join_gather_single_map( } template -jlongArray mixed_join_size(JNIEnv* env, - jlong j_left_keys, - jlong j_right_keys, - jlong j_left_condition, - jlong j_right_condition, - jlong j_condition, - jboolean j_nulls_equal, - T join_size_func) +jlong mixed_join_size(JNIEnv* env, + jlong j_left_keys, + jlong j_right_keys, + jlong j_left_condition, + jlong j_right_condition, + jlong j_condition, + jboolean j_nulls_equal, + T join_size_func) { JNI_NULL_CHECK(env, j_left_keys, "left keys table is null", 0); JNI_NULL_CHECK(env, j_right_keys, "right keys table is null", 0); @@ -874,27 +874,14 @@ jlongArray mixed_join_size(JNIEnv* env, auto const condition = reinterpret_cast(j_condition); auto const nulls_equal = j_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto [join_size, matches_per_row] = join_size_func(*left_keys, - *right_keys, - *left_condition, - *right_condition, - condition->get_top_expression(), - nulls_equal); - if (matches_per_row->size() > std::numeric_limits::max()) { - throw std::runtime_error("Too many values in device buffer to convert into a column"); - } - auto col_size = static_cast(matches_per_row->size()); - auto col_data = matches_per_row->release(); - cudf::jni::native_jlongArray result(env, 2); - result[0] = static_cast(join_size); - result[1] = ptr_as_jlong(new cudf::column{cudf::data_type{cudf::type_id::INT32}, - col_size, - std::move(col_data), - rmm::device_buffer{}, - 0}); - return result.get_jArray(); + return join_size_func(*left_keys, + *right_keys, + *left_condition, + *right_condition, + condition->get_top_expression(), + nulls_equal); } - CATCH_STD(env, NULL); + CATCH_STD(env, 0); } template @@ -967,16 +954,6 @@ jlongArray mixed_join_gather_single_map(JNIEnv* env, CATCH_STD(env, NULL); } -std::pair> get_mixed_size_info( - JNIEnv* env, jlong j_output_row_count, jlong j_matches_view) -{ - auto const row_count = static_cast(j_output_row_count); - auto const matches = reinterpret_cast(j_matches_view); - return std::make_pair(row_count, - cudf::device_span( - matches->template data(), matches->size())); -} - cudf::column_view remove_validity_from_col(cudf::column_view column_view) { if (!cudf::is_compound(column_view.type())) { @@ -3050,14 +3027,14 @@ Java_ai_rapids_cudf_Table_conditionalLeftJoinGatherMapsWithCount(JNIEnv* env, }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftJoinSize(JNIEnv* env, - jclass, - jlong j_left_keys, - jlong j_right_keys, - jlong j_left_condition, - jlong j_right_condition, - jlong j_condition, - jboolean j_nulls_equal) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_mixedLeftJoinRowCount(JNIEnv* env, + jclass, + jlong j_left_keys, + jlong j_right_keys, + jlong j_left_condition, + jlong j_right_condition, + jlong j_condition, + jboolean j_nulls_equal) { return cudf::jni::mixed_join_size( env, @@ -3108,18 +3085,16 @@ Java_ai_rapids_cudf_Table_mixedLeftJoinGatherMaps(JNIEnv* env, } JNIEXPORT jlongArray JNICALL -Java_ai_rapids_cudf_Table_mixedLeftJoinGatherMapsWithSize(JNIEnv* env, - jclass, - jlong j_left_keys, - jlong j_right_keys, - jlong j_left_condition, - jlong j_right_condition, - jlong j_condition, - jboolean j_nulls_equal, - jlong j_output_row_count, - jlong j_matches_view) -{ - auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view); +Java_ai_rapids_cudf_Table_mixedLeftJoinGatherMapsWithCount(JNIEnv* env, + jclass, + jlong j_left_keys, + jlong j_right_keys, + jlong j_left_condition, + jlong j_right_condition, + jlong j_condition, + jboolean j_nulls_equal, + jlong j_output_row_count) +{ return cudf::jni::mixed_join_gather_maps( env, j_left_keys, @@ -3128,14 +3103,15 @@ Java_ai_rapids_cudf_Table_mixedLeftJoinGatherMapsWithSize(JNIEnv* env, j_right_condition, j_condition, j_nulls_equal, - [&size_info](cudf::table_view const& left_keys, - cudf::table_view const& right_keys, - cudf::table_view const& left_condition, - cudf::table_view const& right_condition, - cudf::ast::expression const& condition, - cudf::null_equality nulls_equal) { + [row_count = static_cast(j_output_row_count)]( + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + cudf::table_view const& left_condition, + cudf::table_view const& right_condition, + cudf::ast::expression 
const& condition, + cudf::null_equality nulls_equal) { return cudf::mixed_left_join( - left_keys, right_keys, left_condition, right_condition, condition, nulls_equal, size_info); + left_keys, right_keys, left_condition, right_condition, condition, nulls_equal, row_count); }); } @@ -3264,14 +3240,14 @@ Java_ai_rapids_cudf_Table_conditionalInnerJoinGatherMapsWithCount(JNIEnv* env, }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedInnerJoinSize(JNIEnv* env, - jclass, - jlong j_left_keys, - jlong j_right_keys, - jlong j_left_condition, - jlong j_right_condition, - jlong j_condition, - jboolean j_nulls_equal) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_mixedInnerJoinRowCount(JNIEnv* env, + jclass, + jlong j_left_keys, + jlong j_right_keys, + jlong j_left_condition, + jlong j_right_condition, + jlong j_condition, + jboolean j_nulls_equal) { return cudf::jni::mixed_join_size( env, @@ -3322,18 +3298,16 @@ Java_ai_rapids_cudf_Table_mixedInnerJoinGatherMaps(JNIEnv* env, } JNIEXPORT jlongArray JNICALL -Java_ai_rapids_cudf_Table_mixedInnerJoinGatherMapsWithSize(JNIEnv* env, - jclass, - jlong j_left_keys, - jlong j_right_keys, - jlong j_left_condition, - jlong j_right_condition, - jlong j_condition, - jboolean j_nulls_equal, - jlong j_output_row_count, - jlong j_matches_view) +Java_ai_rapids_cudf_Table_mixedInnerJoinGatherMapsWithCount(JNIEnv* env, + jclass, + jlong j_left_keys, + jlong j_right_keys, + jlong j_left_condition, + jlong j_right_condition, + jlong j_condition, + jboolean j_nulls_equal, + jlong j_output_row_count) { - auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view); return cudf::jni::mixed_join_gather_maps( env, j_left_keys, @@ -3342,14 +3316,15 @@ Java_ai_rapids_cudf_Table_mixedInnerJoinGatherMapsWithSize(JNIEnv* env, j_right_condition, j_condition, j_nulls_equal, - [&size_info](cudf::table_view const& left_keys, - cudf::table_view const& right_keys, - cudf::table_view const& left_condition, - cudf::table_view const& right_condition, - cudf::ast::expression const& condition, - cudf::null_equality nulls_equal) { + [row_count = static_cast(j_output_row_count)]( + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + cudf::table_view const& left_condition, + cudf::table_view const& right_condition, + cudf::ast::expression const& condition, + cudf::null_equality nulls_equal) { return cudf::mixed_inner_join( - left_keys, right_keys, left_condition, right_condition, condition, nulls_equal, size_info); + left_keys, right_keys, left_condition, right_condition, condition, nulls_equal, row_count); }); } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 1289f468002..e539ded8089 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -2464,12 +2464,12 @@ void testMixedLeftJoinGatherMapsWithSize() { Table expected = new Table.TestBuilder() .column( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, inv) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); + .build()) { + long rowCount = Table.mixedLeftJoinRowCount(leftKeys, rightKeys, left, right, + condition, NullEquality.UNEQUAL); + assertEquals(expected.getRowCount(), rowCount); GatherMap[] maps = Table.mixedLeftJoinGatherMaps(leftKeys, rightKeys, left, right, 
condition, - NullEquality.UNEQUAL, sizeInfo); + NullEquality.UNEQUAL, rowCount); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -2500,12 +2500,12 @@ void testMixedLeftJoinGatherMapsNullsWithSize() { Table expected = new Table.TestBuilder() .column(0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9) .column(0, inv, inv, inv, inv, inv, inv, 0, 2, 1, inv) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); + .build()) { + long rowCount = Table.mixedLeftJoinRowCount(leftKeys, rightKeys, left, right, + condition, NullEquality.EQUAL); + assertEquals(expected.getRowCount(), rowCount); GatherMap[] maps = Table.mixedLeftJoinGatherMaps(leftKeys, rightKeys, left, right, condition, - NullEquality.EQUAL, sizeInfo); + NullEquality.EQUAL, rowCount); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -2979,12 +2979,12 @@ void testMixedInnerJoinGatherMapsWithSize() { Table expected = new Table.TestBuilder() .column(2, 7, 8) .column(2, 0, 1) - .build(); - MixedJoinSize sizeInfo = Table.mixedInnerJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); + .build()) { + long rowCount = Table.mixedInnerJoinRowCount(leftKeys, rightKeys, left, right, + condition, NullEquality.UNEQUAL); + assertEquals(expected.getRowCount(), rowCount); GatherMap[] maps = Table.mixedInnerJoinGatherMaps(leftKeys, rightKeys, left, right, condition, - NullEquality.UNEQUAL, sizeInfo); + NullEquality.UNEQUAL, rowCount); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -3014,12 +3014,12 @@ void testMixedInnerJoinGatherMapsNullsWithSize() { Table expected = new Table.TestBuilder() .column(0, 7, 7, 8) .column(0, 0, 2, 1) - .build(); - MixedJoinSize sizeInfo = Table.mixedInnerJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); + .build()) { + long rowCount = Table.mixedInnerJoinRowCount(leftKeys, rightKeys, left, right, + condition, NullEquality.EQUAL); + assertEquals(expected.getRowCount(), rowCount); GatherMap[] maps = Table.mixedInnerJoinGatherMaps(leftKeys, rightKeys, left, right, condition, - NullEquality.EQUAL, sizeInfo); + NullEquality.EQUAL, rowCount); try { verifyJoinGatherMaps(maps, expected); } finally { diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index a9261345db5..f5b8d1ba6d1 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -33,8 +33,10 @@ __all__ = [ "left_semi_join", "mixed_full_join", "mixed_inner_join", + "mixed_inner_join_size", "mixed_left_anti_join", "mixed_left_join", + "mixed_left_join_size", "mixed_left_semi_join", ] @@ -516,7 +518,7 @@ cpdef tuple mixed_inner_join( join. """ cdef cpp_join.gather_map_pair_type c_result - cdef cpp_join.output_size_data_type empty_optional + cdef optional[size_t] empty_optional stream = _get_stream(stream) @@ -536,6 +538,55 @@ cpdef tuple mixed_inner_join( _column_from_gather_map(move(c_result.second), stream), ) +cpdef size_t mixed_inner_join_size( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal, + Stream stream=None +): + """Get the size of a mixed inner join between two tables. + + For details, see :cpp:func:`mixed_inner_join_size`. 
+ + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + int + The number of rows that would be produced by the join. + """ + cdef size_t result + + stream = _get_stream(stream) + + with nogil: + result = cpp_join.mixed_inner_join_size( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + stream.view() + ) + return result + cpdef tuple mixed_left_join( Table left_keys, @@ -572,7 +623,7 @@ cpdef tuple mixed_left_join( join. """ cdef cpp_join.gather_map_pair_type c_result - cdef cpp_join.output_size_data_type empty_optional + cdef optional[size_t] empty_optional stream = _get_stream(stream) @@ -592,6 +643,55 @@ cpdef tuple mixed_left_join( _column_from_gather_map(move(c_result.second), stream), ) +cpdef size_t mixed_left_join_size( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal, + Stream stream=None +): + """Get the size of a mixed left join between two tables. + + For details, see :cpp:func:`mixed_left_join_size`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + int + The number of rows that would be produced by the join. + """ + cdef size_t result + + stream = _get_stream(stream) + + with nogil: + result = cpp_join.mixed_left_join_size( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + stream.view() + ) + return result + cpdef tuple mixed_full_join( Table left_keys, @@ -628,7 +728,7 @@ cpdef tuple mixed_full_join( join. 
""" cdef cpp_join.gather_map_pair_type c_result - cdef cpp_join.output_size_data_type empty_optional + cdef optional[size_t] empty_optional stream = _get_stream(stream) diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 111576ea1d9..0f702d6ea20 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -19,7 +19,6 @@ from pylibcudf.libcudf.utilities.span cimport device_span ctypedef unique_ptr[device_uvector[size_type]] gather_map_type ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type -ctypedef optional[pair[size_t, device_span[const size_type]]] output_size_data_type cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: cdef gather_map_pair_type inner_join( @@ -169,7 +168,7 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view right_conditional, const expression binary_predicate, null_equality compare_nulls, - output_size_data_type output_size_data, + optional[size_t] output_size, cuda_stream_view stream ) except +libcudf_exception_handler @@ -180,7 +179,7 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view right_conditional, const expression binary_predicate, null_equality compare_nulls, - output_size_data_type output_size_data, + optional[size_t] output_size, cuda_stream_view stream ) except +libcudf_exception_handler @@ -191,7 +190,7 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view right_conditional, const expression binary_predicate, null_equality compare_nulls, - output_size_data_type output_size_data, + optional[size_t] output_size, cuda_stream_view stream ) except +libcudf_exception_handler @@ -214,3 +213,23 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: null_equality compare_nulls, cuda_stream_view stream ) except +libcudf_exception_handler + + cdef size_t mixed_inner_join_size( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls, + cuda_stream_view stream + ) except +libcudf_exception_handler + + cdef size_t mixed_left_join_size( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls, + cuda_stream_view stream + ) except +libcudf_exception_handler From 07243acf7b60ed47a218cfa5bd4485524682740e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 3 Sep 2025 14:32:35 -0500 Subject: [PATCH 247/366] Fix various pandas test failures in `cudf.pandas` (#19372) Closes #19550 This PR: - Fixes a net of 400 pytest failures - Enables `data_array_view` for pandas nullable extension types. - Returns `values_host` for null columns with pandas nullable extension types with `np.nan` for missing values. - Fixes `distinct_count` and thus `nunique` to properly handle `NA`, `np.nan` for all types. - Changes `astype`'s `copy` parameter default to `True` from `False`. - Raises error in `searchsorted` for nullable extension types. - Disallows conversion of `np.array([nat, nat])` column to `pa.array([null, null])` in `as_column` constructor. - Fixes return types for many combinations of binary op with pandas nullable extension types. - Fixes all edge cases not handled in `POW`. 
- Disable conversion of float to int column if there are `nan`'s - Disable allowing `np.nan` for `StringColumn.fillna` - Update cupy nan methods dict with newly supported nan APIs. - Update `to_array` and thus `to_numpy` to work with all pandas nullable extension types. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) - Muhammad Haseeb (https://github.com/mhaseeb123) - https://github.com/brandon-b-miller - Richard (Rick) Zamora (https://github.com/rjzamora) - Matthew Murray (https://github.com/Matt711) - Robert Maynard (https://github.com/robertmaynard) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Tianyu Liu (https://github.com/kingcrimsontianyu) - James Lamb (https://github.com/jameslamb) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19372 --- python/cudf/cudf/core/column/column.py | 70 +- python/cudf/cudf/core/column/numerical.py | 121 ++- python/cudf/cudf/core/column/string.py | 13 + python/cudf/cudf/core/dataframe.py | 67 +- python/cudf/cudf/core/frame.py | 88 ++- python/cudf/cudf/core/indexed_frame.py | 61 +- python/cudf/cudf/core/series.py | 26 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 3 + python/cudf/cudf/pandas/fast_slow_proxy.py | 6 +- .../cudf/pandas/scripts/conftest-patch.py | 708 ++++-------------- .../cudf/tests/private_objects/test_column.py | 4 +- .../cudf/tests/series/methods/test_unique.py | 18 +- python/cudf/cudf/utils/dtypes.py | 11 +- 13 files changed, 563 insertions(+), 633 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 091e6a81c32..e43f15f1895 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,6 +2,7 @@ from __future__ import annotations +import decimal import pickle import warnings from collections.abc import Iterable, Iterator, MutableSequence, Sequence @@ -689,7 +690,12 @@ def data_array_view( raise ValueError(f"Unsupported mode: {mode}") else: obj = None - return cuda.as_cuda_array(obj).view(self.dtype) + if cudf.get_option("mode.pandas_compatible"): + return cuda.as_cuda_array(obj).view( + getattr(self.dtype, "numpy_dtype", self.dtype) + ) + else: + return cuda.as_cuda_array(obj).view(self.dtype) def mask_array_view( self, *, mode: Literal["write", "read"] = "write" @@ -799,6 +805,23 @@ def values_host(self) -> np.ndarray: if len(self) == 0: return np.array([], dtype=self.dtype) + if ( + cudf.get_option("mode.pandas_compatible") + and is_pandas_nullable_extension_dtype(self.dtype) + and self.dtype.kind in "iuf" + and self.has_nulls() + ): + col = self.astype( + np.dtype("float32") + if getattr(self.dtype, "numpy_dtype", self.dtype) + == np.dtype("float32") + else np.dtype("float64") + ) + col = col.fillna(np.nan) + with acquire_spill_lock(): + res = col.data_array_view(mode="read").copy_to_host() + return res + if self.has_nulls(): raise ValueError("Column must have no nulls.") @@ -1786,7 +1809,9 @@ def distinct_count(self, dropna: bool = True) -> int: plc.types.NullPolicy.EXCLUDE if dropna else plc.types.NullPolicy.INCLUDE, - plc.types.NanPolicy.NAN_IS_VALID, + plc.types.NanPolicy.NAN_IS_NULL + if dropna + else plc.types.NanPolicy.NAN_IS_VALID, ) self._distinct_count[dropna] = result return self._distinct_count[dropna] @@ -1810,7 +1835,7 @@ def cast(self, dtype: Dtype) -> ColumnBase: result._dtype = dtype return result - 
def astype(self, dtype: DtypeObj, copy: bool = False) -> ColumnBase: + def astype(self, dtype: DtypeObj, copy: bool | None = False) -> ColumnBase: if self.dtype == dtype: result = self elif len(self) == 0: @@ -1844,7 +1869,7 @@ def astype(self, dtype: DtypeObj, copy: bool = False) -> ColumnBase: result = self.as_numerical_column(dtype) if copy and result is self: - return result.copy() + return result.copy(deep=copy) return result def as_categorical_column( @@ -1963,6 +1988,13 @@ def searchsorted( raise ValueError( "Column searchsorted expects values to be column of same dtype" ) + if is_pandas_nullable_extension_dtype(self.dtype) and self.has_nulls( + include_nan=True + ): + raise ValueError( + "searchsorted requires array to be sorted, which is impossible " + "with NAs present." + ) return ColumnBase.from_pylibcudf( sorting.search_sorted( # type: ignore[return-value] [self], @@ -2233,14 +2265,18 @@ def _return_sentinel_column(): def copy_if_else( self, other: Self | plc.Scalar, boolean_mask: NumericalColumn ) -> Self: - return type(self).from_pylibcudf( # type: ignore[return-value] - plc.copying.copy_if_else( - self.to_pylibcudf(mode="read"), - other - if isinstance(other, plc.Scalar) - else other.to_pylibcudf(mode="read"), - boolean_mask.to_pylibcudf(mode="read"), + return ( + type(self) + .from_pylibcudf( # type: ignore[return-value] + plc.copying.copy_if_else( + self.to_pylibcudf(mode="read"), + other + if isinstance(other, plc.Scalar) + else other.to_pylibcudf(mode="read"), + boolean_mask.to_pylibcudf(mode="read"), + ) ) + ._with_type_metadata(self.dtype) ) def split_by_offsets( @@ -2490,11 +2526,17 @@ def where( def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" return any( - isinstance(x, (float, np.floating)) and np.isnan(x) + (isinstance(x, (float, np.floating)) and np.isnan(x)) + or (isinstance(x, decimal.Decimal) and x.is_nan()) for x in np.asarray(arbitrary) ) +def _has_any_nat(arbitrary: pd.Series | np.ndarray) -> bool: + """Check if an object dtype Series or array contains NaT.""" + return any(x is pd.NaT for x in np.asarray(arbitrary)) + + def column_empty( row_count: int, dtype: DtypeObj = CUDF_STRING_DTYPE, @@ -2993,7 +3035,7 @@ def as_column( elif ( nan_as_null is False and inferred_dtype not in ("decimal", "empty") - and _has_any_nan(arbitrary) + and (_has_any_nan(arbitrary) or _has_any_nat(arbitrary)) ): # Decimal can hold float("nan") # All np.nan is not restricted by type @@ -3433,4 +3475,4 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: plc.concatenate.concatenate( [col.to_pylibcudf(mode="read") for col in objs_with_len] ) - ) + )._with_type_metadata(objs_with_len[0].dtype) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 2292b92e68e..1faa75b8881 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -7,6 +7,7 @@ import cupy as cp import numpy as np +import pandas as pd import pyarrow as pa from typing_extensions import Self @@ -211,12 +212,30 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: np.uint64: np.float64, np.bool_: np.float32, } + if cudf.get_option("mode.pandas_compatible"): + int_float_dtype_mapping = { + np.int8: np.float64, + np.int16: np.float64, + np.int32: np.float64, + np.int64: np.float64, + np.uint8: np.float64, + np.uint16: np.float64, + np.uint32: np.float64, + np.uint64: np.float64, + 
np.bool_: np.float64, + } out_dtype = None if op in {"__truediv__", "__rtruediv__"}: # Division with integer types results in a suitable float. - if truediv_type := int_float_dtype_mapping.get(self.dtype.type): - return self.astype(np.dtype(truediv_type))._binaryop(other, op) + if truediv_type := int_float_dtype_mapping.get( + self.dtype.numpy_dtype.type + if is_pandas_nullable_extension_dtype(self.dtype) + else self.dtype.type + ): + return self.astype( + get_dtype_of_same_kind(self.dtype, np.dtype(truediv_type)) + )._binaryop(other, op) elif op in { "__lt__", "__gt__", @@ -279,7 +298,11 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: f"{self.dtype.type.__name__} and " f"{other_cudf_dtype.type.__name__}" ) - if self.dtype.kind == "b" or other_cudf_dtype.kind == "b": + if self.dtype.kind == "b" and other_cudf_dtype.kind == "b": + out_dtype = get_dtype_of_same_kind( + self.dtype, np.dtype(np.bool_) + ) + elif self.dtype.kind == "b" or other_cudf_dtype.kind == "b": out_dtype = get_dtype_of_same_kind( out_dtype, np.dtype(np.bool_) ) @@ -291,13 +314,53 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: ): op = "INT_POW" + lhs_dtype, rhs_dtype = ( + (other_cudf_dtype, self.dtype) + if reflect + else (self.dtype, other_cudf_dtype) + ) lhs, rhs = (other, self) if reflect else (self, other) - + if out_dtype.kind == "f" and is_pandas_nullable_extension_dtype( + out_dtype + ): + if ( + not is_pandas_nullable_extension_dtype(lhs_dtype) + and lhs_dtype.kind == "f" + and isinstance(lhs, NumericalColumn) + ): + lhs = lhs.nans_to_nulls() + if ( + not is_pandas_nullable_extension_dtype(rhs_dtype) + and rhs_dtype.kind == "f" + and isinstance(rhs, NumericalColumn) + ): + rhs = rhs.nans_to_nulls() if isinstance(lhs, pa.Scalar): lhs = pa_scalar_to_plc_scalar(lhs) elif isinstance(rhs, pa.Scalar): rhs = pa_scalar_to_plc_scalar(rhs) - return binaryop.binaryop(lhs, rhs, op, out_dtype) + + res = binaryop.binaryop(lhs, rhs, op, out_dtype) + if ( + is_pandas_nullable_extension_dtype(out_dtype) + and out_dtype.kind == "f" + ): + # If the output dtype is a pandas nullable extension type, + # we need to ensure that the result is a NumericalColumn. + res = res.nans_to_nulls() + if op in {"__mod__", "__floordiv__"} and tmp_dtype.kind == "b": + res = res.astype( + get_dtype_of_same_kind(out_dtype, np.dtype(np.int8)) + ) + elif op == "INT_POW" and res.null_count: + if ( + isinstance(lhs, plc.Scalar) + and lhs.to_py() == 1 + and isinstance(rhs, ColumnBase) + and rhs.null_count > 0 + ): + res = res.fillna(lhs.to_py()) + return res def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. 
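The promotion and INT_POW rules in the hunk above are easiest to see end to end. A minimal sketch of the intended pandas-compatible arithmetic, assuming a build with this change applied (the nullable dtype names come from pandas):

import cudf

cudf.set_option("mode.pandas_compatible", True)
s = cudf.Series([1, 2, None], dtype="Int32")

# True division of any nullable integer now promotes to the float64
# kind, so an Int32 input yields Float64 rather than Float32.
assert (s / 2).dtype == "Float64"

# INT_POW edge case: pandas defines 1 ** NA == 1, so with a scalar base
# of 1 the nulls in the result are filled rather than propagated.
assert (1 ** s).isnull().sum() == 0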
@@ -339,8 +402,20 @@ def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase: # => np.int64 # np.promote_types(np.asarray([0], dtype=np.int64).dtype, np.uint8) # => np.int64 - common_dtype = np.result_type(self.dtype, other) # noqa: TID251 - if common_dtype.kind in {"b", "i", "u", "f"}: + if is_pandas_nullable_extension_dtype(self.dtype): + if isinstance(self.dtype, pd.ArrowDtype): + common_dtype = cudf.utils.dtypes.find_common_type( + [self.dtype, other] + ) + else: + common_dtype = get_dtype_of_same_kind( + self.dtype, + np.result_type(self.dtype.numpy_dtype, other), # noqa: TID251 + ) + + else: + common_dtype = np.result_type(self.dtype, other) # noqa: TID251 + if common_dtype.kind in {"b", "i", "u", "f"}: # type: ignore[union-attr] if self.dtype.kind == "b" and not isinstance(other, bool): common_dtype = min_signed_type(other) return pa.scalar( @@ -427,7 +502,26 @@ def as_decimal_column(self, dtype: DecimalDtype) -> DecimalBaseColumn: def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: if dtype == self.dtype: return self + if cudf.get_option("mode.pandas_compatible"): + if ( + is_pandas_nullable_extension_dtype(self.dtype) + and isinstance(dtype, np.dtype) + and self.null_count > 0 + ): + if dtype.kind in "iu": + raise ValueError("cannot convert NA to integer") + elif dtype.kind == "b": + raise ValueError("cannot convert float NaN to bool") + + if ( + not is_pandas_nullable_extension_dtype(self.dtype) + and is_pandas_nullable_extension_dtype(dtype) + and dtype.kind == "f" # type: ignore[union-attr] + ): + res = self.nans_to_nulls().cast(dtype=dtype) # type: ignore[return-value] + res._dtype = dtype + return res # type: ignore[return-value] if dtype_to_pylibcudf_type(dtype) == dtype_to_pylibcudf_type( self.dtype ): @@ -446,6 +540,19 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: else: self._dtype = dtype return self + if self.dtype.kind == "f" and dtype.kind in "iu": # type: ignore[union-attr] + if ( + not is_pandas_nullable_extension_dtype(dtype) + and self.nan_count > 0 + ): + raise TypeError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + # If casting from float to int, we need to convert nans to nulls + res = self.nans_to_nulls().cast(dtype=dtype) # type: ignore[return-value] + res._dtype = dtype + return res # type: ignore[return-value] + return self.cast(dtype=dtype) # type: ignore[return-value] def all(self, skipna: bool = True) -> bool: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a947ae7a275..e7a7105e6a9 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -18,6 +18,7 @@ from cudf.core._internals import binaryop from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column, column_empty +from cudf.errors import MixedTypeError from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( CUDF_STRING_DTYPE, @@ -252,6 +253,18 @@ def __cuda_array_interface__(self): "`__cuda_array_interface__`" ) + def _validate_fillna_value( + self, fill_value: ScalarLike | ColumnLike + ) -> plc.Scalar | ColumnBase: + """Align fill_value for .fillna based on column type.""" + if ( + cudf.get_option("mode.pandas_compatible") + and is_scalar(fill_value) + and fill_value is np.nan + ): + raise MixedTypeError("Cannot fill `np.nan` in string column") + return super()._validate_fillna_value(fill_value) + def element_indexing(self, index: int): result = 
super().element_indexing(index) if isinstance(result, pa.Scalar): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ab91e1a9a9b..2dc4fed005e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -105,6 +105,8 @@ get_dtype_of_same_kind, is_column_like, is_dtype_obj_numeric, + is_mixed_with_object_dtype, + is_pandas_nullable_extension_dtype, min_signed_type, ) from cudf.utils.ioutils import ( @@ -130,6 +132,7 @@ "mean": "nanmean", "std": "nanstd", "var": "nanvar", + "median": "nanmedian", } @@ -2047,9 +2050,11 @@ def _concat( def astype( self, dtype: Dtype | dict[Hashable, Dtype], - copy: bool = False, + copy: bool | None = None, errors: Literal["raise", "ignore"] = "raise", ) -> Self: + if copy is None: + copy = True if is_dict_like(dtype): if len(set(dtype.keys()) - set(self._column_names)) > 0: # type: ignore[union-attr] raise KeyError( @@ -6994,6 +6999,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared, mask, common_dtype = self._prepare_for_rowwise_op( method, skipna, numeric_only ) + for col in prepared._column_names: if prepared._data[col].nullable: prepared._data[col] = ( @@ -7012,15 +7018,25 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): if skipna is not False and method in _cupy_nan_methods_map: method = _cupy_nan_methods_map[method] + if len(arr) == 0 and method == "nanmedian": + # Workaround for a cupy limitation, cupy + # errors for zero dim array in nanmedian + # https://github.com/cupy/cupy/issues/9332 + method = "median" result = getattr(cupy, method)(arr, axis=1, **kwargs) if result.ndim == 1: type_coerced_methods = { "count", "min", + "nanmin", "max", + "nanmax", "sum", + "nansum", "prod", + "nanprod", + "product", "cummin", "cummax", "cumsum", @@ -7032,6 +7048,45 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): or (common_dtype is not None and common_dtype.kind == "M") else None ) + + if ( + cudf.get_option("mode.pandas_compatible") + and result_dtype is None + and is_pandas_nullable_extension_dtype(common_dtype) + ): + if ( + method + in { + "kurt", + "kurtosis", + "mean", + "nanmean", + "median", + "nanmedian", + "sem", + "skew", + "std", + "nanstd", + "var", + "nanvar", + } + and common_dtype.kind != "f" + ): + result_dtype = get_dtype_of_same_kind( + common_dtype, np.dtype(np.float64) + ) + else: + result_dtype = get_dtype_of_same_kind( + common_dtype, result.dtype + ) + if ( + result_dtype is not None + and result_dtype.kind == "b" + and result.dtype.kind != "b" + ): + result_dtype = get_dtype_of_same_kind( + common_dtype, result.dtype + ) result = as_column(result, dtype=result_dtype) if mask is not None: result = result.set_mask(mask._column.as_mask()) @@ -7633,6 +7688,16 @@ def unnamed_group_generator(): col.astype(common_type) if col is not None else all_nulls() for col in columns ) + if ( + cudf.get_option("mode.pandas_compatible") + and common_type == "object" + ): + for col, hcol in zip(columns, homogenized, strict=True): + if is_mixed_with_object_dtype(col, hcol): + raise TypeError( + "Stacking a DataFrame with mixed object and " + "non-object dtypes is not supported. 
" + ) with acquire_spill_lock(): interleaved_col = ColumnBase.from_pylibcudf( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0c627f3ea84..7caa5826f2a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -17,10 +17,11 @@ import pylibcudf as plc import cudf +from cudf.api.extensions import no_default # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. -from cudf.api.types import is_dtype_equal, is_scalar +from cudf.api.types import is_dtype_equal, is_scalar, is_string_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core._internals import copying, sorting from cudf.core.abc import Serializable @@ -403,7 +404,7 @@ def __len__(self) -> int: @_performance_tracking def astype( - self, dtype: dict[Hashable, DtypeObj], copy: bool = False + self, dtype: dict[Hashable, DtypeObj], copy: bool | None = None ) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) @@ -552,20 +553,56 @@ def _to_array( module: ModuleType, copy: bool, dtype: Dtype | None = None, - na_value=None, + na_value=no_default, ) -> cupy.ndarray | numpy.ndarray: # Internal function to implement to_cupy and to_numpy, which are nearly # identical except for the attribute they access to generate values. def to_array( - col: ColumnBase, dtype: np.dtype + col: ColumnBase, to_dtype: np.dtype ) -> cupy.ndarray | numpy.ndarray: - if na_value is not None: + if ( + col.has_nulls() + and dtype is not None + and not is_string_dtype(dtype) + and na_value is no_default + ): + raise ValueError( + f"cannot convert to '{dtype}'-dtype NumPy array " + "with missing values. Specify an appropriate 'na_value' " + "for this dtype." 
+ ) + if na_value is not no_default: col = col.fillna(na_value) + if isinstance(col.dtype, cudf.CategoricalDtype): col = col._get_decategorized_column() # type: ignore[attr-defined] + array = get_array(col) - casted_array = module.asarray(array, dtype=dtype) + + if ( + cudf.get_option("mode.pandas_compatible") + and is_pandas_nullable_extension_dtype(col.dtype) + and col.dtype.kind in "iuf" + and to_dtype is None + ): + to_dtype = array.dtype + if ( + to_dtype != array.dtype + and dtype is None + and array.dtype.kind in "f" + and col.has_nulls() + ): + to_dtype = None + casted_array = module.asarray(array, dtype=to_dtype) + if ( + col.has_nulls() + and dtype is not None + and is_string_dtype(dtype) + ): + casted_array[col.isnull().values_host] = ( + cudf.NA if na_value is no_default else na_value + ) if copy and casted_array is array: # Don't double copy after asarray casted_array = casted_array.copy() @@ -581,29 +618,43 @@ def to_array( if dtype is None: if ncol == 1: - dtype = next(self._dtypes)[1] + to_dtype = next(self._dtypes)[1] else: - dtype = find_common_type([dtype for _, dtype in self._dtypes]) + to_dtype = find_common_type( + [dtype for _, dtype in self._dtypes] + ) - if isinstance(dtype, cudf.CategoricalDtype): - dtype = dtype.categories.dtype + if cudf.get_option( + "mode.pandas_compatible" + ) and is_pandas_nullable_extension_dtype(to_dtype): + to_dtype = getattr(to_dtype, "numpy_dtype", to_dtype) + if getattr(to_dtype, "kind", None) == "U": + to_dtype = np.dtype(object) + if isinstance(to_dtype, cudf.CategoricalDtype): + to_dtype = to_dtype.categories.dtype - if not isinstance(dtype, numpy.dtype): + if not isinstance(to_dtype, numpy.dtype): raise NotImplementedError( - f"{dtype} cannot be exposed as an array" + f"{to_dtype} cannot be exposed as an array" ) if self.ndim == 1: - return to_array(self._columns[0], dtype) + return to_array( + self._columns[0], to_dtype if dtype is None else dtype + ) else: matrix = module.empty( - shape=(len(self), ncol), dtype=dtype, order="F" + shape=(len(self), ncol), + dtype=to_dtype if dtype is None else dtype, + order="F", ) for i, col in enumerate(self._columns): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more # suitable error. - matrix[:, i] = to_array(col, dtype) + matrix[:, i] = to_array( + col, to_dtype if dtype is None else dtype + ) return matrix @_performance_tracking @@ -735,7 +786,7 @@ def to_numpy( self, dtype: Dtype | None = None, copy: bool = True, - na_value=None, + na_value=no_default, ) -> numpy.ndarray: """Convert the Frame to a NumPy array. @@ -1466,10 +1517,11 @@ def searchsorted( values = [*values._columns] if len(values) != self._num_columns: raise ValueError("Mismatch number of columns to search for.") + if cudf.get_option("mode.pandas_compatible"): if any( - col.has_nulls() - and is_pandas_nullable_extension_dtype(col.dtype) + is_pandas_nullable_extension_dtype(col.dtype) + and col.has_nulls(include_nan=True) for col in self._columns ): raise ValueError( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e9c8b71fbe3..f09d15e091b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4935,7 +4935,7 @@ def repeat(self, repeats, axis=None): def astype( self, dtype: Dtype | dict[Hashable, Dtype], - copy: bool = False, + copy: bool | None = None, errors: Literal["raise", "ignore"] = "raise", ) -> Self: """Cast the object to the given dtype. 
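The signature change above makes `copy=None` the default for `astype`; the Series and DataFrame bodies resolve None to True. A minimal sketch of the user-visible effect, assuming a build with this change:

import cudf

s = cudf.Series([1, 2, 3])
t = s.astype("int64")  # same dtype: copy now defaults to True, so `t`
                       # gets its own data instead of aliasing `s`
t.iloc[0] = 99
assert s.iloc[0] == 1  # the original Series is left untouched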
@@ -6821,30 +6821,43 @@ def _append_new_row_inplace(col: ColumnBase, value: ScalarLike) -> None: """Append a scalar `value` to the end of `col` inplace. Cast to common type if possible """ - val_col = as_column(value, dtype=col.dtype if value is None else None) - if ( - cudf.get_option("mode.pandas_compatible") - and is_pandas_nullable_extension_dtype(col.dtype) - and val_col.dtype.kind == "f" - ): - # If the column is a pandas nullable extension type, we need to - # convert the nans to a nullable type as well. - val_col = val_col.nans_to_nulls() - if len(val_col) == val_col.null_count: - # If the column is all nulls, we can use the column dtype - # to avoid unnecessary casting. - val_col = val_col.astype(col.dtype) - to_type = find_common_type([val_col.dtype, col.dtype]) - if ( - cudf.get_option("mode.pandas_compatible") - and is_string_dtype(to_type) - and is_mixed_with_object_dtype(val_col, col) - ): - raise MixedTypeError("Cannot append mixed types") - if cudf.get_option("mode.pandas_compatible") and val_col.can_cast_safely( - col.dtype - ): + val_col = as_column( + value, + dtype=col.dtype + if ( + cudf.utils.utils._is_null_host_scalar(value) + or value in {None, np.nan} + ) + else None, + ) + if val_col.dtype.kind != "f" and val_col.can_cast_safely(col.dtype): + # If the value can be cast to the column dtype, do so + val_col = val_col.astype(col.dtype) to_type = col.dtype + else: + if ( + cudf.get_option("mode.pandas_compatible") + and is_pandas_nullable_extension_dtype(col.dtype) + and val_col.dtype.kind == "f" + ): + # If the column is a pandas nullable extension type, we need to + # convert the nans to a nullable type as well. + val_col = val_col.nans_to_nulls() + if len(val_col) == val_col.null_count: + # If the column is all nulls, we can use the column dtype + # to avoid unnecessary casting. + val_col = val_col.astype(col.dtype) + to_type = find_common_type([val_col.dtype, col.dtype]) + if ( + cudf.get_option("mode.pandas_compatible") + and is_string_dtype(to_type) + and is_mixed_with_object_dtype(val_col, col) + ): + raise MixedTypeError("Cannot append mixed types") + if cudf.get_option( + "mode.pandas_compatible" + ) and val_col.can_cast_safely(col.dtype): + to_type = col.dtype val_col = val_col.astype(to_type) old_col = col.astype(to_type) res_col = concat_columns([old_col, val_col])._with_type_metadata(to_type) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d07a8f76205..5c7d34da65a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -220,9 +220,21 @@ def __setitem__(self, key, value): and tmp_value.dtype.kind == "b" ) ): - to_dtype = find_common_type( - (tmp_value.dtype, self._frame.dtype) - ) + if not tmp_value.can_cast_safely( + self._frame.dtype + ) and is_pandas_nullable_extension_dtype( + self._frame.dtype + ): + raise TypeError( + f"Invalid value '{value!s}' for dtype " + f"'{self._frame.dtype}'" + ) + if tmp_value.can_cast_safely(self._frame.dtype): + to_dtype = self._frame.dtype + else: + to_dtype = find_common_type( + (tmp_value.dtype, self._frame.dtype) + ) tmp_value = tmp_value.astype(to_dtype) if to_dtype != self._frame.dtype: # Do not remove until pandas-3.0 support is added. 
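The stricter `__setitem__` above rejects values that cannot be safely cast into a nullable-dtype Series. A minimal sketch, assuming a build with this change (the error text mirrors pandas' own message):

import cudf

cudf.set_option("mode.pandas_compatible", True)
s = cudf.Series([1, 2, 3], dtype="Int64")
s[0] = 10  # casts safely, so the Int64 dtype is preserved
try:
    s[0] = 1.5  # a lossy cast into Int64 is no longer silently accepted
except TypeError as err:
    print(err)  # Invalid value '1.5' for dtype 'Int64'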
@@ -1967,9 +1979,11 @@ def data(self): def astype( self, dtype: Dtype | dict[Hashable, Dtype], - copy: bool = False, + copy: bool | None = None, errors: Literal["raise", "ignore"] = "raise", ) -> Self: + if copy is None: + copy = True if cudf.get_option("mode.pandas_compatible"): if inspect.isclass(dtype) and issubclass( dtype, pd.api.extensions.ExtensionDtype @@ -2960,6 +2974,10 @@ def unique(self): """ res = self._column.unique() if cudf.get_option("mode.pandas_compatible"): + if is_pandas_nullable_extension_dtype(self.dtype): + raise NotImplementedError( + "cudf does not support ExtensionArrays" + ) return res.values return Series._from_column(res, name=self.name) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index b4cb727721c..d4f52670cd1 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -680,6 +680,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), additional_attributes={ "_pa_array": _FastSlowAttribute("_pa_array", private=True), + "__array__": _FastSlowAttribute("__array__", private=True), }, ) @@ -702,6 +703,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), additional_attributes={ "_pa_array": _FastSlowAttribute("_pa_array", private=True), + "__array__": _FastSlowAttribute("__array__", private=True), }, ) @@ -1899,6 +1901,7 @@ def holiday_calendar_factory_wrapper(*args, **kwargs): slow_to_fast=_Unusable(), additional_attributes={ "_pa_array": _FastSlowAttribute("_pa_array", private=True), + "__array__": _FastSlowAttribute("__array__", private=True), }, ) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index fb62e100e40..73e2d371836 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -835,7 +835,10 @@ def __get__(self, instance, owner) -> Any: raise e if _is_function_or_method(slow_attr): - self._attr = _MethodProxy(fast_attr, slow_attr) + self._attr = _MethodProxy( + fast_attr, + slow_attr, + ) else: # for anything else, use a fast-slow attribute: self._attr, _ = _fast_slow_function_call( @@ -867,6 +870,7 @@ def __get__(self, instance, owner) -> Any: instance, self._name, )[0] + return self._attr diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index c8b645b6d26..1d57d38d787 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -88,6 +88,8 @@ def pytest_unconfigure(config): # TODO: Pass these tests with cudf.pandas enabled. 
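# Each entry below is a pytest node ID from the upstream pandas test
# suite that currently fails when cudf.pandas is active; the patched
# conftest consults this set so the remainder of the suite runs clean.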
NODEIDS_THAT_FAIL_WITH_CUDF_PANDAS = { + "tests/series/methods/test_reindex.py::test_reindexing_with_float64_NA_log", + "tests/copy_view/test_array.py::test_series_values[values]", "tests/api/test_api.py::test_pandas_array_alias", "tests/apply/test_frame_apply.py::test_agg_transform[axis='columns']", "tests/apply/test_frame_apply.py::test_agg_transform[axis='index']", @@ -795,59 +797,11 @@ def pytest_unconfigure(config): "tests/arrays/floating/test_comparison.py::TestComparisonOps::test_ufunc_with_out[Float32Dtype]", "tests/arrays/floating/test_comparison.py::TestComparisonOps::test_ufunc_with_out[Float64Dtype]", "tests/arrays/floating/test_construction.py::test_floating_array_constructor_copy", - "tests/arrays/floating/test_construction.py::test_series_from_float[Float32Dtype]", - "tests/arrays/floating/test_function.py::test_stat_method[kurtosis-kwargs4]", - "tests/arrays/floating/test_function.py::test_stat_method[skew-kwargs5]", "tests/arrays/floating/test_function.py::test_ufuncs_single[absolute]", "tests/arrays/floating/test_function.py::test_ufuncs_single[sign]", "tests/arrays/floating/test_function.py::test_value_counts_empty", - "tests/arrays/floating/test_function.py::test_value_counts_with_normalize", "tests/arrays/floating/test_repr.py::test_frame_repr[Float32Dtype]", "tests/arrays/floating/test_to_numpy.py::test_to_numpy_copy", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int16Dtype-__floordiv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int16Dtype-__rtruediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int16Dtype-__truediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int32Dtype-__floordiv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int32Dtype-__rtruediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int32Dtype-__truediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int64Dtype-__floordiv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int64Dtype-__rtruediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int64Dtype-__truediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int8Dtype-__floordiv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int8Dtype-__rtruediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[Int8Dtype-__truediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt16Dtype-__floordiv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt16Dtype-__rtruediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt16Dtype-__truediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt32Dtype-__floordiv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt32Dtype-__rtruediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt32Dtype-__truediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt64Dtype-__floordiv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt64Dtype-__rtruediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt64Dtype-__truediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt8Dtype-__floordiv__]", - 
"tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt8Dtype-__rtruediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arith_coerce_scalar[UInt8Dtype-__truediv__]", - "tests/arrays/integer/test_arithmetic.py::test_arithmetic_conversion[__rtruediv__-1.0]", - "tests/arrays/integer/test_arithmetic.py::test_arithmetic_conversion[__rtruediv__-other1]", - "tests/arrays/integer/test_arithmetic.py::test_arithmetic_conversion[__truediv__-1.0]", - "tests/arrays/integer/test_arithmetic.py::test_arithmetic_conversion[__truediv__-other1]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int16Dtype-__mul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int16Dtype-__rmul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int32Dtype-__mul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int32Dtype-__rmul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int64Dtype-__mul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int64Dtype-__rmul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int8Dtype-__mul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[Int8Dtype-__rmul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt16Dtype-__mul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt16Dtype-__rmul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt32Dtype-__mul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt32Dtype-__rmul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt64Dtype-__mul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt64Dtype-__rmul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt8Dtype-__mul__]", - "tests/arrays/integer/test_arithmetic.py::test_error_invalid_values[UInt8Dtype-__rmul__]", "tests/arrays/integer/test_arithmetic.py::test_values_multiplying_large_series_by_NA", "tests/arrays/integer/test_comparison.py::TestComparisonOps::test_ufunc_with_out[Int16Dtype]", "tests/arrays/integer/test_comparison.py::TestComparisonOps::test_ufunc_with_out[Int32Dtype]", @@ -857,79 +811,24 @@ def pytest_unconfigure(config): "tests/arrays/integer/test_comparison.py::TestComparisonOps::test_ufunc_with_out[UInt32Dtype]", "tests/arrays/integer/test_comparison.py::TestComparisonOps::test_ufunc_with_out[UInt64Dtype]", "tests/arrays/integer/test_comparison.py::TestComparisonOps::test_ufunc_with_out[UInt8Dtype]", - "tests/arrays/integer/test_construction.py::test_from_dtype_from_float[Int16Dtype]", - "tests/arrays/integer/test_construction.py::test_from_dtype_from_float[Int32Dtype]", - "tests/arrays/integer/test_construction.py::test_from_dtype_from_float[Int64Dtype]", - "tests/arrays/integer/test_construction.py::test_from_dtype_from_float[Int8Dtype]", - "tests/arrays/integer/test_construction.py::test_from_dtype_from_float[UInt16Dtype]", - "tests/arrays/integer/test_construction.py::test_from_dtype_from_float[UInt32Dtype]", - "tests/arrays/integer/test_construction.py::test_from_dtype_from_float[UInt64Dtype]", - "tests/arrays/integer/test_construction.py::test_from_dtype_from_float[UInt8Dtype]", "tests/arrays/integer/test_construction.py::test_integer_array_constructor_copy", - "tests/arrays/integer/test_dtypes.py::test_astype[data-Int16Dtype]", - 
"tests/arrays/integer/test_dtypes.py::test_astype[data-Int32Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data-Int64Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data-Int8Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data-UInt16Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data-UInt32Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data-UInt64Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data-UInt8Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data_missing-Int16Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data_missing-Int32Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data_missing-Int64Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data_missing-Int8Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data_missing-UInt16Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data_missing-UInt32Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data_missing-UInt64Dtype]", - "tests/arrays/integer/test_dtypes.py::test_astype[data_missing-UInt8Dtype]", "tests/arrays/integer/test_dtypes.py::test_astype_copy", "tests/arrays/integer/test_function.py::test_ufuncs_single_int[absolute]", "tests/arrays/integer/test_function.py::test_ufuncs_single_int[sign]", "tests/arrays/integer/test_function.py::test_value_counts_empty", - "tests/arrays/integer/test_function.py::test_value_counts_with_normalize", "tests/arrays/interval/test_interval.py::TestSetitem::test_set_na[float64]", "tests/arrays/interval/test_interval_pyarrow.py::test_arrow_array", "tests/arrays/interval/test_interval_pyarrow.py::test_arrow_array_missing", "tests/arrays/interval/test_interval_pyarrow.py::test_from_arrow_from_raw_struct_array", - "tests/arrays/masked/test_arithmetic.py::test_frame[Int16-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[Int16-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[Int32-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[Int32-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[Int64-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[Int64-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[Int8-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[Int8-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[UInt16-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[UInt16-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[UInt32-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[UInt32-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[UInt64-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[UInt64-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[UInt8-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_frame[UInt8-__truediv__]", + "tests/arrays/masked/test_arithmetic.py::test_frame[boolean-__floordiv__]", + "tests/arrays/masked/test_arithmetic.py::test_frame[boolean-__pow__]", + "tests/arrays/masked/test_arithmetic.py::test_frame[boolean-__rfloordiv__]", + "tests/arrays/masked/test_arithmetic.py::test_frame[boolean-__rpow__]", "tests/arrays/masked/test_arithmetic.py::test_frame[boolean-__rtruediv__]", "tests/arrays/masked/test_arithmetic.py::test_frame[boolean-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[Int16-__rtruediv__]", - 
"tests/arrays/masked/test_arithmetic.py::test_series[Int16-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[Int32-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[Int32-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[Int64-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[Int64-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[Int8-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[Int8-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[UInt16-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[UInt16-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[UInt32-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[UInt32-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[UInt64-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[UInt64-__truediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[UInt8-__rtruediv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[UInt8-__truediv__]", "tests/arrays/masked/test_arithmetic.py::test_series[boolean-__floordiv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[boolean-__mod__]", "tests/arrays/masked/test_arithmetic.py::test_series[boolean-__pow__]", "tests/arrays/masked/test_arithmetic.py::test_series[boolean-__rfloordiv__]", - "tests/arrays/masked/test_arithmetic.py::test_series[boolean-__rmod__]", "tests/arrays/masked/test_arithmetic.py::test_series[boolean-__rpow__]", "tests/arrays/masked/test_arithmetic.py::test_series[boolean-__rtruediv__]", "tests/arrays/masked/test_arithmetic.py::test_series[boolean-__truediv__]", @@ -1119,15 +1018,15 @@ def pytest_unconfigure(config): "tests/arrays/string_/test_string.py::test_repr[string=str[pyarrow]]", "tests/arrays/string_/test_string.py::test_repr[string=str[python]]", "tests/arrays/string_/test_string.py::test_to_numpy_returns_pdna_default[pyarrow_numpy]", - "tests/arrays/string_/test_string.py::test_to_numpy_returns_pdna_default[string=str[pyarrow]]", "tests/arrays/string_/test_string.py::test_to_numpy_returns_pdna_default[string=str[python]]", "tests/arrays/string_/test_string.py::test_value_counts_sort_false[string=str[pyarrow]]", "tests/arrays/string_/test_string.py::test_value_counts_sort_false[string=str[python]]", "tests/arrays/string_/test_string.py::test_value_counts_sort_false[string=string[pyarrow]]", "tests/arrays/string_/test_string.py::test_value_counts_with_normalize[pyarrow]", "tests/arrays/string_/test_string.py::test_value_counts_with_normalize[python]", + "tests/arrays/string_/test_string.py::test_value_counts_with_normalize[string=str[pyarrow]]", + "tests/arrays/string_/test_string.py::test_value_counts_with_normalize[string=str[python]]", "tests/arrays/string_/test_string.py::test_value_counts_with_normalize[string=string[pyarrow]]", - "tests/arrays/string_/test_string.py::test_value_counts_with_normalize[string=string[python]]", "tests/arrays/test_array.py::test_array_inference[data7-expected7]", "tests/arrays/test_datetimelike.py::TestDatetimeArray::test_array_i8_dtype['+01:15'-B]", "tests/arrays/test_datetimelike.py::TestDatetimeArray::test_array_i8_dtype['+01:15'-D]", @@ -1356,7 +1255,6 @@ def pytest_unconfigure(config): "tests/arrays/test_datetimelike.py::TestTimedeltaArray::test_searchsorted_castable_strings[pyarrow_numpy-series]", 
"tests/arrays/test_datetimelike.py::test_searchsorted_datetimelike_with_listlike_invalid_dtype[arg0-values1]", "tests/arrays/test_datetimes.py::TestDatetimeArray::test_array_interface", - "tests/arrays/test_datetimes.py::TestDatetimeArray::test_astype_copies[datetime64[ns]-datetime64[ns]]", "tests/arrays/test_datetimes.py::TestDatetimeArray::test_astype_to_same", "tests/arrays/test_datetimes.py::TestDatetimeArray::test_shift_fill_value", "tests/arrays/test_datetimes.py::TestDatetimeArrayComparisons::test_cmp_dt64_arraylike_tznaive[eq]", @@ -1630,78 +1528,6 @@ def pytest_unconfigure(config): "tests/base/test_misc.py::test_memory_usage_components_narrow_series[float16]", "tests/base/test_misc.py::test_ndarray_compat_properties[multi]", "tests/base/test_misc.py::test_ndarray_compat_properties[tuples]", - "tests/base/test_unique.py::test_nunique_null[float32-None]", - "tests/base/test_unique.py::test_nunique_null[float32-nan]", - "tests/base/test_unique.py::test_nunique_null[float32-series-None]", - "tests/base/test_unique.py::test_nunique_null[float32-series-nan]", - "tests/base/test_unique.py::test_nunique_null[float64-None]", - "tests/base/test_unique.py::test_nunique_null[float64-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-bool-dtype-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-bool-dtype-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-bool-object-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-bool-object-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-categorical-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-categorical-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-complex128-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-complex128-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-complex64-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-complex64-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-datetime-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-datetime-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-datetime-tz-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-datetime-tz-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-float32-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-float32-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-float64-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-float64-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-int16-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-int16-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-int32-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-int32-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-int64-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-int64-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-int8-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-int8-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-interval-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-interval-index-nan]", - 
"tests/base/test_unique.py::test_nunique_null[series-with-mi-with-dt64tz-level-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-mi-with-dt64tz-level-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-multi-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-multi-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-nullable_bool-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-nullable_bool-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-nullable_float-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-nullable_float-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-nullable_int-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-nullable_int-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-nullable_uint-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-nullable_uint-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-object-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-object-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-period-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-period-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-range-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-range-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-repeats-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-repeats-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-string-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-string-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-string-pyarrow-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-string-pyarrow-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-string-python-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-string-python-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-timedelta-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-timedelta-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-tuples-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-tuples-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-uint16-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-uint16-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-uint32-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-uint32-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-uint64-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-uint64-index-nan]", - "tests/base/test_unique.py::test_nunique_null[series-with-uint8-index-None]", - "tests/base/test_unique.py::test_nunique_null[series-with-uint8-index-nan]", "tests/base/test_unique.py::test_unique[multi]", "tests/base/test_unique.py::test_unique[tuples]", "tests/base/test_value_counts.py::test_value_counts[multi]", @@ -1843,7 +1669,6 @@ def pytest_unconfigure(config): "tests/copy_view/test_array.py::test_dataframe_values[values]", "tests/copy_view/test_array.py::test_series_array_ea_dtypes", 
"tests/copy_view/test_array.py::test_series_to_numpy", - "tests/copy_view/test_array.py::test_series_values[values]", "tests/copy_view/test_astype.py::test_astype_avoids_copy[Int64-Int64]", "tests/copy_view/test_astype.py::test_astype_avoids_copy[Int64-int64]", "tests/copy_view/test_astype.py::test_astype_avoids_copy[int64-Int64]", @@ -2507,8 +2332,19 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_argsort_missing[uint32]", "tests/extension/test_arrow.py::TestArrowArray::test_argsort_missing[uint64]", "tests/extension/test_arrow.py::TestArrowArray::test_argsort_missing[uint8]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__add__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__mul__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__pow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__radd__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__rmod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__rmul__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__rpow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__rtruediv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[bool-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[date32[day]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[date32[day]-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[date64[ms]-__rsub__]", @@ -2524,7 +2360,9 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[decimal128(7, 3)-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[decimal128(7, 3)-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[decimal128(7, 3)-__truediv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[double-__mod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[double-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[double-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[double-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[duration[ms]-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[duration[ms]-__radd__]", @@ -2542,19 +2380,29 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[duration[us]-__radd__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[duration[us]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[duration[us]-__sub__]", + 
"tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[float-__mod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[float-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[float-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[float-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int16-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int16-__mod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int16-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int16-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int16-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int32-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int32-__mod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int32-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int32-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int32-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int64-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int64-__mod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int64-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int64-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int64-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int8-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int8-__mod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int8-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int8-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[int8-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[timestamp[ms, tz=US/Eastern]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[timestamp[ms, tz=US/Eastern]-__sub__]", @@ -2589,16 +2437,31 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[timestamp[us]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[timestamp[us]-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint16-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint16-__mod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint16-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint16-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint16-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint32-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint32-__mod__]", 
"tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint32-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint32-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint32-__rtruediv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__mul__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__pow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__radd__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__rmod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__rmul__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__rtruediv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint64-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint8-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint8-__mod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint8-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint8-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint8-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[bool-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[bool-__floordiv__]", @@ -2611,9 +2474,7 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[bool-__rmul__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[bool-__rpow__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[bool-__rsub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[bool-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[bool-__sub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[bool-__truediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[date32[day]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[date32[day]-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[date64[ms]-__rsub__]", @@ -2686,9 +2547,7 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int16-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int16-__rmul__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int16-__rsub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int16-__rtruediv__]", 
"tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int16-__sub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int16-__truediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int32-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int32-__floordiv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int32-__mod__]", @@ -2698,9 +2557,7 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int32-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int32-__rmul__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int32-__rsub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int32-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int32-__sub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int32-__truediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int64-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int64-__floordiv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int64-__mod__]", @@ -2710,9 +2567,7 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int64-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int64-__rmul__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int64-__rsub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int64-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int64-__sub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int64-__truediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int8-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int8-__floordiv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int8-__mod__]", @@ -2722,9 +2577,7 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int8-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int8-__rmul__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int8-__rsub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int8-__rtruediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int8-__sub__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[int8-__truediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[timestamp[ms, tz=US/Eastern]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[timestamp[ms, tz=US/Eastern]-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[timestamp[ms, tz=US/Pacific]-__rsub__]", @@ -2765,8 +2618,6 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint16-__radd__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint16-__rmod__]", 
"tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint16-__rmul__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint16-__rtruediv__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint16-__truediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint32-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint32-__floordiv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint32-__mod__]", @@ -2775,8 +2626,6 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint32-__radd__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint32-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint32-__rmul__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint32-__rtruediv__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint32-__truediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint64-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint64-__floordiv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint64-__mod__]", @@ -2785,8 +2634,6 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint64-__radd__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint64-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint64-__rmul__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint64-__rtruediv__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint64-__truediv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint8-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint8-__floordiv__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint8-__mod__]", @@ -2795,8 +2642,18 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint8-__radd__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint8-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint8-__rmul__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint8-__rtruediv__]", - "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_array[uint8-__truediv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__add__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__mul__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__pow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__radd__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__rfloordiv__]", + 
"tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__rmod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__rmul__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__rpow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__rsub__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[bool-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[date32[day]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[date32[day]-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[date64[ms]-__rsub__]", @@ -2810,6 +2667,8 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[decimal128(7, 3)-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[decimal128(7, 3)-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[decimal128(7, 3)-__truediv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[double-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[double-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[duration[ms]-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[duration[ms]-__radd__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[duration[ms]-__rsub__]", @@ -2826,10 +2685,24 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[duration[us]-__radd__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[duration[us]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[duration[us]-__sub__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[float-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[float-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int16-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int16-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int16-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int16-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int32-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int32-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int32-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int32-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int64-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int64-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int64-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int64-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int8-__floordiv__]", + 
"tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int8-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int8-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[int8-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[timestamp[ms, tz=US/Eastern]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[timestamp[ms, tz=US/Eastern]-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[timestamp[ms, tz=US/Pacific]-__rsub__]", @@ -2863,49 +2736,30 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[timestamp[us]-__rsub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[timestamp[us]-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint16-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint16-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint16-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint16-__rmod__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint32-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint32-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint32-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint32-__rmod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__add__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__floordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__mul__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__pow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__radd__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__rmod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__rmul__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__rsub__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint64-__sub__]", "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint8-__floordiv__]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[binary]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[bool]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[date32[day]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[date64[ms]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[decimal128(7, 3)]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[double]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[duration[ms]]", - 
"tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[duration[ns]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[duration[s]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[duration[us]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[float]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[int16]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[int32]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[int64]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[int8]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[string]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[time32[ms]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[time32[s]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[time64[ns]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[time64[us]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[ms, tz=US/Eastern]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[ms, tz=US/Pacific]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[ms, tz=UTC]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[ms]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[ns, tz=US/Eastern]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[ns, tz=US/Pacific]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[ns, tz=UTC]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[ns]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[s, tz=US/Eastern]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[s, tz=US/Pacific]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[s, tz=UTC]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[s]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[us, tz=US/Eastern]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[us, tz=US/Pacific]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[us, tz=UTC]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[timestamp[us]]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[uint16]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[uint32]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[uint64]", - "tests/extension/test_arrow.py::TestArrowArray::test_array_interface_copy[uint8]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint8-__mod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint8-__rfloordiv__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint8-__rmod__]", + "tests/extension/test_arrow.py::TestArrowArray::test_array_interface[duration[ns]]", + 
"tests/extension/test_arrow.py::TestArrowArray::test_array_interface[timestamp[ns]]", "tests/extension/test_arrow.py::TestArrowArray::test_array_type[binary]", "tests/extension/test_arrow.py::TestArrowArray::test_array_type[bool]", "tests/extension/test_arrow.py::TestArrowArray::test_array_type[date32[day]]", @@ -3298,10 +3152,29 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[float-std-True]", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[float-sum-True]", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[float-var-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[uint8-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[uint16-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[uint32-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[uint64-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[int8-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[int16-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[int32-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[int64-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[float-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[double-skew-True]", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_boolean[timestamp[ms]-any-False]", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_boolean[timestamp[ns]-any-False]", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_boolean[timestamp[s]-any-False]", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_boolean[timestamp[us]-any-False]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[float-kurt-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[int16-kurt-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[int32-kurt-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[int64-kurt-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[int8-kurt-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint16-kurt-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint32-kurt-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint64-kurt-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint8-kurt-True]", "tests/extension/test_arrow.py::TestArrowArray::test_reindex_non_na_fill_value[bool]", "tests/extension/test_arrow.py::TestArrowArray::test_reindex_non_na_fill_value[double]", "tests/extension/test_arrow.py::TestArrowArray::test_reindex_non_na_fill_value[float]", @@ -3402,6 +3275,7 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_set_frame_expand_extension_with_regular[uint32]", "tests/extension/test_arrow.py::TestArrowArray::test_set_frame_expand_extension_with_regular[uint64]", "tests/extension/test_arrow.py::TestArrowArray::test_set_frame_expand_extension_with_regular[uint8]", + "tests/extension/test_arrow.py::TestArrowArray::test_setitem_2d_values[decimal128(7, 3)]", 
"tests/extension/test_arrow.py::TestArrowArray::test_setitem_expand_columns[bool]", "tests/extension/test_arrow.py::TestArrowArray::test_setitem_expand_columns[decimal128(7, 3)]", "tests/extension/test_arrow.py::TestArrowArray::test_setitem_expand_columns[double]", @@ -3444,37 +3318,6 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_setitem_frame_2d_values[uint32]", "tests/extension/test_arrow.py::TestArrowArray::test_setitem_frame_2d_values[uint64]", "tests/extension/test_arrow.py::TestArrowArray::test_setitem_frame_2d_values[uint8]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[decimal128(7, 3)]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[double]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[duration[ms]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[duration[ns]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[duration[s]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[duration[us]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[float]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[int16]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[int32]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[int64]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[int8]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[ms, tz=US/Eastern]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[ms, tz=US/Pacific]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[ms, tz=UTC]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[ms]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[ns, tz=US/Eastern]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[ns, tz=US/Pacific]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[ns, tz=UTC]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[ns]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[s, tz=US/Eastern]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[s, tz=US/Pacific]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[s, tz=UTC]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[s]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[us, tz=US/Eastern]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[us, tz=US/Pacific]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[us, tz=UTC]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[timestamp[us]]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[uint16]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[uint32]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[uint64]", - "tests/extension/test_arrow.py::TestArrowArray::test_to_numpy[uint8]", "tests/extension/test_arrow.py::TestArrowArray::test_unary_ufunc_dunder_equivalence[binary-positive]", "tests/extension/test_arrow.py::TestArrowArray::test_unary_ufunc_dunder_equivalence[bool-absolute]", "tests/extension/test_arrow.py::TestArrowArray::test_unary_ufunc_dunder_equivalence[bool-negative]", @@ -3632,16 +3475,22 @@ def pytest_unconfigure(config): 
"tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint8-series-index2]", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[uint8-series-index3]", "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_and", + "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_and_scalar[False-expected3]", "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_and_scalar[None-expected0]", "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_and_scalar[other1-expected1]", "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_or", "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_or_scalar[None-expected0]", + "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_or_scalar[True-expected2]", "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_or_scalar[other1-expected1]", - "tests/extension/test_arrow.py::TestLogicalOps::test_logical_masked_numpy[__and__-True]", - "tests/extension/test_arrow.py::TestLogicalOps::test_logical_masked_numpy[__or__-True]", - "tests/extension/test_arrow.py::TestLogicalOps::test_logical_masked_numpy[__xor__-False]", - "tests/extension/test_arrow.py::test_arrow_true_division_large_divisor[int64[pyarrow]]", - "tests/extension/test_arrow.py::test_arrow_true_division_large_divisor[uint64[pyarrow]]", + "tests/extension/test_arrow.py::test_arrow_floordiv_floating_0_divisor[float[pyarrow]]", + "tests/extension/test_arrow.py::test_arrow_floordiv_integral_invalid[pa_type0]", + "tests/extension/test_arrow.py::test_arrow_floordiv_integral_invalid[pa_type1]", + "tests/extension/test_arrow.py::test_arrow_floordiv_integral_invalid[pa_type2]", + "tests/extension/test_arrow.py::test_arrow_floordiv_integral_invalid[pa_type3]", + "tests/extension/test_arrow.py::test_arrow_floordiv_large_integral_result[uint64[pyarrow]]", + "tests/extension/test_arrow.py::test_arrow_floordiv_larger_divisor[pa_type0]", + "tests/extension/test_arrow.py::test_arrow_floordiv_larger_divisor[pa_type1]", + "tests/extension/test_arrow.py::test_arrow_floordiv_larger_divisor[pa_type2]", "tests/extension/test_arrow.py::test_astype_errors_ignore", "tests/extension/test_arrow.py::test_bitwise[pa_type0]", "tests/extension/test_arrow.py::test_bitwise[pa_type1]", @@ -3666,7 +3515,6 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::test_describe_datetime_data[pa_type6]", "tests/extension/test_arrow.py::test_describe_datetime_data[pa_type7]", "tests/extension/test_arrow.py::test_describe_datetime_data[pa_type9]", - "tests/extension/test_arrow.py::test_dt_components", "tests/extension/test_arrow.py::test_dt_day_month_name[day_name-Sunday]", "tests/extension/test_arrow.py::test_dt_day_month_name[month_name-January]", "tests/extension/test_arrow.py::test_dt_days_in_month[days_in_month]", @@ -3873,69 +3721,26 @@ def pytest_unconfigure(config): "tests/extension/test_masked.py::TestMaskedArrays::test_argsort_missing[UInt32Dtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_argsort_missing[UInt64Dtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_argsort_missing[UInt8Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Int16Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Int16Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Int32Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Int32Dtype-__truediv__]", - 
"tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Int64Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Int64Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Int8Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Int8Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[UInt16Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[UInt16Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[UInt32Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[UInt32Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[UInt64Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[UInt64Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[UInt8Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[UInt8Dtype-__truediv__]", + "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Float32Dtype-__floordiv__]", + "tests/extension/test_masked.py::TestMaskedArrays::test_arith_frame_with_scalar[Float64Dtype-__floordiv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[BooleanDtype-__floordiv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[BooleanDtype-__mod__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[BooleanDtype-__pow__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[BooleanDtype-__rfloordiv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[BooleanDtype-__rmod__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[BooleanDtype-__rpow__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[BooleanDtype-__rsub__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[BooleanDtype-__sub__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[BooleanDtype-__truediv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Float32Dtype-__floordiv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Float64Dtype-__floordiv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int16Dtype-__rpow__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int16Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int16Dtype-__truediv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int32Dtype-__rpow__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int32Dtype-__truediv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int64Dtype-__rpow__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int64Dtype-__truediv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int8Dtype-__rpow__]", - 
"tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int8Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[Int8Dtype-__truediv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt16Dtype-__rpow__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt16Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt16Dtype-__truediv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt32Dtype-__rpow__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt32Dtype-__truediv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt64Dtype-__rpow__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt64Dtype-__truediv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt8Dtype-__rpow__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt8Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_array[UInt8Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Int16Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Int16Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Int32Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Int32Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Int64Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Int64Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Int8Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Int8Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[UInt16Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[UInt16Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[UInt32Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[UInt32Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[UInt64Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[UInt64Dtype-__truediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[UInt8Dtype-__rtruediv__]", - "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[UInt8Dtype-__truediv__]", + "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Float32Dtype-__floordiv__]", + "tests/extension/test_masked.py::TestMaskedArrays::test_arith_series_with_scalar[Float64Dtype-__floordiv__]", "tests/extension/test_masked.py::TestMaskedArrays::test_array_interface_copy[BooleanDtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_array_interface_copy[Float32Dtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_array_interface_copy[Float64Dtype]", @@ -4071,7 
+3876,7 @@ def pytest_unconfigure(config): "tests/extension/test_masked.py::TestMaskedArrays::test_memory_usage[UInt32Dtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_memory_usage[UInt64Dtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_memory_usage[UInt8Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_reduce_series_numeric[Float32Dtype-skew-True]", + "tests/extension/test_masked.py::TestMaskedArrays::test_reduce_frame[Float32Dtype-skew-True]", "tests/extension/test_masked.py::TestMaskedArrays::test_reindex_non_na_fill_value[BooleanDtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_reindex_non_na_fill_value[Float32Dtype]", "tests/extension/test_masked.py::TestMaskedArrays::test_reindex_non_na_fill_value[Float64Dtype]", @@ -4181,17 +3986,6 @@ def pytest_unconfigure(config): "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt8Dtype-frame-index3]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt8Dtype-series-index2]", "tests/extension/test_masked.py::TestMaskedArrays::test_unstack[UInt8Dtype-series-index3]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[BooleanDtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[Float32Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[Float64Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[Int16Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[Int32Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[Int64Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[Int8Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[UInt16Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[UInt32Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[UInt64Dtype]", - "tests/extension/test_masked.py::TestMaskedArrays::test_value_counts_with_normalize[UInt8Dtype]", "tests/extension/test_numpy.py::Test2DCompat::test_copy_order[float]", "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_accumulate_series[float-cummax-False]", "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_accumulate_series[float-cummax-True]", @@ -4446,12 +4240,8 @@ def pytest_unconfigure(config): "tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=string[pyarrow]-True-__radd__]", "tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=string[python]-False-__radd__]", "tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=string[python]-True-__radd__]", - "tests/extension/test_string.py::TestStringArray::test_array_interface_copy[string=str[pyarrow]-False]", - "tests/extension/test_string.py::TestStringArray::test_array_interface_copy[string=str[pyarrow]-True]", "tests/extension/test_string.py::TestStringArray::test_array_interface_copy[string=str[python]-False]", "tests/extension/test_string.py::TestStringArray::test_array_interface_copy[string=str[python]-True]", - "tests/extension/test_string.py::TestStringArray::test_array_interface_copy[string=string[pyarrow]-False]", - "tests/extension/test_string.py::TestStringArray::test_array_interface_copy[string=string[pyarrow]-True]", 
"tests/extension/test_string.py::TestStringArray::test_array_interface_copy[string=string[python]-False]", "tests/extension/test_string.py::TestStringArray::test_array_interface_copy[string=string[python]-True]", "tests/extension/test_string.py::TestStringArray::test_astype_own_type[pyarrow-False-False]", @@ -4814,12 +4604,8 @@ def pytest_unconfigure(config): "tests/extension/test_string.py::TestStringArray::test_to_numpy[pyarrow_numpy-True]", "tests/extension/test_string.py::TestStringArray::test_to_numpy[python-False]", "tests/extension/test_string.py::TestStringArray::test_to_numpy[python-True]", - "tests/extension/test_string.py::TestStringArray::test_to_numpy[string=str[pyarrow]-False]", - "tests/extension/test_string.py::TestStringArray::test_to_numpy[string=str[pyarrow]-True]", "tests/extension/test_string.py::TestStringArray::test_to_numpy[string=str[python]-False]", "tests/extension/test_string.py::TestStringArray::test_to_numpy[string=str[python]-True]", - "tests/extension/test_string.py::TestStringArray::test_to_numpy[string=string[pyarrow]-False]", - "tests/extension/test_string.py::TestStringArray::test_to_numpy[string=string[pyarrow]-True]", "tests/extension/test_string.py::TestStringArray::test_to_numpy[string=string[python]-False]", "tests/extension/test_string.py::TestStringArray::test_to_numpy[string=string[python]-True]", "tests/extension/test_string.py::TestStringArray::test_unary_ufunc_dunder_equivalence[pyarrow-False-positive]", @@ -4910,10 +4696,12 @@ def pytest_unconfigure(config): "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[pyarrow-True]", "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[python-False]", "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[python-True]", + "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[string=str[pyarrow]-False]", + "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[string=str[pyarrow]-True]", + "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[string=str[python]-False]", + "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[string=str[python]-True]", "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[string=string[pyarrow]-False]", "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[string=string[pyarrow]-True]", - "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[string=string[python]-False]", - "tests/extension/test_string.py::TestStringArray::test_value_counts_with_normalize[string=string[python]-True]", "tests/frame/constructors/test_from_records.py::TestFromRecords::test_from_records_misc_brokenness3", "tests/frame/constructors/test_from_records.py::TestFromRecords::test_from_records_series_categorical_index", "tests/frame/constructors/test_from_records.py::TestFromRecords::test_from_records_series_list_dict", @@ -5083,13 +4871,10 @@ def pytest_unconfigure(config): "tests/frame/methods/test_astype.py::TestAstype::test_astype_arg_for_errors_dictlist", "tests/frame/methods/test_astype.py::TestAstype::test_astype_cast_nan_inf_int[inf-int32]", "tests/frame/methods/test_astype.py::TestAstype::test_astype_cast_nan_inf_int[inf-int64]", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_cast_nan_inf_int[nan-int32]", - 
"tests/frame/methods/test_astype.py::TestAstype::test_astype_cast_nan_inf_int[nan-int64]", "tests/frame/methods/test_astype.py::TestAstype::test_astype_dt64_to_string[DataFrame-None]", "tests/frame/methods/test_astype.py::TestAstype::test_astype_dt64_to_string[Series-None]", "tests/frame/methods/test_astype.py::TestAstype::test_astype_dt64tz", "tests/frame/methods/test_astype.py::TestAstype::test_astype_dt64tz_to_str", - "tests/frame/methods/test_astype.py::TestAstype::test_astype_extension_dtypes_duplicate_col[Int64]", "tests/frame/methods/test_astype.py::TestAstype::test_astype_str_float", "tests/frame/methods/test_astype.py::TestAstype::test_astype_td64_to_string[DataFrame]", "tests/frame/methods/test_astype.py::TestAstype::test_astype_td64_to_string[Series]", @@ -5125,14 +4910,13 @@ def pytest_unconfigure(config): "tests/frame/methods/test_count.py::TestDataFrameCount::test_count", "tests/frame/methods/test_cov_corr.py::TestDataFrameCorr::test_corr_item_cache", "tests/frame/methods/test_cov_corr.py::TestDataFrameCorr::test_corr_nooverlap[pearson]", + "tests/frame/methods/test_cov_corr.py::TestDataFrameCorr::test_corr_nullable_integer[pearson-other_column2-nullable_column0]", "tests/frame/methods/test_cov_corr.py::TestDataFrameCorr::test_corr_numeric_only[False-spearman]", "tests/frame/methods/test_cov_corr.py::TestDataFrameCorr::test_corr_scipy_method[pearson]", "tests/frame/methods/test_cov_corr.py::TestDataFrameCov::test_cov", "tests/frame/methods/test_describe.py::TestDataFrameDescribe::test_describe_datetime_columns", "tests/frame/methods/test_diff.py::TestDataFrameDiff::test_diff_all_int_dtype[int16]", "tests/frame/methods/test_diff.py::TestDataFrameDiff::test_diff_all_int_dtype[int8]", - "tests/frame/methods/test_diff.py::TestDataFrameDiff::test_diff_integer_na[0-expected0]", - "tests/frame/methods/test_diff.py::TestDataFrameDiff::test_diff_integer_na[1-expected1]", "tests/frame/methods/test_dot.py::TestDataFrameDot::test_dot_1d_ndarray", "tests/frame/methods/test_dot.py::TestDataFrameDot::test_dot_2d_ndarray", "tests/frame/methods/test_dot.py::TestDataFrameDot::test_dot_aligns", @@ -5290,7 +5074,6 @@ def pytest_unconfigure(config): "tests/frame/methods/test_rename.py::TestRename::test_rename_mapper_and_positional_arguments_raises", "tests/frame/methods/test_rename.py::TestRename::test_rename_no_mappings_raises", "tests/frame/methods/test_rename.py::TestRename::test_rename_nocopy", - "tests/frame/methods/test_replace.py::TestDataFrameReplace::test_replace_NA_with_None", "tests/frame/methods/test_replace.py::TestDataFrameReplace::test_replace_after_convert_dtypes", "tests/frame/methods/test_replace.py::TestDataFrameReplace::test_replace_dict_tuple_list_ordering_remains_the_same", "tests/frame/methods/test_replace.py::TestDataFrameReplace::test_replace_list_with_mixed_type[DataFrame-array-data0-to_replace0-value0-expected0]", @@ -5398,11 +5181,7 @@ def pytest_unconfigure(config): "tests/frame/methods/test_tz_localize.py::TestTZLocalize::test_tz_localize_copy_inplace_mutate[Series-True]", "tests/frame/methods/test_update.py::TestDataFrameUpdate::test_update_modify_view", "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_dropna_false[NoneType]", - "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_dropna_false[Decimal]", - "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_dropna_false[NaTType]", "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_dropna_false[NAType]", - 
"tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_subset[Decimal-columns1]", - "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_subset[NaTType-columns1]", "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_subset[NAType-columns1]", "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_empty", "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_empty_normalize", @@ -5466,22 +5245,6 @@ def pytest_unconfigure(config): "tests/frame/test_arithmetic.py::test_arithmetic_multiindex_align[python]", "tests/frame/test_arithmetic.py::test_bool_frame_mult_float[numexpr]", "tests/frame/test_arithmetic.py::test_bool_frame_mult_float[python]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[numexpr-Int16]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[numexpr-Int32]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[numexpr-Int64]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[numexpr-Int8]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[numexpr-UInt16]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[numexpr-UInt32]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[numexpr-UInt64]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[numexpr-UInt8]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[python-Int16]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[python-Int32]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[python-Int64]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[python-Int8]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[python-UInt16]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[python-UInt32]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[python-UInt64]", - "tests/frame/test_arithmetic.py::test_frame_sub_nullable_int[python-UInt8]", "tests/frame/test_arithmetic.py::test_frame_with_zero_len_series_corner_cases[numexpr]", "tests/frame/test_arithmetic.py::test_frame_with_zero_len_series_corner_cases[python]", "tests/frame/test_arithmetic.py::test_inplace_arithmetic_series_update[numexpr]", @@ -5520,7 +5283,6 @@ def pytest_unconfigure(config): "tests/frame/test_constructors.py::TestDataFrameConstructors::test_1d_object_array_does_not_copy", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_2d_object_array_does_not_copy", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_construct_from_list_of_datetimes", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_construct_ndarray_with_nas_and_int_dtype", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_construct_with_two_categoricalindex_series", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_with_nulls[arr0]", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_datetimes_with_nulls[arr1]", @@ -5534,6 +5296,7 @@ def pytest_unconfigure(config): "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_dict_with_index", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_dict_with_index_and_columns", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_dtype_nocast_view_2d_array", + "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_dtype_nocast_view_dataframe", 
"tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_error_msgs", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_for_list_with_dtypes", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_list_of_2d_raises", @@ -5542,7 +5305,6 @@ def pytest_unconfigure(config): "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_maskedarray_hardened", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_maskedarray_nonfloat", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_maskedrecarray_dtype", - "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_miscast_na_int_dtype", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_mixed_dict_and_Series", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_ndarray_copy", "tests/frame/test_constructors.py::TestDataFrameConstructors::test_constructor_rec", @@ -5828,9 +5590,7 @@ def pytest_unconfigure(config): "tests/frame/test_reductions.py::TestDataFrameAnalytics::test_idxmax_idxmin_convert_dtypes[idxmax-expected_value0]", "tests/frame/test_reductions.py::TestDataFrameAnalytics::test_idxmax_idxmin_convert_dtypes[idxmin-expected_value1]", "tests/frame/test_reductions.py::TestDataFrameAnalytics::test_mode_dropna[False-expected3]", - "tests/frame/test_reductions.py::TestDataFrameAnalytics::test_nunique", "tests/frame/test_reductions.py::TestDataFrameAnalytics::test_operators_timedelta64", - "tests/frame/test_reductions.py::TestDataFrameAnalytics::test_stat_op_calc", "tests/frame/test_reductions.py::TestDataFrameAnalytics::test_std_datetime64_with_nat[False-ms-values0]", "tests/frame/test_reductions.py::TestDataFrameAnalytics::test_std_datetime64_with_nat[False-ms-values1]", "tests/frame/test_reductions.py::TestDataFrameAnalytics::test_std_datetime64_with_nat[False-ns-values0]", @@ -5887,102 +5647,14 @@ def pytest_unconfigure(config): "tests/frame/test_reductions.py::TestEmptyDataFrameReductions::test_df_empty_nullable_min_count_0[sum-UInt8-0-UInt64]", "tests/frame/test_reductions.py::TestNuisanceColumns::test_any_all_categorical_dtype_nuisance_column[all]", "tests/frame/test_reductions.py::TestNuisanceColumns::test_any_all_categorical_dtype_nuisance_column[any]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-0-kurt]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-0-kurtosis]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-0-sem]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-0-skew]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-2-kurt]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-2-kurtosis]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-2-sem]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-2-skew]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Float32-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int16-2-prod]", - 
"tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int16-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int16-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int32-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int32-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int32-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int64-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int64-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int64-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int8-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int8-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-Int8-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt16-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt16-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt16-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt32-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt32-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt32-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt64-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt64-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt64-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt8-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt8-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[False-UInt8-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-0-kurt]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-0-kurtosis]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-0-sem]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-0-skew]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-2-kurt]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-2-kurtosis]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-2-sem]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-2-skew]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float32-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float64-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Float64-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int16-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int16-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int16-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int16-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int16-2-sum]", - 
"tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int32-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int32-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int32-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int32-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int32-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int64-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int64-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int64-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int64-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int64-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int8-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int8-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int8-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int8-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-Int8-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt16-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt16-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt16-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt16-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt16-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt32-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt32-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt32-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt32-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt32-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt64-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt64-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt64-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt64-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt64-2-sum]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt8-0-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt8-2-median]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt8-2-prod]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt8-2-product]", - "tests/frame/test_reductions.py::test_numeric_ea_axis_1[True-UInt8-2-sum]", + "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[Float64-False-kurt]", "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[Float64-False-mean]", "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[Float64-False-median]", + "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[Float64-False-skew]", + "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[Float64-True-kurt]", "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[Float64-True-mean]", "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[Float64-True-median]", + 
"tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[Float64-True-skew]", "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[float64-False-kurt]", "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[float64-False-mean]", "tests/frame/test_reductions.py::test_reduction_axis_none_returns_scalar[float64-False-median]", @@ -6979,8 +6651,6 @@ def pytest_unconfigure(config): "tests/groupby/methods/test_nth.py::test_nth_after_selection[any-selection2]", "tests/groupby/methods/test_nth.py::test_nth_column_order", "tests/groupby/methods/test_nth.py::test_nth_indexed", - "tests/groupby/methods/test_nth.py::test_groupby_last_first_nth_with_none[Decimal-nth]", - "tests/groupby/methods/test_nth.py::test_groupby_last_first_nth_with_none[NaTType-nth]", "tests/groupby/methods/test_nth.py::test_groupby_last_first_nth_with_none[NAType-nth]", "tests/groupby/methods/test_nth.py::test_groupby_last_first_nth_with_none[NoneType-nth]", "tests/groupby/methods/test_nth.py::test_nth_multi_grouper", @@ -7153,10 +6823,6 @@ def pytest_unconfigure(config): "tests/groupby/methods/test_value_counts.py::test_compound[string=string[python]-True-True-expected_rows2-expected_count2-expected_group_size2-True]", "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NoneType-False-count-False-expected_data1-expected_index1]", "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NoneType-True-proportion-False-expected_data1-expected_index1]", - "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[Decimal-False-count-False-expected_data1-expected_index1]", - "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[Decimal-True-proportion-False-expected_data1-expected_index1]", - "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NaTType-False-count-False-expected_data1-expected_index1]", - "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NaTType-True-proportion-False-expected_data1-expected_index1]", "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NAType-False-count-False-expected_data1-expected_index1]", "tests/groupby/methods/test_value_counts.py::test_data_frame_value_counts_dropna[NAType-True-proportion-False-expected_data1-expected_index1]", "tests/groupby/methods/test_value_counts.py::test_dropna_combinations[True-False-expected_rows2-expected_values2]", @@ -7270,12 +6936,6 @@ def pytest_unconfigure(config): "tests/groupby/test_counting.py::test_count_arrow_string_array[string[pyarrow]]", "tests/groupby/test_counting.py::test_count_arrow_string_array[string[pyarrow_numpy]]", "tests/groupby/test_counting.py::test_count_arrow_string_array[string[python]]", - "tests/groupby/test_cumulative.py::test_cummin[Int64]", - "tests/groupby/test_cumulative.py::test_cummin[np.int64]", - "tests/groupby/test_cumulative.py::test_cummin_max_all_nan_column[Int64-cummax]", - "tests/groupby/test_cumulative.py::test_cummin_max_all_nan_column[Int64-cummin]", - "tests/groupby/test_cumulative.py::test_cummin_max_all_nan_column[UInt64-cummax]", - "tests/groupby/test_cumulative.py::test_cummin_max_all_nan_column[UInt64-cummin]", "tests/groupby/test_cumulative.py::test_cummin_max_all_nan_column[boolean-cummax]", "tests/groupby/test_cumulative.py::test_cummin_max_all_nan_column[boolean-cummin]", "tests/groupby/test_cumulative.py::test_cython_api2", @@ -7457,23 +7117,9 @@ def 
pytest_unconfigure(config): "tests/groupby/test_groupby_dropna.py::test_groupby_apply_with_dropna_for_multi_index[dropna_true_no_nan]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_agg[False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_one_group[NoneType-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_one_group[Decimal-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_one_group[NaTType-False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_one_group[NAType-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[Decimal-Decimal-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[Decimal-NaTType-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[Decimal-NAType-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[Decimal-NoneType-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NaTType-Decimal-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NaTType-NaTType-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NaTType-NAType-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NaTType-NoneType-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NAType-Decimal-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NAType-NaTType-False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NAType-NAType-False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NAType-NoneType-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NoneType-Decimal-False-tuples1-outputs1]", - "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NoneType-NaTType-False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NoneType-NAType-False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_groupby_dropna_multi_index_dataframe_nan_in_two_groups[NoneType-NoneType-False-tuples1-outputs1]", "tests/groupby/test_groupby_dropna.py::test_grouper_dropna_propagation[False]", @@ -8214,10 +7860,6 @@ def pytest_unconfigure(config): "tests/indexes/numeric/test_astype.py::TestAstype::test_cannot_cast_inf_to_int[inf-int32]", "tests/indexes/numeric/test_astype.py::TestAstype::test_cannot_cast_inf_to_int[inf-int64]", "tests/indexes/numeric/test_astype.py::TestAstype::test_cannot_cast_inf_to_int[inf-int]", - 
"tests/indexes/numeric/test_astype.py::TestAstype::test_cannot_cast_inf_to_int[nan-int16]", - "tests/indexes/numeric/test_astype.py::TestAstype::test_cannot_cast_inf_to_int[nan-int32]", - "tests/indexes/numeric/test_astype.py::TestAstype::test_cannot_cast_inf_to_int[nan-int64]", - "tests/indexes/numeric/test_astype.py::TestAstype::test_cannot_cast_inf_to_int[nan-int]", "tests/indexes/numeric/test_indexing.py::TestContains::test_contains_float64_nans", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_invalid", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_nearest_decreasing[backfill-expected1]", @@ -8246,11 +7888,7 @@ def pytest_unconfigure(config): "tests/indexes/numeric/test_numeric.py::TestIntNumericIndex::test_constructor[int16]", "tests/indexes/numeric/test_numeric.py::TestIntNumericIndex::test_constructor[int32]", "tests/indexes/numeric/test_numeric.py::TestIntNumericIndex::test_constructor[int64]", - "tests/indexes/object/test_indexing.py::TestGetIndexerNonUnique::test_get_indexer_non_unique_nas[Decimal]", - "tests/indexes/object/test_indexing.py::TestGetIndexerNonUnique::test_get_indexer_non_unique_nas[NaTType]", "tests/indexes/object/test_indexing.py::TestGetIndexerNonUnique::test_get_indexer_non_unique_nas[NAType]", - "tests/indexes/object/test_indexing.py::TestGetIndexer::test_get_indexer_with_NA_values[None-unique_nulls_fixture22]", - "tests/indexes/object/test_indexing.py::TestGetIndexer::test_get_indexer_with_NA_values[unique_nulls_fixture2-None]", "tests/indexes/object/test_indexing.py::TestSliceLocs::test_slice_locs_negative_step[in_slice13--object]", "tests/indexes/object/test_indexing.py::TestSliceLocs::test_slice_locs_negative_step[in_slice13--string[pyarrow_numpy]]", "tests/indexes/period/test_constructors.py::TestPeriodIndex::test_constructor_fromarraylike", @@ -8324,8 +7962,6 @@ def pytest_unconfigure(config): "tests/indexes/string/test_indexing.py::TestGetIndexer::test_get_indexer_strings_raises[string=string[pyarrow]]", "tests/indexes/string/test_indexing.py::TestGetIndexer::test_get_indexer_strings_raises[string=string[python]]", "tests/indexes/string/test_indexing.py::TestGetIndexerNonUnique::test_get_indexer_non_unique_nas[string=object-null3]", - "tests/indexes/string/test_indexing.py::TestGetLoc::test_get_loc_missing[string=object-Decimal]", - "tests/indexes/string/test_indexing.py::TestGetLoc::test_get_loc_missing[string=object-NaTType]", "tests/indexes/string/test_indexing.py::TestGetLoc::test_get_loc_missing[string=object-NAType]", "tests/indexes/string/test_indexing.py::TestSliceLocs::test_slice_locs_negative_step[string=object-in_slice13-]", "tests/indexes/string/test_indexing.py::TestSliceLocs::test_slice_locs_negative_step[string=str[pyarrow]-in_slice13-]", @@ -8378,8 +8014,6 @@ def pytest_unconfigure(config): "tests/indexes/test_base.py::TestIndex::test_empty_fancy_raises[uint32]", "tests/indexes/test_base.py::TestIndex::test_empty_fancy_raises[uint64]", "tests/indexes/test_base.py::TestIndex::test_equals_op_mismatched_multiindex_raises[index0]", - "tests/indexes/test_base.py::TestIndex::test_format_missing[Decimal-vals1]", - "tests/indexes/test_base.py::TestIndex::test_format_missing[NaTType-vals1]", "tests/indexes/test_base.py::TestIndex::test_format_missing[NAType-vals1]", "tests/indexes/test_base.py::TestIndex::test_is_", "tests/indexes/test_base.py::TestIndex::test_isin_level_kwarg_bad_label_raises[bool-dtype-nan]", @@ -8437,10 +8071,6 @@ def pytest_unconfigure(config): 
"tests/indexes/test_base.py::TestIndex::test_isin_nan_common_float64[NoneType-float32]", "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_float64[NoneType-float64]", "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_float64[NoneType-float]", - "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[Decimal-Decimal]", - "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[Decimal-NoneType]", - "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[NaTType-NaTType]", - "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[NaTType-NoneType]", "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[NAType-NAType]", "tests/indexes/test_base.py::TestIndex::test_isin_nan_common_object[NAType-NoneType]", "tests/indexes/test_base.py::TestIndex::test_map_defaultdict", @@ -9013,7 +8643,6 @@ def pytest_unconfigure(config): "tests/indexing/test_iat.py::test_iat_setitem_item_cache_cleared[iat]", "tests/indexing/test_iat.py::test_iat_setitem_item_cache_cleared[iloc]", "tests/indexing/test_iloc.py::TestILocErrors::test_iloc_getitem_setitem_fancy_exceptions", - "tests/indexing/test_iloc.py::TestILocSeries::test_iloc_nullable_int64_size_1_nan", "tests/indexing/test_iloc.py::TestiLocBaseIndependent::test_identity_slice_returns_new_object", "tests/indexing/test_iloc.py::TestiLocBaseIndependent::test_iloc_getitem_doc_issue", "tests/indexing/test_iloc.py::TestiLocBaseIndependent::test_iloc_getitem_int_single_ea_block_view", @@ -10102,6 +9731,7 @@ def pytest_unconfigure(config): "tests/indexing/test_loc.py::TestLocWithMultiIndex::test_loc_getitem_multilevel_index_order[index-keys5-expected5]", "tests/indexing/test_loc.py::TestLocWithMultiIndex::test_loc_getitem_multilevel_index_order[index-keys6-expected6]", "tests/indexing/test_loc.py::TestLocWithMultiIndex::test_loc_multiindex_null_slice_na_level", + "tests/indexing/test_loc.py::TestLocWithMultiIndex::test_loc_set_nan_in_categorical_series[Float64]", "tests/indexing/test_loc.py::test_loc_datetimelike_mismatched_dtypes", "tests/indexing/test_loc.py::test_loc_getitem_multiindex_tuple_level", "tests/indexing/test_loc.py::test_loc_setitem_uint8_upcast[300]", @@ -10242,7 +9872,6 @@ def pytest_unconfigure(config): "tests/io/formats/style/test_html.py::test_sticky_levels[one-False-True]", "tests/io/formats/style/test_html.py::test_sticky_levels[one-True-False]", "tests/io/formats/style/test_html.py::test_sticky_levels[one-True-True]", - "tests/io/formats/style/test_matplotlib.py::test_background_gradient_nullable_dtypes", "tests/io/formats/style/test_style.py::TestStyler::test_apply_axis", "tests/io/formats/style/test_style.py::TestStyler::test_caption", "tests/io/formats/style/test_style.py::TestStyler::test_export", @@ -10298,7 +9927,6 @@ def pytest_unconfigure(config): "tests/io/formats/test_to_csv.py::TestToCSV::test_to_csv_different_datetime_formats", "tests/io/formats/test_to_csv.py::TestToCSV::test_to_csv_multi_index", "tests/io/formats/test_to_csv.py::TestToCSV::test_to_csv_na_rep", - "tests/io/formats/test_to_csv.py::TestToCSV::test_to_csv_na_rep_long_string[Int64]", "tests/io/formats/test_to_csv.py::TestToCSV::test_to_csv_with_single_column", "tests/io/formats/test_to_html.py::test_to_html_truncate", "tests/io/formats/test_to_html.py::test_to_html_truncation_index_false_max_cols[True-gh22783_named_columns_index-0]", @@ -10524,8 +10152,6 @@ def pytest_unconfigure(config): "tests/io/test_fsspec.py::test_json_options[zip]", 
"tests/io/test_fsspec.py::test_json_options[zstd]", "tests/io/test_fsspec.py::test_non_fsspec_options", - "tests/io/test_html.py::TestReadHtml::test_extract_links[bs4-header]", - "tests/io/test_html.py::TestReadHtml::test_extract_links[lxml-header]", "tests/io/test_orc.py::test_orc_reader_basic", "tests/io/test_orc.py::test_orc_reader_date_high", "tests/io/test_orc.py::test_orc_reader_date_low", @@ -10687,10 +10313,6 @@ def pytest_unconfigure(config): "tests/io/xml/test_xml.py::test_empty_stylesheet[1]", "tests/io/xml/test_xml.py::test_wrong_file_path[etree]", "tests/io/xml/test_xml.py::test_wrong_file_path[lxml]", - "tests/io/xml/test_xml_dtypes.py::test_dtype_nullable_int[etree]", - "tests/io/xml/test_xml_dtypes.py::test_dtype_nullable_int[lxml]", - "tests/io/xml/test_xml_dtypes.py::test_dtypes_with_names[etree]", - "tests/io/xml/test_xml_dtypes.py::test_dtypes_with_names[lxml]", "tests/libs/test_libalgos.py::test_ensure_platform_int", "tests/plotting/frame/test_frame.py::TestDataFramePlots::test_unordered_ts", "tests/plotting/test_boxplot_method.py::TestDataFramePlots::test_boxplot_legacy1_series", @@ -11458,7 +11080,6 @@ def pytest_unconfigure(config): "tests/reshape/test_get_dummies.py::TestGetDummies::test_get_dummies_ea_dtype[string=string[python]-string]", "tests/reshape/test_get_dummies.py::TestGetDummies::test_get_dummies_int_int", "tests/reshape/test_melt.py::TestMelt::test_melt_ea_columns", - "tests/reshape/test_melt.py::TestMelt::test_melt_ea_dtype[Int8]", "tests/reshape/test_melt.py::TestMelt::test_multiindex", "tests/reshape/test_melt.py::TestMelt::test_vars_work_with_multiindex", "tests/reshape/test_melt.py::TestWideToLong::test_invalid_separator", @@ -11935,8 +11556,6 @@ def pytest_unconfigure(config): "tests/series/indexing/test_setitem.py::TestSeriesNoneCoercion::test_series_where[obj1-expected1-None]", "tests/series/indexing/test_setitem.py::TestSeriesNoneCoercion::test_series_where[obj2-expected2-None]", "tests/series/indexing/test_setitem.py::TestSeriesNoneCoercion::test_series_where[obj3-expected3-None]", - "tests/series/indexing/test_setitem.py::TestSetitemBooleanMask::test_setitem_boolean_nullable_int_types[Float32]", - "tests/series/indexing/test_setitem.py::TestSetitemBooleanMask::test_setitem_boolean_nullable_int_types[Float64]", "tests/series/indexing/test_setitem.py::TestSetitemBooleanMask::test_setitem_boolean_nullable_int_types[Int16]", "tests/series/indexing/test_setitem.py::TestSetitemBooleanMask::test_setitem_boolean_nullable_int_types[Int32]", "tests/series/indexing/test_setitem.py::TestSetitemBooleanMask::test_setitem_boolean_nullable_int_types[Int64]", @@ -12490,8 +12109,7 @@ def pytest_unconfigure(config): "tests/series/indexing/test_setitem.py::TestSetitemTimedelta64IntoNumeric::test_mask_key[int-loc]", "tests/series/indexing/test_setitem.py::TestSetitemTimedelta64IntoNumeric::test_mask_key[int-setitem]", "tests/series/indexing/test_setitem.py::TestSetitemTimedelta64IntoNumeric::test_series_where[int]", - "tests/series/indexing/test_setitem.py::TestSetitemWithExpansion::test_setitem_enlargement_object_none[Decimal]", - "tests/series/indexing/test_setitem.py::TestSetitemWithExpansion::test_setitem_enlargement_object_none[NaTType]", + "tests/series/indexing/test_setitem.py::TestSetitemWithExpansion::test_setitem_enlarge_with_na[na5-target_na5-int64-object-2-None]", "tests/series/indexing/test_setitem.py::TestSetitemWithExpansion::test_setitem_enlargement_object_none[NAType]", 
"tests/series/indexing/test_setitem.py::TestSmallIntegerSetitemUpcast::test_int_key[iloc-4611686018427387904]", "tests/series/indexing/test_setitem.py::TestSmallIntegerSetitemUpcast::test_int_key[iloc-8589934593.0]", @@ -12532,9 +12150,7 @@ def pytest_unconfigure(config): "tests/series/methods/test_argsort.py::TestSeriesArgsort::test_argsort_dt64[us]", "tests/series/methods/test_argsort.py::TestSeriesArgsort::test_argsort_stable", "tests/series/methods/test_astype.py::TestAstype::test_astype_cast_nan_inf_int[int32-inf]", - "tests/series/methods/test_astype.py::TestAstype::test_astype_cast_nan_inf_int[int32-nan]", "tests/series/methods/test_astype.py::TestAstype::test_astype_cast_nan_inf_int[int64-inf]", - "tests/series/methods/test_astype.py::TestAstype::test_astype_cast_nan_inf_int[int64-nan]", "tests/series/methods/test_astype.py::TestAstype::test_astype_float_to_uint_negatives_raise[float-uint16]", "tests/series/methods/test_astype.py::TestAstype::test_astype_float_to_uint_negatives_raise[float-uint32]", "tests/series/methods/test_astype.py::TestAstype::test_astype_float_to_uint_negatives_raise[float-uint64]", @@ -12849,7 +12465,6 @@ def pytest_unconfigure(config): "tests/series/methods/test_nlargest.py::TestSeriesNLargestNSmallest::test_nlargest_boundary_datetimelike[nsmallest-timedelta64[ns]]", "tests/series/methods/test_nlargest.py::TestSeriesNLargestNSmallest::test_nlargest_error[r4]", "tests/series/methods/test_nlargest.py::TestSeriesNLargestNSmallest::test_nlargest_misc", - "tests/series/methods/test_nunique.py::test_nunique", "tests/series/methods/test_quantile.py::TestSeriesQuantile::test_quantile_all_na[Int16]", "tests/series/methods/test_quantile.py::TestSeriesQuantile::test_quantile_all_na[Int32]", "tests/series/methods/test_quantile.py::TestSeriesQuantile::test_quantile_all_na[Int64]", @@ -12892,23 +12507,18 @@ def pytest_unconfigure(config): "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_dense_method[string[pyarrow]-ser5-exp5]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_dense_method[string[pyarrow]-ser6-exp6]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_dense_method[string[pyarrow]-ser7-exp7]", - "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[average-Int64]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[average-float64[pyarrow]]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[average-int64[pyarrow]]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[average-string[pyarrow]]", - "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[dense-Int64]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[dense-float64[pyarrow]]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[dense-int64[pyarrow]]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[dense-string[pyarrow]]", - "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[first-Int64]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[first-float64[pyarrow]]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[first-int64[pyarrow]]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[first-string[pyarrow]]", - "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[max-Int64]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[max-float64[pyarrow]]", 
"tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[max-int64[pyarrow]]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[max-string[pyarrow]]", - "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[min-Int64]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[min-float64[pyarrow]]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[min-int64[pyarrow]]", "tests/series/methods/test_rank.py::TestSeriesRank::test_rank_descending[min-string[pyarrow]]", @@ -13101,10 +12711,8 @@ def pytest_unconfigure(config): "tests/series/methods/test_reindex.py::test_reindex_fill_value_datetimelike_upcast[0-datetime64[ns]]", "tests/series/methods/test_reindex.py::test_reindex_fill_value_datetimelike_upcast[0-timedelta64[ns]]", "tests/series/methods/test_reindex.py::test_reindex_pad2", - "tests/series/methods/test_reindex.py::test_reindexing_with_float64_NA_log", "tests/series/methods/test_rename.py::TestRename::test_rename_copy_false", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_categorical_single", - "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_change_dtype_series", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_datetime64", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_dtype[Float64-input_data4-to_replace4-expected_data4]", "tests/series/methods/test_replace.py::TestSeriesReplace::test_replace_dtype[Int64-input_data2-to_replace2-expected_data2]", @@ -13157,8 +12765,6 @@ def pytest_unconfigure(config): "tests/series/test_arithmetic.py::TestSeriesArithmetic::test_alignment_doesnt_change_tz[python]", "tests/series/test_arithmetic.py::TestSeriesArithmetic::test_arithmetic_with_duplicate_index[numexpr]", "tests/series/test_arithmetic.py::TestSeriesArithmetic::test_arithmetic_with_duplicate_index[python]", - "tests/series/test_arithmetic.py::TestSeriesArithmetic::test_mask_div_propagate_na_for_non_na_dtype[numexpr]", - "tests/series/test_arithmetic.py::TestSeriesArithmetic::test_mask_div_propagate_na_for_non_na_dtype[python]", "tests/series/test_arithmetic.py::TestSeriesComparison::test_comparison_tuples[numexpr]", "tests/series/test_arithmetic.py::TestSeriesComparison::test_comparison_tuples[python]", "tests/series/test_arithmetic.py::TestSeriesFlexArithmetic::test_flex_add_scalar_fill_value[numexpr]", @@ -13321,22 +12927,12 @@ def pytest_unconfigure(config): "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_dtype_timedelta64", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_floating_data_int_dtype[DataFrame]", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_floating_data_int_dtype[Series]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_invalid_coerce_ints_with_float_nan[int16]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_invalid_coerce_ints_with_float_nan[int32]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_invalid_coerce_ints_with_float_nan[int64]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_invalid_coerce_ints_with_float_nan[int8]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_invalid_coerce_ints_with_float_nan[int]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_invalid_coerce_ints_with_float_nan[uint16]", - 
"tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_invalid_coerce_ints_with_float_nan[uint32]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_invalid_coerce_ints_with_float_nan[uint64]", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_invalid_coerce_ints_with_float_nan[uint8]", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_list_of_tuples", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_maskedarray", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_no_data_string_type", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_no_partial_datetime_casting", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_pass_none", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_range_overflows", - "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_sanitize", "tests/series/test_constructors.py::TestSeriesConstructors::test_constructor_tuple_of_tuples", "tests/series/test_constructors.py::TestSeriesConstructors::test_convert_non_ns", "tests/series/test_constructors.py::TestSeriesConstructors::test_invalid_dtype", @@ -13640,7 +13236,6 @@ def pytest_unconfigure(config): "tests/test_algos.py::TestUnique::test_timedelta64_dtype_array_returned", "tests/test_algos.py::TestValueCounts::test_categorical_nans", "tests/test_algos.py::TestValueCounts::test_value_counts_dropna", - "tests/test_algos.py::TestValueCounts::test_value_counts_normalized[M8[ns]]", "tests/test_algos.py::TestValueCounts::test_value_counts_normalized[float64]", "tests/test_common.py::test_serializable[obj0]", "tests/test_common.py::test_standardize_mapping", @@ -13845,7 +13440,6 @@ def pytest_unconfigure(config): "tests/tools/test_to_datetime.py::TestToDatetimeUnit::test_to_datetime_unit_with_nulls[-9223372036854775808]", "tests/tools/test_to_datetime.py::TestToDatetimeUnit::test_unit_rounding[False]", "tests/tools/test_to_datetime.py::TestToDatetimeUnit::test_unit_rounding[True]", - "tests/tools/test_to_datetime.py::test_nullable_integer_to_datetime", "tests/tools/test_to_numeric.py::test_downcast_basic[kwargs0-int64-data2]", "tests/tools/test_to_numeric.py::test_downcast_basic[kwargs1-int64-data2]", "tests/tools/test_to_numeric.py::test_downcast_basic[kwargs2-f-data2]", @@ -14468,6 +14062,7 @@ def pytest_unconfigure(config): "tests/util/test_assert_produces_warning.py::test_right_category_wrong_match_raises[tuple7]", "tests/util/test_assert_series_equal.py::test_allows_duplicate_labels", "tests/util/test_assert_series_equal.py::test_assert_series_equal_extension_dtype_mismatch", + "tests/util/test_assert_series_equal.py::test_large_unequal_ints[Int64]", "tests/util/test_assert_series_equal.py::test_large_unequal_ints[int64]", "tests/util/test_assert_series_equal.py::test_series_equal_series_type", "tests/util/test_hashing.py::test_hash_with_tuple[data0-result_data0]", @@ -15148,6 +14743,15 @@ def pytest_unconfigure(config): r"tests/util/test_assert_series_equal.py::test_series_equal_index_dtype[True-s11-s21-MultiIndex level \\[0\\] are different]", } NODEIDS_THAT_XPASS_WITH_CUDF_PANDAS = { + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint8-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint16-skew-True]", + 
"tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint32-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint64-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[int8-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[int16-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[int32-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[int64-skew-True]", + "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[float-skew-True]", "tests/io/json/test_readlines.py::test_readjson_unicode[pyarrow]", "tests/series/methods/test_info.py::test_info_memory_usage_deep_pypy", "tests/arrays/string_/test_string.py::test_add_2d[pyarrow]", @@ -15325,6 +14929,12 @@ def pytest_unconfigure(config): # TODO: Investigate why sometimes these fail NODEIDS_THAT_FLAKY_XFAIL_WITH_CUDF_PANDAS = { + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint8-__rpow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint16-__rpow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_series_with_scalar[uint32-__rpow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint8-__rpow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint16-__rpow__]", + "tests/extension/test_arrow.py::TestArrowArray::test_arith_frame_with_scalar[uint32-__rpow__]", r"tests/tools/test_to_datetime.py::TestToDatetimeMisc::test_to_datetime_iso8601_fails[True-2012-01-01 10:00-%Y-%m-%d %H:%M:%S]", "tests/indexing/test_chaining_and_caching.py::TestChaining::test_detect_chained_assignment_warnings_errors", "tests/indexes/multi/test_indexing.py::test_pyint_engine", diff --git a/python/cudf/cudf/tests/private_objects/test_column.py b/python/cudf/cudf/tests/private_objects/test_column.py index f7c7b0808d4..47cd9c6d706 100644 --- a/python/cudf/cudf/tests/private_objects/test_column.py +++ b/python/cudf/cudf/tests/private_objects/test_column.py @@ -313,7 +313,9 @@ def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): gpu_data_view = gpu_data.view(to_dtype) expect = pd.Series(cpu_data_view, dtype=cpu_data_view.dtype) - got = cudf.Series._from_column(gpu_data_view).astype(gpu_data_view.dtype) + got = cudf.Series._from_column(gpu_data_view).astype( + gpu_data_view.dtype, copy=False + ) gpu_ptr = gpu_data.data.get_ptr(mode="read") assert gpu_ptr == got._column.data.get_ptr(mode="read") diff --git a/python/cudf/cudf/tests/series/methods/test_unique.py b/python/cudf/cudf/tests/series/methods/test_unique.py index 257c57aa1e7..b87f1872157 100644 --- a/python/cudf/cudf/tests/series/methods/test_unique.py +++ b/python/cudf/cudf/tests/series/methods/test_unique.py @@ -50,14 +50,6 @@ def test_series_unique(): def test_series_nunique(request, nan_as_null, dropna): - # We remove nulls as opposed to NaNs using the dropna parameter, - # so to test against pandas we replace NaN with another discrete value - request.applymarker( - pytest.mark.xfail( - nan_as_null is None, - reason=f"{nan_as_null=} returns wrong result", - ) - ) cudf_series = cudf.Series([1, 2, 2, 3, 3], nan_as_null=nan_as_null) pd_series = pd.Series([1, 2, 2, 3, 3]) expect = pd_series.nunique(dropna=dropna) @@ -67,20 +59,20 @@ def test_series_nunique(request, nan_as_null, dropna): cudf_series = cudf.Series( [1.0, 2.0, 
3.0, np.nan, None], nan_as_null=nan_as_null ) - if nan_as_null is True: - pd_series = pd.Series([1.0, 2.0, 3.0, np.nan, None]) + if nan_as_null in {True, None}: + pd_series = pd.Series([1.0, 2.0, 3.0, None, None]) else: - pd_series = pd.Series([1.0, 2.0, 3.0, -1.0, None]) + pd_series = pd.Series([1.0, 2.0, 3.0, np.nan, None], dtype=object) expect = pd_series.nunique(dropna=dropna) got = cudf_series.nunique(dropna=dropna) assert expect == got cudf_series = cudf.Series([1.0, np.nan, np.nan], nan_as_null=nan_as_null) - if nan_as_null is True: + if nan_as_null in {True, None}: pd_series = pd.Series([1.0, np.nan, np.nan]) else: - pd_series = pd.Series([1.0, -1.0, -1.0]) + pd_series = pd.Series([1.0, None, None]) expect = pd_series.nunique(dropna=dropna) got = cudf_series.nunique(dropna=dropna) assert expect == got diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index b8acd46e895..b10b5f1247c 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -228,9 +228,18 @@ def is_mixed_with_object_dtype(lhs, rhs): elif isinstance(rhs.dtype, cudf.CategoricalDtype): return is_mixed_with_object_dtype(lhs, rhs.dtype.categories) - return (lhs.dtype == "object" and rhs.dtype != "object") or ( + res = (lhs.dtype == "object" and rhs.dtype != "object") or ( rhs.dtype == "object" and lhs.dtype != "object" ) + if res: + return res + return ( + cudf.api.types.is_string_dtype(lhs.dtype) + and not cudf.api.types.is_string_dtype(rhs.dtype) + ) or ( + cudf.api.types.is_string_dtype(rhs.dtype) + and not cudf.api.types.is_string_dtype(lhs.dtype) + ) def _get_nan_for_dtype(dtype: DtypeObj) -> DtypeObj: From 414b91691094b087fce9c687c8efaa4ca22308cf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 3 Sep 2025 12:42:08 -0700 Subject: [PATCH 248/366] Remove pyarrow upper bound (#19870) The latest PR to update our pyarrow pinnings #19592 made us compatible with the latest version of Arrow. The update was a little bumpy, but the main reasons had to do with 1) our improper use of Arrow APIs in our C++ tests and 2) a bug in our reading of v2 parquet files. Actual usage of our library was fine, so users would have been OK using a newer version, and we might have caught the bugs in our parquet support sooner. This PR proposes dropping the upper bound entirely to allow us to automatically support future versions as they are released. There is no real need for us to upgrade the version of Arrow that our C++ builds against; if it's already working, then we can stick with it since we're primarily using it for testing. If the Spark team finds a reason to request an upgrade we can always bump the CMake pin, but [they also plan to move to nanoarrow eventually](https://github.com/NVIDIA/spark-rapids-jni/issues/3268) so I doubt it'll be a priority. 
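For context (illustrative only, not part of the changes below), the effect of the two specifiers can be checked with the `packaging` library that pip uses for version matching:

```python
# A minimal sketch comparing the old and new pyarrow requirement
# specifiers; the version strings are examples, not tested releases.
from packaging.specifiers import SpecifierSet

old = SpecifierSet(">=15.0.0,<22.0.0a0")
new = SpecifierSet(">=15.0.0")

print("22.0.0" in old)  # False: the old upper bound blocks Arrow 22.x
print("22.0.0" in new)  # True: future majors resolve without a pin bump
print("14.0.2" in new)  # False: the lower bound still applies
```

In other words, once the bound is removed, each new Arrow release becomes installable immediately rather than waiting for us to relax the pin.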
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/19870 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-130_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-130_arch-x86_64.yaml | 2 +- dependencies.yaml | 6 +++--- python/cudf/pyproject.toml | 4 ++-- python/pylibcudf/pyproject.toml | 8 ++++---- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 4a296164cec..a4a81ebaa66 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -68,7 +68,7 @@ dependencies: - pandoc - polars>=1.28,<1.33 - pre-commit -- pyarrow>=15.0.0,<22.0.0a0 +- pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 30ac023ca78..0fa1e5f3ddc 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -69,7 +69,7 @@ dependencies: - pandoc - polars>=1.28,<1.33 - pre-commit -- pyarrow>=15.0.0,<22.0.0a0 +- pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index 5ab54367559..c64e9000a08 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -68,7 +68,7 @@ dependencies: - pandoc - polars>=1.28,<1.33 - pre-commit -- pyarrow>=15.0.0,<22.0.0a0 +- pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index 906a143b428..7ae24161e60 100644 --- a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -69,7 +69,7 @@ dependencies: - pandoc - polars>=1.28,<1.33 - pre-commit -- pyarrow>=15.0.0,<22.0.0a0 +- pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest diff --git a/dependencies.yaml b/dependencies.yaml index 73d6900d8f1..67e627aa718 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -511,7 +511,7 @@ dependencies: common: - output_types: [conda] packages: - - pyarrow>=15.0.0,<22.0.0a0 + - pyarrow>=15.0.0 - output_types: [requirements, pyproject] packages: # pyarrow 17.0.0 wheels have a subtle issue around threading that # can cause segmentation faults around imports on arm. It appears to # be highly dependent on the exact build configuration, so we'll just # avoid 17.0.0 for now unless we observe similar issues in future # releases as well. 
- - pyarrow>=15.0.0,<22.0.0a0; platform_machine=='x86_64' - - pyarrow>=15.0.0,<22.0.0a0,!=17.0.0; platform_machine=='aarch64' + - pyarrow>=15.0.0; platform_machine=='x86_64' + - pyarrow>=15.0.0,!=17.0.0; platform_machine=='aarch64' cuda_version: specific: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index f97bd6a7eb3..af324761ee0 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -29,8 +29,8 @@ dependencies = [ "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.4.0dev0", - "pyarrow>=15.0.0,<22.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=15.0.0,<22.0.0a0; platform_machine=='x86_64'", + "pyarrow>=15.0.0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=15.0.0; platform_machine=='x86_64'", "pylibcudf==25.10.*,>=0.0.0a0", "rich", "rmm==25.10.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 811af529493..f5167490384 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -47,16 +47,16 @@ test = [ "numba-cuda[cu13]>=0.19.1,<0.20.0a0", "numba>=0.60.0,<0.62.0a0", "pandas", - "pyarrow>=15.0.0,<22.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=15.0.0,<22.0.0a0; platform_machine=='x86_64'", + "pyarrow>=15.0.0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=15.0.0; platform_machine=='x86_64'", "pytest", "pytest-cov", "pytest-xdist", "xxhash", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. pyarrow = [ - "pyarrow>=15.0.0,<22.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=15.0.0,<22.0.0a0; platform_machine=='x86_64'", + "pyarrow>=15.0.0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=15.0.0; platform_machine=='x86_64'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. numpy = [ "numpy>=1.23,<3.0a0", From 314ccc675ec19f0aaed473a02276939a0305d968 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 3 Sep 2025 17:11:04 -0400 Subject: [PATCH 249/366] Fix empty column returned by cudf::from_arrow_stream_column (#19812) Fixes the `cudf::from_arrow_stream_column` API to return the correct type for an empty column. Closes #19802 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19812 --- cpp/src/interop/from_arrow_stream.cu | 7 --- cpp/tests/interop/from_arrow_stream_test.cpp | 47 ++++++++++++++++++++ 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu index d8a17b4dc4d..f2cc8cf8c61 100644 --- a/cpp/src/interop/from_arrow_stream.cu +++ b/cpp/src/interop/from_arrow_stream.cu @@ -154,13 +154,6 @@ std::unique_ptr from_arrow_stream_column(ArrowArrayStream* input, input->release(input); if (chunks.empty()) { - if (schema.n_children == 0) { - schema.release(&schema); - return std::make_unique(); - } - - // If there are no chunks but the schema has children, we need to construct a suitable empty - // column. 
auto empty_column = make_empty_column_from_schema(&schema, stream, mr); schema.release(&schema); return empty_column; diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp index d8e11e375bc..5d8d37b5e15 100644 --- a/cpp/tests/interop/from_arrow_stream_test.cpp +++ b/cpp/tests/interop/from_arrow_stream_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -62,6 +63,32 @@ get_nanoarrow_stream(int num_copies) return std::make_tuple(std::move(expected), std::move(schema), stream); } +std::tuple, nanoarrow::UniqueSchema, ArrowArrayStream> +get_nanoarrow_chunked_stream(int num_copies, cudf::size_type length) +{ + std::vector> columns; + std::vector arrays; + for (auto i = 0; i < 3; ++i) { + auto [tbl, sch, arr] = get_nanoarrow_host_tables(length); + // just use the first column + columns.push_back(std::move(tbl->release().front())); + arrays.push_back(std::move(arr->children[0])); + } + std::vector views; + for (auto const& col : columns) { + views.push_back(col->view()); + } + auto expected = cudf::concatenate(views); + + nanoarrow::UniqueSchema schema; + ArrowSchemaInit(schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema.get(), NANOARROW_TYPE_INT64)); + + ArrowArrayStream stream; + makeStreamFromArrays(std::move(arrays), std::move(schema), &stream); + return std::make_tuple(std::move(expected), std::move(schema), stream); +} + TEST_F(FromArrowStreamTest, BasicTest) { constexpr auto num_copies = 3; @@ -82,3 +109,23 @@ TEST_F(FromArrowStreamTest, EmptyTest) auto result = cudf::from_arrow_stream(&stream); cudf::have_same_types(expected->view(), result->view()); } + +TEST_F(FromArrowStreamTest, ChunkedTest) +{ + constexpr auto num_copies = 3; + constexpr auto length = 3; + auto [expected, schema, stream] = get_nanoarrow_chunked_stream(num_copies, length); + + auto result = cudf::from_arrow_stream_column(&stream); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), result->view()); +} + +TEST_F(FromArrowStreamTest, EmptyChunkedTest) +{ + constexpr auto num_copies = 3; + constexpr auto length = 0; + auto [expected, schema, stream] = get_nanoarrow_chunked_stream(num_copies, length); + + auto result = cudf::from_arrow_stream_column(&stream); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected->view()); +} From 266875f366766ffed66cb24cde4996587c6fb134 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Sep 2025 15:49:22 -0700 Subject: [PATCH 250/366] Refactor column_empty to use only pylibcudf APIs (#19800) Precursor of https://github.com/rapidsai/cudf/issues/18726, working towards consistently constructing a cuDF column with a pylibcudf Column. Also added `tests/series/methods/test_map.py::test_map_empty[categorical]` to conftest-patch: for all-missing data, this case now returns `None` with object dtype instead of `np.nan` with float dtype, consistent with the other dtypes covered by this test. 
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19800 --- python/cudf/cudf/core/column/column.py | 82 ++++++++----------- python/cudf/cudf/core/udf/groupby_utils.py | 3 +- .../cudf/pandas/scripts/conftest-patch.py | 1 + python/cudf/cudf/utils/applyutils.py | 2 +- python/cudf/cudf/utils/queryutils.py | 3 +- 5 files changed, 42 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 
e43f15f1895..7fecde15bc1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2540,7 +2540,6 @@ def _has_any_nat(arbitrary: pd.Series | np.ndarray) -> bool: def column_empty( row_count: int, dtype: DtypeObj = CUDF_STRING_DTYPE, - for_numba: bool = False, ) -> ColumnBase: """ Allocate a new column with the given row_count and dtype. @@ -2555,59 +2554,50 @@ def column_empty( dtype : Dtype Type of the column. - - for_numba : bool, default False - If True, don't allocate a mask as it's not supported by numba. """ - children: tuple[ColumnBase, ...] = () - - if isinstance(dtype, StructDtype): - data = None - children = tuple( - column_empty(row_count, field_dtype) - for field_dtype in dtype.fields.values() - ) - elif isinstance(dtype, ListDtype): - data = None - children = ( - as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE), - column_empty(row_count, dtype=dtype.element_type), - ) - elif isinstance(dtype, CategoricalDtype): - data = None - children = ( - ColumnBase.from_pylibcudf( - plc.Column.from_scalar( - plc.Scalar.from_py( - None, dtype_to_pylibcudf_type(SIZE_TYPE_DTYPE) - ), - row_count, + if isinstance(dtype, (StructDtype, ListDtype)): + if isinstance(dtype, StructDtype): + children = tuple( + column_empty(row_count, field_dtype) + for field_dtype in dtype.fields.values() + ) + elif isinstance(dtype, ListDtype): + children = ( + as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE), + column_empty(row_count, dtype=dtype.element_type), + ) + mask = ( + None + if row_count == 0 + else plc.gpumemoryview( + plc.null_mask.create_null_mask( + row_count, plc.null_mask.MaskState.ALL_NULL ) - ), + ) ) + return ColumnBase.from_pylibcudf( + plc.Column( + dtype_to_pylibcudf_type(dtype), + row_count, + None, + mask, + row_count, + 0, + [child.to_pylibcudf(mode="read") for child in children], + ) + )._with_type_metadata(dtype) else: - col = ColumnBase.from_pylibcudf( + if isinstance(dtype, CategoricalDtype): + # May get downcast in _with_type_metadata + plc_dtype = plc.DataType(plc.TypeId.INT64) + else: + plc_dtype = dtype_to_pylibcudf_type(dtype) + return ColumnBase.from_pylibcudf( plc.Column.from_scalar( - plc.Scalar.from_py(None, dtype_to_pylibcudf_type(dtype)), + plc.Scalar.from_py(None, plc_dtype), row_count, ) )._with_type_metadata(dtype) - if for_numba: - col = col.set_mask(None) - return col - - if row_count > 0 and not for_numba: - mask = as_buffer( - plc.null_mask.create_null_mask( - row_count, plc.null_mask.MaskState.ALL_NULL - ) - ) - else: - mask = None - - return build_column( - data, dtype, mask=mask, size=row_count, children=children - ) def build_column( diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 677e9a38881..3327b4320f3 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -126,7 +126,8 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): offsets = as_column(offsets) ngroups = len(offsets) - 1 - output = column_empty(ngroups, dtype=return_type, for_numba=True) + output = column_empty(ngroups, dtype=return_type) + output = output.set_mask(None) launch_args = [ offsets, output, diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 1d57d38d787..94c6d914029 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -12437,6 +12437,7 @@ def pytest_unconfigure(config): 
"tests/series/methods/test_map.py::test_map_dict_ignore_na[dict]", "tests/series/methods/test_map.py::test_map_dict_subclass_with_missing", "tests/series/methods/test_map.py::test_map_dict_with_tuple_keys", + "tests/series/methods/test_map.py::test_map_empty[categorical]", "tests/series/methods/test_map.py::test_map_empty[bool-dtype]", "tests/series/methods/test_map.py::test_map_empty[empty]", "tests/series/methods/test_map.py::test_map_empty[float32]", diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index f3dc0df8362..712ef3fb654 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -158,7 +158,7 @@ def run(self, df, **launch_params): outputs = {} for k, dt in self.outcols.items(): outputs[k] = column.column_empty( - len(df), np.dtype(dt), False + len(df), np.dtype(dt) ).data_array_view(mode="write") # Bind argument args = {} diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index c49e8c1ba2b..a9f45d87e74 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -247,7 +247,8 @@ def query_execute(df, expr, callenv): # allocate output buffer nrows = len(df) - out = column_empty(nrows, dtype=np.dtype(np.bool_), for_numba=True) + out = column_empty(nrows, dtype=np.dtype(np.bool_)) + out = out.set_mask(None) # run kernel args = [out, *colarrays, *envargs] with _CUDFNumbaConfig(): From f1c32e08dd65666afa976049d9bb238d11ee9d03 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Sep 2025 16:58:13 -0700 Subject: [PATCH 251/366] Move more test_dataframe.py tests to new cudf classic testing directory (#19770) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19770 --- python/cudf/cudf/tests/conftest.py | 104 +- .../tests/dataframe/indexing/test_getitem.py | 26 + .../cudf/tests/dataframe/indexing/test_loc.py | 77 + .../tests/dataframe/indexing/test_setitem.py | 40 + .../cudf/tests/dataframe/methods/test_agg.py | 117 + .../tests/dataframe/methods/test_argsort.py | 23 + .../cudf/tests/dataframe/methods/test_diff.py | 98 + .../dataframe/methods/test_duplicated.py | 35 + .../tests/dataframe/methods/test_equals.py | 29 + .../cudf/tests/dataframe/methods/test_eval.py | 99 + .../tests/dataframe/methods/test_explode.py | 77 + .../dataframe/methods/test_ffill_bfill.py | 45 + .../cudf/tests/dataframe/methods/test_info.py | 248 ++ .../tests/dataframe/methods/test_insert.py | 14 + .../methods/test_iterrows_itertuples.py | 33 + .../cudf/tests/dataframe/methods/test_keys.py | 54 + .../dataframe/methods/test_memory_usage.py | 111 + .../cudf/tests/dataframe/methods/test_mode.py | 54 + .../tests/dataframe/methods/test_nunique.py | 32 + .../dataframe/methods/test_pct_change.py | 46 + .../cudf/tests/dataframe/methods/test_pipe.py | 62 + .../dataframe/methods/test_reductions.py | 69 + .../tests/dataframe/methods/test_reindex.py | 50 + .../tests/dataframe/methods/test_rename.py | 79 + .../tests/dataframe/methods/test_sample.py | 241 ++ .../tests/dataframe/methods/test_to_arrow.py | 68 + .../tests/dataframe/methods/test_to_pandas.py | 193 + .../tests/dataframe/methods/test_update.py | 91 + .../dataframe/methods/test_value_counts.py | 59 + .../cudf/tests/dataframe/test_attributes.py | 129 + 
.../cudf/cudf/tests/dataframe/test_binops.py | 170 + .../cudf/tests/dataframe/test_constructors.py | 460 +++ .../cudf/tests/dataframe/test_reductions.py | 75 - .../multiindex/methods/test_memory_usage.py | 33 + .../rangeindex/methods/test_nunique.py | 15 + python/cudf/cudf/tests/reshape/test_concat.py | 644 +++ .../cudf/tests/series/methods/test_keys.py | 32 + .../cudf/cudf/tests/series/test_attributes.py | 17 + python/cudf/cudf/tests/test_dataframe.py | 3524 +---------------- 39 files changed, 3677 insertions(+), 3696 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_agg.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_argsort.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_diff.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_duplicated.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_equals.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_eval.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_explode.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_ffill_bfill.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_info.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_insert.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_iterrows_itertuples.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_keys.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_mode.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_nunique.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_pct_change.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_pipe.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_sample.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_update.py create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_value_counts.py create mode 100644 python/cudf/cudf/tests/dataframe/test_binops.py delete mode 100644 python/cudf/cudf/tests/dataframe/test_reductions.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_memory_usage.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_nunique.py create mode 100644 python/cudf/cudf/tests/series/methods/test_keys.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 8f6155bfb2c..663e4dbd517 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -15,7 +15,6 @@ import rmm # noqa: F401 import cudf -from cudf.testing import assert_eq _CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent) @@ -25,103 +24,6 @@ def datadir(): return pathlib.Path(__file__).parent / "data" -@pytest.fixture( - params=itertools.product([0, 2, None], [0.3, None]), - ids=lambda arg: f"n={arg[0]}-frac={arg[1]}", -) -def sample_n_frac(request): - """ - Specific to `test_sample*` tests. 
- """ - n, frac = request.param - if n is not None and frac is not None: - pytest.skip("Cannot specify both n and frac.") - return n, frac - - -def shape_checker(expected, got): - assert expected.shape == got.shape - - -def exact_checker(expected, got): - assert_eq(expected, got) - - -@pytest.fixture( - params=[ - (None, None, shape_checker), - (42, 42, shape_checker), - (np.random.RandomState(42), np.random.RandomState(42), exact_checker), - ], - ids=["None", "IntSeed", "NumpyRandomState"], -) -def random_state_tuple_axis_1(request): - """ - Specific to `test_sample*_axis_1` tests. - A pytest fixture of valid `random_state` parameter pairs for pandas - and cudf. Valid parameter combinations, and what to check for each pair - are listed below: - - pandas: None, seed(int), np.random.RandomState - cudf: None, seed(int), np.random.RandomState - ------ - check: shape, shape, exact result - - Each column above stands for one valid parameter combination and check. - """ - - return request.param - - -@pytest.fixture( - params=[ - (None, None, shape_checker), - (42, 42, shape_checker), - (np.random.RandomState(42), np.random.RandomState(42), exact_checker), - (np.random.RandomState(42), cp.random.RandomState(42), shape_checker), - ], - ids=["None", "IntSeed", "NumpyRandomState", "CupyRandomState"], -) -def random_state_tuple_axis_0(request): - """ - Specific to `test_sample*_axis_0` tests. - A pytest fixture of valid `random_state` parameter pairs for pandas - and cudf. Valid parameter combinations, and what to check for each pair - are listed below: - - pandas: None, seed(int), np.random.RandomState, np.random.RandomState - cudf: None, seed(int), np.random.RandomState, cp.random.RandomState - ------ - check: shape, shape, exact result, shape - - Each column above stands for one valid parameter combination and check. - """ - - return request.param - - -@pytest.fixture(params=[None, "builtin_list", "ndarray"]) -def make_weights_axis_0(request): - """Specific to `test_sample*_axis_0` tests. - Only testing weights array that matches type with random state. 
- """ - - if request.param is None: - return lambda *_: (None, None) - elif request.param == "builtin-list": - return lambda size, _: ([1] * size, [1] * size) - else: - - def wrapped(size, numpy_weights_for_cudf): - # Uniform distribution, non-normalized - if numpy_weights_for_cudf: - return np.ones(size), np.ones(size) - else: - return np.ones(size), cp.ones(size) - - return wrapped - - # To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use # the sessionstart and sessionfinish hooks rather than a simple autouse, # session-scope fixture because we need to set these variable before collection @@ -504,6 +406,12 @@ def numeric_and_temporal_types_as_str(request): return request.param +@pytest.fixture +def numeric_and_temporal_types_as_str2(numeric_and_temporal_types_as_str): + """Used for testing cartesian product of numeric_and_temporal_types_as_str""" + return numeric_and_temporal_types_as_str + + @pytest.fixture( params=signed_integer_types + unsigned_integer_types diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_getitem.py b/python/cudf/cudf/tests/dataframe/indexing/test_getitem.py index 6453a4abca1..ba7b7f95b2e 100644 --- a/python/cudf/cudf/tests/dataframe/indexing/test_getitem.py +++ b/python/cudf/cudf/tests/dataframe/indexing/test_getitem.py @@ -8,3 +8,29 @@ def test_struct_of_struct_loc(): df = cudf.DataFrame({"col": [{"a": {"b": 1}}]}) expect = cudf.Series([{"a": {"b": 1}}], name="col") assert_eq(expect, df["col"]) + + +def test_dataframe_midx_cols_getitem(): + df = cudf.DataFrame( + { + "a": ["a", "b", "c"], + "b": ["b", "", ""], + "c": [10, 11, 12], + } + ) + df.columns = df.set_index(["a", "b"]).index + pdf = df.to_pandas() + + expected = df["c"] + actual = pdf["c"] + assert_eq(expected, actual) + df = cudf.DataFrame( + [[1, 0], [0, 1]], + columns=[ + ["foo", "foo"], + ["location", "location"], + ["x", "y"], + ], + ) + df = df.assign(bools=cudf.Series([True, False], dtype="bool")) + assert_eq(df["bools"], df.to_pandas()["bools"]) diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_loc.py b/python/cudf/cudf/tests/dataframe/indexing/test_loc.py index 7821347cbe8..9a0ace436a7 100644 --- a/python/cudf/cudf/tests/dataframe/indexing/test_loc.py +++ b/python/cudf/cudf/tests/dataframe/indexing/test_loc.py @@ -1,6 +1,7 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
import re +import numpy as np import pandas as pd import pytest @@ -8,6 +9,82 @@ from cudf.testing import assert_eq +def test_dataframe_midx_columns_loc(): + idx_1 = ["Hi", "Lo"] + idx_2 = ["I", "II", "III"] + idx = cudf.MultiIndex.from_product([idx_1, idx_2]) + + data_rand = ( + np.random.default_rng(seed=0) + .uniform(0, 1, 3 * len(idx)) + .reshape(3, -1) + ) + df = cudf.DataFrame(data_rand, index=["A", "B", "C"], columns=idx) + pdf = df.to_pandas() + + assert_eq(df.shape, pdf.shape) + + expected = pdf.loc[["A", "B"]] + actual = df.loc[["A", "B"]] + + assert_eq(expected, actual) + assert_eq(df, pdf) + + +@pytest.mark.parametrize("dtype1", ["int16", "float32"]) +@pytest.mark.parametrize("dtype2", ["int16", "float32"]) +def test_dataframe_loc_int_float(dtype1, dtype2): + df = cudf.DataFrame( + {"a": [10, 11, 12, 13, 14]}, + index=cudf.Index([1, 2, 3, 4, 5], dtype=dtype1), + ) + pdf = df.to_pandas() + + gidx = cudf.Index([2, 3, 4], dtype=dtype2) + pidx = gidx.to_pandas() + + actual = df.loc[gidx] + expected = pdf.loc[pidx] + + assert_eq(actual, expected, check_index_type=True, check_dtype=True) + + +@pytest.mark.xfail(reason="Not yet properly supported.") +def test_multiindex_wildcard_selection_three_level_all(): + midx = cudf.MultiIndex.from_tuples( + [(c1, c2, c3) for c1 in "abcd" for c2 in "abc" for c3 in "ab"] + ) + df = cudf.DataFrame({f"{i}": [i] for i in range(24)}) + df.columns = midx + + expect = df.to_pandas().loc[:, (slice("a", "c"), slice("a", "b"), "b")] + got = df.loc[:, (slice(None), "b")] + assert_eq(expect, got) + + +def test_multiindex_wildcard_selection_all(): + midx = cudf.MultiIndex.from_tuples( + [(c1, c2) for c1 in "abc" for c2 in "ab"] + ) + df = cudf.DataFrame({f"{i}": [i] for i in range(6)}) + df.columns = midx + expect = df.to_pandas().loc[:, (slice(None), "b")] + got = df.loc[:, (slice(None), "b")] + assert_eq(expect, got) + + +@pytest.mark.xfail(reason="Not yet properly supported.") +def test_multiindex_wildcard_selection_partial(): + midx = cudf.MultiIndex.from_tuples( + [(c1, c2) for c1 in "abc" for c2 in "ab"] + ) + df = cudf.DataFrame({f"{i}": [i] for i in range(6)}) + df.columns = midx + expect = df.to_pandas().loc[:, (slice("a", "b"), "b")] + got = df.loc[:, (slice("a", "b"), "b")] + assert_eq(expect, got) + + @pytest.mark.parametrize( "value", [ diff --git a/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py index c566be17d4a..f7874f5cd90 100644 --- a/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py +++ b/python/cudf/cudf/tests/dataframe/indexing/test_setitem.py @@ -1,5 +1,6 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
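+# A minimal sketch (hypothetical `_demo_setitem_mask` helper, not collected
+# by pytest) of masked assignment with a device array, as tested below.
+def _demo_setitem_mask():
+    import cupy as cp
+
+    import cudf
+
+    df = cudf.DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
+    # Rows where the cupy boolean mask is True are overwritten in place.
+    df[cp.array([True, False, True, False])] = 1.5
+    return df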
+import cupy as cp import numpy as np import pandas as pd import pytest @@ -120,6 +121,45 @@ def test_listcol_setitem_retain_dtype(): assert df2["a"].dtype == df["a"].dtype +def test_setitem_reset_label_dtype(): + result = cudf.DataFrame({1: [2]}) + expected = pd.DataFrame({1: [2]}) + result["a"] = [2] + expected["a"] = [2] + assert_eq(result, expected) + + +def test_dataframe_assign_scalar_to_empty_series(): + expected = pd.DataFrame({"a": []}) + actual = cudf.DataFrame({"a": []}) + expected.a = 0 + actual.a = 0 + assert_eq(expected, actual) + + +def test_dataframe_assign_cp_np_array(): + m, n = 5, 3 + cp_ndarray = cp.random.randn(m, n) + pdf = pd.DataFrame({f"f_{i}": range(m) for i in range(n)}) + gdf = cudf.DataFrame({f"f_{i}": range(m) for i in range(n)}) + pdf[[f"f_{i}" for i in range(n)]] = cp.asnumpy(cp_ndarray) + gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray + + assert_eq(pdf, gdf) + + +def test_dataframe_setitem_cupy_array(): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.standard_normal(size=(10, 2))) + gdf = cudf.from_pandas(pdf) + + gpu_array = cp.array([True, False] * 5) + pdf[gpu_array.get()] = 1.5 + gdf[gpu_array] = 1.5 + + assert_eq(pdf, gdf) + + def test_setitem_datetime(): df = cudf.DataFrame({"date": pd.date_range("20010101", "20010105").values}) assert df.date.dtype.kind == "M" diff --git a/python/cudf/cudf/tests/dataframe/methods/test_agg.py b/python/cudf/cudf/tests/dataframe/methods/test_agg.py new file mode 100644 index 00000000000..7265376b835 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_agg.py @@ -0,0 +1,117 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import re + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2.5, 3], "b": [3, 4.5, 5], "c": [2.0, 3.0, 4.0]}, + {"a": [1, 2.2, 3], "b": [2.0, 3.0, 4.0], "c": [5.0, 6.0, 4.0]}, + ], +) +@pytest.mark.parametrize( + "aggs", + [ + ["min", "sum", "max"], + ("min", "sum", "max"), + {"min", "sum", "max"}, + "sum", + {"a": "sum", "b": "min", "c": "max"}, + {"a": ["sum"], "b": ["min"], "c": ["max"]}, + {"a": ("sum"), "b": ("min"), "c": ("max")}, + {"a": {"sum"}, "b": {"min"}, "c": {"max"}}, + {"a": ["sum", "min"], "b": ["sum", "max"], "c": ["min", "max"]}, + {"a": ("sum", "min"), "b": ("sum", "max"), "c": ("min", "max")}, + {"a": {"sum", "min"}, "b": {"sum", "max"}, "c": {"min", "max"}}, + ], +) +def test_agg_for_dataframes(data, aggs): + pdf = pd.DataFrame(data) + gdf = cudf.DataFrame(data) + + expect = pdf.agg(aggs).sort_index() + got = gdf.agg(aggs).sort_index() + + assert_eq(expect, got, check_dtype=True) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]}, + {"a": [1, 2, 3], "b": [True, True, False], "c": [False, True, False]}, + ], +) +@pytest.mark.parametrize( + "aggs", + [ + ["min", "sum", "max"], + "sum", + {"a": "sum", "b": "min", "c": "max"}, + ], +) +def test_agg_for_dataframes_error(data, aggs): + gdf = cudf.DataFrame(data) + + with pytest.raises(TypeError): + gdf.agg(aggs) + + +def test_agg_for_unsupported_function(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + + with pytest.raises(NotImplementedError): + gdf.agg({"a": np.sum, "b": np.min, "c": np.max}) + + +def test_agg_for_dataframe_with_invalid_function(): + aggs = "asdf" + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + + with pytest.raises( + AttributeError, + match=f"{aggs} is not a valid function for 
'DataFrame' object", + ): + gdf.agg(aggs) + + +def test_agg_for_series_with_invalid_function(): + aggs = {"a": "asdf"} + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + + with pytest.raises( + AttributeError, + match=f"{aggs['a']} is not a valid function for 'Series' object", + ): + gdf.agg(aggs) + + +@pytest.mark.parametrize( + "aggs", + [ + "sum", + ["min", "sum", "max"], + {"a": {"sum", "min"}, "b": {"sum", "max"}, "c": {"min", "max"}}, + ], +) +def test_agg_for_dataframe_with_string_columns(aggs): + gdf = cudf.DataFrame( + {"a": ["m", "n", "o"], "b": ["t", "u", "v"], "c": ["x", "y", "z"]}, + index=["a", "b", "c"], + ) + + with pytest.raises( + NotImplementedError, + match=re.escape( + "DataFrame.agg() is not supported for " + "frames containing string columns" + ), + ): + gdf.agg(aggs) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_argsort.py b/python/cudf/cudf/tests/dataframe/methods/test_argsort.py new file mode 100644 index 00000000000..96e1bcb8228 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_argsort.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import cupy as cp +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "ascending,expected_data", + [ + (True, [1, 2, 0]), + (False, [0, 2, 1]), + ], +) +def test_dataframe_argsort(ascending, expected_data): + actual = cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}).argsort( + ascending=ascending + ) + expected = cp.array(expected_data, dtype="int32") + + assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_diff.py b/python/cudf/cudf/tests/dataframe/methods/test_diff.py new file mode 100644 index 00000000000..c5f3b504313 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_diff.py @@ -0,0 +1,98 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
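+# A minimal sketch (hypothetical `_demo_diff` helper, not collected by
+# pytest) of the row-wise differences checked below.
+def _demo_diff():
+    import cudf
+
+    df = cudf.DataFrame({"a": [1, 0, 4]})
+    # periods=1 subtracts the previous row: [<NA>, -1, 4].
+    forward = df.diff(periods=1)
+    # periods=-1 subtracts the following row instead: [1, -4, <NA>].
+    backward = df.diff(periods=-1)
+    return forward, backward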
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [1, 0, 4, -10, 6], + np.array([1.123, 2.343, 5.890, 0.0]), + [True, False, True, False, False], + {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, + ], +) +@pytest.mark.parametrize("periods", (-5, -1, 0, 1, 5)) +def test_diff_numeric_dtypes(data, periods): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.diff(periods=periods, axis=0) + expected = pdf.diff(periods=periods, axis=0) + + assert_eq( + expected, + actual, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("precision", "scale"), + [(5, 2), (8, 5)], +) +@pytest.mark.parametrize( + "dtype", + [cudf.Decimal32Dtype, cudf.Decimal64Dtype], +) +def test_diff_decimal_dtypes(precision, scale, dtype): + gdf = cudf.DataFrame( + np.random.default_rng(seed=42).uniform(10.5, 75.5, (10, 6)), + dtype=dtype(precision=precision, scale=scale), + ) + pdf = gdf.to_pandas() + + actual = gdf.diff() + expected = pdf.diff() + + assert_eq( + expected, + actual, + check_dtype=False, + ) + + +def test_diff_invalid_axis(): + gdf = cudf.DataFrame(np.array([1.123, 2.343, 5.890, 0.0])) + with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): + gdf.diff(periods=1, axis=1) + + +@pytest.mark.parametrize( + "data", + [ + { + "int_col": [1, 2, 3, 4, 5], + "float_col": [1.0, 2.0, 3.0, 4.0, 5.0], + "string_col": ["a", "b", "c", "d", "e"], + }, + ["a", "b", "c", "d", "e"], + ], +) +def test_diff_unsupported_dtypes(data): + gdf = cudf.DataFrame(data) + with pytest.raises( + TypeError, + match=r"unsupported operand type\(s\)", + ): + gdf.diff() + + +def test_diff_many_dtypes(): + pdf = pd.DataFrame( + { + "dates": pd.date_range("2020-01-01", "2020-01-06", freq="D"), + "bools": [True, True, True, False, True, True], + "floats": [1.0, 2.0, 3.5, np.nan, 5.0, -1.7], + "ints": [1, 2, 3, 3, 4, 5], + "nans_nulls": [np.nan, None, None, np.nan, np.nan, None], + } + ) + gdf = cudf.from_pandas(pdf) + assert_eq(pdf.diff(), gdf.diff()) + assert_eq(pdf.diff(periods=2), gdf.diff(periods=2)) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_duplicated.py b/python/cudf/cudf/tests/dataframe/methods/test_duplicated.py new file mode 100644 index 00000000000..ce23afd2ace --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_duplicated.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
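+# A minimal sketch (hypothetical `_demo_duplicated` helper, not collected by
+# pytest) of the keep semantics parametrized below.
+def _demo_duplicated():
+    import cudf
+
+    df = cudf.DataFrame({"brand": ["Yum Yum", "Yum Yum", "Indomie"]})
+    # keep="first" flags repeats after the first occurrence:
+    # [False, True, False].
+    first = df.duplicated(keep="first")
+    # keep=False flags every member of a duplicated group:
+    # [True, True, False].
+    all_dups = df.duplicated(keep=False)
+    return first, all_dups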
+ +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + { + "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], + "style": ["cup", "cup", "cup", "pack", "pack"], + "rating": [4, 4, 3.5, 15, 5], + }, + { + "brand": ["Indomie", "Yum Yum", "Indomie", "Indomie", "Indomie"], + "style": ["cup", "cup", "cup", "cup", "pack"], + "rating": [4, 4, 3.5, 4, 5], + }, + ], +) +@pytest.mark.parametrize( + "subset", [None, ["brand"], ["rating"], ["style", "rating"]] +) +@pytest.mark.parametrize("keep", ["first", "last", False]) +def test_dataframe_duplicated(data, subset, keep): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = pdf.duplicated(subset=subset, keep=keep) + actual = gdf.duplicated(subset=subset, keep=keep) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_equals.py b/python/cudf/cudf/tests/dataframe/methods/test_equals.py new file mode 100644 index 00000000000..a41438b3fc3 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_equals.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "lhs, rhs", [("a", "a"), ("a", "b"), (1, 1.0), (None, None), (None, "a")] +) +def test_equals_names(lhs, rhs): + lhs = cudf.DataFrame({lhs: [1, 2]}) + rhs = cudf.DataFrame({rhs: [1, 2]}) + + got = lhs.equals(rhs) + expect = lhs.to_pandas().equals(rhs.to_pandas()) + + assert_eq(expect, got) + + +def test_equals_dtypes(): + lhs = cudf.DataFrame({"a": [1, 2.0]}) + rhs = cudf.DataFrame({"a": [1, 2]}) + + got = lhs.equals(rhs) + expect = lhs.to_pandas().equals(rhs.to_pandas()) + + assert got == expect diff --git a/python/cudf/cudf/tests/dataframe/methods/test_eval.py b/python/cudf/cudf/tests/dataframe/methods/test_eval.py new file mode 100644 index 00000000000..239304129a1 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_eval.py @@ -0,0 +1,99 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
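+# A minimal sketch (hypothetical `_demo_eval` helper, not collected by
+# pytest) of the two eval() forms tested below: a bare expression that
+# returns a result, and an assignment that mutates the frame in place.
+def _demo_eval():
+    import cudf
+
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]})
+    result = df.eval("a + b")  # returns [4, 4, 4]
+    df.eval("c = a - b", inplace=True)  # adds column c = [-2, 0, 2]
+    return result, df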
+ +import re + +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +# Note that for now expressions do not automatically handle casting, so inputs +# need to be casted appropriately +@pytest.mark.filterwarnings("ignore::RuntimeWarning") +@pytest.mark.parametrize( + "expr, dtype", + [ + ("a", int), + ("+a", int), + ("a + b", int), + ("a == b", int), + ("a / b", float), + ("a * b", int), + ("a > b", int), + ("a >= b", int), + ("a > b > c", int), + ("a > b < c", int), + ("a & b", int), + ("a & b | c", int), + ("sin(a)", float), + ("exp(sin(abs(a)))", float), + ("sqrt(floor(a))", float), + ("ceil(arctanh(a))", float), + ("(a + b) - (c * d)", int), + ("~a", int), + ("(a > b) and (c > d)", int), + ("(a > b) or (c > d)", int), + ("not (a > b)", int), + ("a + 1", int), + ("a + 1.0", float), + ("-a + 1", int), + ("+a + 1", int), + ("e = a + 1", int), + ( + """ + e = log(cos(a)) + 1.0 + f = abs(c) - exp(d) + """, + float, + ), + ("a_b_are_equal = (a == b)", int), + ("a > b", str), + ("a < '1'", str), + ('a == "1"', str), + ], +) +@pytest.mark.parametrize("nrows", [0, 10]) +def test_dataframe_eval(nrows, expr, dtype): + arr = np.ones(nrows) + df_eval = cudf.DataFrame({"a": arr, "b": arr, "c": arr, "d": arr}) + df_eval = df_eval.astype(dtype) + expect = df_eval.to_pandas().eval(expr) + got = df_eval.eval(expr) + # In the specific case where the evaluated expression is a unary function + # of a single column with no nesting, pandas will retain the name. This + # level of compatibility is out of scope for now. + assert_eq(expect, got, check_names=False) + + # Test inplace + if re.search("[^=><]=[^=]", expr) is not None: + pdf_eval = df_eval.to_pandas() + pdf_eval.eval(expr, inplace=True) + df_eval.eval(expr, inplace=True) + assert_eq(pdf_eval, df_eval) + + +@pytest.mark.parametrize( + "expr", + [ + """ + e = a + b + a == b + """, + "a_b_are_equal = (a == b) = c", + ], +) +def test_dataframe_eval_errors(expr): + df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]}) + with pytest.raises(ValueError): + df.eval(expr) + + +def test_dataframe_eval_misc(): + df = cudf.DataFrame({"a": [1, 2, 3, None, 5]}) + got = df.eval("isnull(a)") + assert_eq(got, cudf.Series.isnull(df["a"]), check_names=False) + + df.eval("c = isnull(1)", inplace=True) + assert_eq(df["c"], cudf.Series([False] * len(df), name="c")) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_explode.py b/python/cudf/cudf/tests/dataframe/methods/test_explode.py new file mode 100644 index 00000000000..c01b6f3fc39 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_explode.py @@ -0,0 +1,77 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
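+# A minimal sketch (hypothetical `_demo_explode` helper, not collected by
+# pytest) of how explode expands a list column, as tested below.
+def _demo_explode():
+    import cudf
+
+    df = cudf.DataFrame({"A": [[1, 2], None, [3]], "B": [10, 20, 30]})
+    # Each list element becomes its own row; the index label and the other
+    # columns repeat, and a None list yields a single null row.
+    return df.explode("A")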
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_explode_preserve_categorical(): + gdf = cudf.DataFrame( + { + "A": [[1, 2], None, [2, 3]], + "B": cudf.Series([0, 1, 2], dtype="category"), + } + ) + result = gdf.explode("A") + expected = cudf.DataFrame( + { + "A": [1, 2, None, 2, 3], + "B": cudf.Series([0, 0, 1, 2, 2], dtype="category"), + } + ) + expected.index = cudf.Index([0, 0, 1, 2, 2]) + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [ + [[1, 2, 3], 11, "a"], + [None, 22, "e"], + [[4], 33, "i"], + [[], 44, "o"], + [[5, 6], 55, "u"], + ], # nested + [ + [1, 11, "a"], + [2, 22, "e"], + [3, 33, "i"], + [4, 44, "o"], + [5, 55, "u"], + ], # non-nested + ], +) +@pytest.mark.parametrize( + ("labels", "label_to_explode"), + [ + (None, 0), + (pd.Index(["a", "b", "c"]), "a"), + ( + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] + ), + (0, "a"), + ), + ], +) +@pytest.mark.parametrize( + "p_index", + [ + None, + ["ia", "ib", "ic", "id", "ie"], + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] + ), + ], +) +def test_explode(data, labels, ignore_index, p_index, label_to_explode): + pdf = pd.DataFrame(data, index=p_index, columns=labels) + gdf = cudf.from_pandas(pdf) + + expect = pdf.explode(label_to_explode, ignore_index) + got = gdf.explode(label_to_explode, ignore_index) + + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_ffill_bfill.py b/python/cudf/cudf/tests/dataframe/methods/test_ffill_bfill.py new file mode 100644 index 00000000000..49516c9c1cb --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_ffill_bfill.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import expect_warning_if + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"A": [1, 2, 3, np.nan, None, 6]}), + pd.Series([1, 2, 3, None, np.nan, 5, 6, np.nan]), + ], +) +@pytest.mark.parametrize("alias", ["bfill", "backfill"]) +def test_dataframe_bfill(df, alias): + gdf = cudf.from_pandas(df) + + with expect_warning_if(alias == "backfill"): + actual = getattr(df, alias)() + with expect_warning_if(alias == "backfill"): + expected = getattr(gdf, alias)() + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"A": [1, 2, 3, np.nan, None, 6]}), + pd.Series([1, 2, 3, None, np.nan, 5, 6, np.nan]), + ], +) +@pytest.mark.parametrize("alias", ["ffill", "pad"]) +def test_dataframe_ffill(df, alias): + gdf = cudf.from_pandas(df) + + with expect_warning_if(alias == "pad"): + actual = getattr(df, alias)() + with expect_warning_if(alias == "pad"): + expected = getattr(gdf, alias)() + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_info.py b/python/cudf/cudf/tests/dataframe/methods/test_info.py new file mode 100644 index 00000000000..d5d06268e85 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_info.py @@ -0,0 +1,248 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
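+# A minimal sketch (hypothetical `_demo_info` helper, not collected by
+# pytest) of capturing info() output, the pattern every assertion below uses.
+def _demo_info():
+    import io
+
+    import cudf
+
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+    buffer = io.StringIO()
+    # buf= redirects the report into the buffer; memory_usage="deep" reports
+    # the actual bytes held by string columns rather than a "+" estimate.
+    df.info(buf=buffer, verbose=True, memory_usage="deep")
+    return buffer.getvalue()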
+ +import io +import textwrap + +import numpy as np +import pandas as pd + +import cudf + + +def test_dataframe_info_basic(): + buffer = io.StringIO() + str_cmp = textwrap.dedent( + """\ + + Index: 10 entries, a to 1111 + Data columns (total 10 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 0 10 non-null float64 + 1 1 10 non-null float64 + 2 2 10 non-null float64 + 3 3 10 non-null float64 + 4 4 10 non-null float64 + 5 5 10 non-null float64 + 6 6 10 non-null float64 + 7 7 10 non-null float64 + 8 8 10 non-null float64 + 9 9 10 non-null float64 + dtypes: float64(10) + memory usage: 859.0+ bytes + """ + ) + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + rng.standard_normal(size=(10, 10)), + index=["a", "2", "3", "4", "5", "6", "7", "8", "100", "1111"], + ) + cudf.from_pandas(df).info(buf=buffer, verbose=True) + s = buffer.getvalue() + assert str_cmp == s + + +def test_dataframe_info_verbose_mem_usage(): + buffer = io.StringIO() + df = pd.DataFrame({"a": [1, 2, 3], "b": ["safdas", "assa", "asdasd"]}) + str_cmp = textwrap.dedent( + """\ + + RangeIndex: 3 entries, 0 to 2 + Data columns (total 2 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 3 non-null int64 + 1 b 3 non-null object + dtypes: int64(1), object(1) + memory usage: 56.0+ bytes + """ + ) + cudf.from_pandas(df).info(buf=buffer, verbose=True) + s = buffer.getvalue() + assert str_cmp == s + + buffer.truncate(0) + buffer.seek(0) + + str_cmp = textwrap.dedent( + """\ + + RangeIndex: 3 entries, 0 to 2 + Columns: 2 entries, a to b + dtypes: int64(1), object(1) + memory usage: 56.0+ bytes + """ + ) + cudf.from_pandas(df).info(buf=buffer, verbose=False) + s = buffer.getvalue() + assert str_cmp == s + + buffer.truncate(0) + buffer.seek(0) + + df = pd.DataFrame( + {"a": [1, 2, 3], "b": ["safdas", "assa", "asdasd"]}, + index=["sdfdsf", "sdfsdfds", "dsfdf"], + ) + str_cmp = textwrap.dedent( + """\ + + Index: 3 entries, sdfdsf to dsfdf + Data columns (total 2 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 3 non-null int64 + 1 b 3 non-null object + dtypes: int64(1), object(1) + memory usage: 91.0 bytes + """ + ) + cudf.from_pandas(df).info(buf=buffer, verbose=True, memory_usage="deep") + s = buffer.getvalue() + assert str_cmp == s + + buffer.truncate(0) + buffer.seek(0) + + int_values = [1, 2, 3, 4, 5] + text_values = ["alpha", "beta", "gamma", "delta", "epsilon"] + float_values = [0.0, 0.25, 0.5, 0.75, 1.0] + + df = cudf.DataFrame( + { + "int_col": int_values, + "text_col": text_values, + "float_col": float_values, + } + ) + str_cmp = textwrap.dedent( + """\ + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 130.0 bytes + """ + ) + df.info(buf=buffer, verbose=True, memory_usage="deep") + actual_string = buffer.getvalue() + assert str_cmp == actual_string + + buffer.truncate(0) + buffer.seek(0) + + +def test_dataframe_info_null_counts(): + int_values = [1, 2, 3, 4, 5] + text_values = ["alpha", "beta", "gamma", "delta", "epsilon"] + float_values = [0.0, 0.25, 0.5, 0.75, 1.0] + + df = cudf.DataFrame( + { + "int_col": int_values, + "text_col": text_values, + "float_col": float_values, + } + ) + buffer = io.StringIO() + str_cmp = textwrap.dedent( + """\ + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 
columns): + # Column Dtype + --- ------ ----- + 0 int_col int64 + 1 text_col object + 2 float_col float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 130.0+ bytes + """ + ) + df.info(buf=buffer, verbose=True, null_counts=False) + actual_string = buffer.getvalue() + assert str_cmp == actual_string + + buffer.truncate(0) + buffer.seek(0) + + df.info(buf=buffer, verbose=True, max_cols=0) + actual_string = buffer.getvalue() + assert str_cmp == actual_string + + buffer.truncate(0) + buffer.seek(0) + + df = cudf.DataFrame() + + str_cmp = textwrap.dedent( + """\ + + RangeIndex: 0 entries + Empty DataFrame""" + ) + df.info(buf=buffer, verbose=True) + actual_string = buffer.getvalue() + assert str_cmp == actual_string + + buffer.truncate(0) + buffer.seek(0) + + df = cudf.DataFrame( + { + "a": [1, 2, 3, None, 10, 11, 12, None], + "b": ["a", "b", "c", "sd", "sdf", "sd", None, None], + } + ) + + str_cmp = textwrap.dedent( + """\ + + RangeIndex: 8 entries, 0 to 7 + Data columns (total 2 columns): + # Column Dtype + --- ------ ----- + 0 a int64 + 1 b object + dtypes: int64(1), object(1) + memory usage: 238.0+ bytes + """ + ) + with pd.option_context("display.max_info_rows", 2): + df.info(buf=buffer, max_cols=2, null_counts=None) + actual_string = buffer.getvalue() + assert str_cmp == actual_string + + buffer.truncate(0) + buffer.seek(0) + + str_cmp = textwrap.dedent( + """\ + + RangeIndex: 8 entries, 0 to 7 + Data columns (total 2 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 6 non-null int64 + 1 b 6 non-null object + dtypes: int64(1), object(1) + memory usage: 238.0+ bytes + """ + ) + + df.info(buf=buffer, max_cols=2, null_counts=None) + actual_string = buffer.getvalue() + assert str_cmp == actual_string + + buffer.truncate(0) + buffer.seek(0) + + df.info(buf=buffer, null_counts=True) + actual_string = buffer.getvalue() + assert str_cmp == actual_string diff --git a/python/cudf/cudf/tests/dataframe/methods/test_insert.py b/python/cudf/cudf/tests/dataframe/methods/test_insert.py new file mode 100644 index 00000000000..f3dbe3a7ff0 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_insert.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_insert_reset_label_dtype(): + result = cudf.DataFrame({1: [2]}) + expected = pd.DataFrame({1: [2]}) + result.insert(1, "a", [2]) + expected.insert(1, "a", [2]) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_iterrows_itertuples.py b/python/cudf/cudf/tests/dataframe/methods/test_iterrows_itertuples.py new file mode 100644 index 00000000000..d67dea7b79c --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_iterrows_itertuples.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import re + +import pytest + +import cudf + + +def test_dataframe_iterrows_itertuples(): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + + with pytest.raises( + TypeError, + match=re.escape( + "cuDF does not support iteration of DataFrame " + "via itertuples. Consider using " + "`.to_pandas().itertuples()` " + "if you wish to iterate over namedtuples." + ), + ): + df.itertuples() + + with pytest.raises( + TypeError, + match=re.escape( + "cuDF does not support iteration of DataFrame " + "via iterrows. Consider using " + "`.to_pandas().iterrows()` " + "if you wish to iterate over each row." 
+ ), + ): + df.iterrows() diff --git a/python/cudf/cudf/tests/dataframe/methods/test_keys.py b/python/cudf/cudf/tests/dataframe/methods/test_keys.py new file mode 100644 index 00000000000..4e9fdd9f9c9 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_keys.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"a": [1, 2, 3, 4, 5, 10, 11, 12, 33, 55, 19]}), + pd.DataFrame( + { + "one": [1, 2, 3, 4, 5, 10], + "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], + } + ), + pd.DataFrame( + { + "one": [1, 2, 3, 4, 5, 10], + "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], + }, + index=[10, 20, 30, 40, 50, 60], + ), + pd.DataFrame( + { + "one": [1, 2, 3, 4, 5, 10], + "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], + }, + index=["a", "b", "c", "d", "e", "f"], + ), + pd.DataFrame(index=["a", "b", "c", "d", "e", "f"]), + pd.DataFrame(columns=["a", "b", "c", "d", "e", "f"]), + pd.DataFrame(index=[10, 11, 12]), + pd.DataFrame(columns=[10, 11, 12]), + pd.DataFrame(), + pd.DataFrame({"one": [], "two": []}), + pd.DataFrame({2: [], 1: []}), + pd.DataFrame( + { + 0: [1, 2, 3, 4, 5, 10], + 1: ["abc", "def", "ghi", "xyz", "pqr", "abc"], + 100: ["a", "b", "b", "x", "z", "a"], + }, + index=[10, 20, 30, 40, 50, 60], + ), + ], +) +def test_dataframe_keys(df): + gdf = cudf.from_pandas(df) + + assert_eq(df.keys(), gdf.keys()) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_memory_usage.py b/python/cudf/cudf/tests/dataframe/methods/test_memory_usage.py index 42185ff6a9c..1c3a5bc0585 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_memory_usage.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_memory_usage.py @@ -1,9 +1,120 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
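+# A minimal sketch (hypothetical `_demo_memory_usage` helper, not collected
+# by pytest) of the per-column report compared against pandas below.
+def _demo_memory_usage():
+    import cudf
+
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+    # One entry per column plus the index; deep=True counts the real
+    # device-side footprint of string data instead of an estimate.
+    return df.memory_usage(index=True, deep=True)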
+import numpy as np +import pandas as pd +import pytest import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import expect_warning_if def test_list_struct_list_memory_usage(): df = cudf.DataFrame({"a": [[{"b": [1]}]]}) assert df.memory_usage().sum() == 16 + + +@pytest.mark.parametrize("index", [False, True]) +def test_memory_usage_index_preserve_types(index): + data = [[1, 2, 3]] + columns = pd.Index(np.array([1, 2, 3], dtype=np.int8), name="a") + result = ( + cudf.DataFrame(data, columns=columns).memory_usage(index=index).index + ) + expected = ( + pd.DataFrame(data, columns=columns).memory_usage(index=index).index + ) + if index: + # pandas returns an Index[object] with int and string elements + expected = expected.astype(str) + assert_eq(result, expected) + + +@pytest.mark.parametrize("set_index", [None, "A", "C", "D"]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("deep", [True, False]) +def test_memory_usage(deep, index, set_index): + # Testing numerical/datetime by comparing with pandas + # (string and categorical columns will be different) + rows = 100 + df = pd.DataFrame( + { + "A": np.arange(rows, dtype="int64"), + "B": np.arange(rows, dtype="int32"), + "C": np.arange(rows, dtype="float64"), + } + ) + df["D"] = pd.to_datetime(df.A) + if set_index: + df = df.set_index(set_index) + + gdf = cudf.from_pandas(df) + + if index and set_index is None: + # Special Case: Assume RangeIndex size == 0 + with expect_warning_if(deep, UserWarning): + assert gdf.index.memory_usage(deep=deep) == 0 + + else: + # Check for Series only + assert df["B"].memory_usage(index=index, deep=deep) == gdf[ + "B" + ].memory_usage(index=index, deep=deep) + + # Check for entire DataFrame + assert_eq( + df.memory_usage(index=index, deep=deep).sort_index(), + gdf.memory_usage(index=index, deep=deep).sort_index(), + ) + + +@pytest.mark.xfail +def test_memory_usage_string(): + rows = 100 + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + { + "A": np.arange(rows, dtype="int32"), + "B": rng.choice(["apple", "banana", "orange"], rows), + } + ) + gdf = cudf.from_pandas(df) + + # Check deep=False (should match pandas) + assert gdf.B.memory_usage(deep=False, index=False) == df.B.memory_usage( + deep=False, index=False + ) + + # Check string column + assert gdf.B.memory_usage(deep=True, index=False) == df.B.memory_usage( + deep=True, index=False + ) + + # Check string index + assert gdf.set_index("B").index.memory_usage( + deep=True + ) == df.B.memory_usage(deep=True, index=False) + + +def test_memory_usage_cat(): + rows = 100 + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + { + "A": np.arange(rows, dtype="int32"), + "B": rng.choice(["apple", "banana", "orange"], rows), + } + ) + df["B"] = df.B.astype("category") + gdf = cudf.from_pandas(df) + + expected = ( + gdf.B._column.categories.memory_usage + + gdf.B._column.codes.memory_usage + ) + + # Check cat column + assert gdf.B.memory_usage(deep=True, index=False) == expected + + # Check cat index + assert gdf.set_index("B").index.memory_usage(deep=True) == expected diff --git a/python/cudf/cudf/tests/dataframe/methods/test_mode.py b/python/cudf/cudf/tests/dataframe/methods/test_mode.py new file mode 100644 index 00000000000..aad65d67612 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_mode.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
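+# A minimal sketch (hypothetical `_demo_mode` helper, not collected by
+# pytest) of the dropna behaviour parametrized below.
+def _demo_mode():
+    import cudf
+
+    df = cudf.DataFrame({"a": [1, 2, 2, None, None, None]})
+    # dropna=True ignores nulls, so the mode is 2; with dropna=False the
+    # three nulls outnumber the two 2s and become the mode instead.
+    return df.mode(dropna=True), df.mode(dropna=False)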
+ +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "df", + [ + lambda: cudf.DataFrame({"a": [1, 2, 3]}), + lambda: cudf.DataFrame( + {"a": [1, 2, 3], "b": ["a", "z", "c"]}, index=["a", "z", "x"] + ), + lambda: cudf.DataFrame( + { + "a": [1, 2, 3, None, 2, 1, None], + "b": ["a", "z", "c", "a", "v", "z", "z"], + } + ), + lambda: cudf.DataFrame({"a": [], "b": []}), + lambda: cudf.DataFrame({"a": [None, None], "b": [None, None]}), + lambda: cudf.DataFrame( + { + "a": ["hello", "world", "rapids", "ai", "nvidia"], + "b": cudf.Series( + [1, 21, 21, 11, 11], + dtype="timedelta64[s]", + index=["a", "b", "c", "d", " e"], + ), + }, + index=["a", "b", "c", "d", " e"], + ), + lambda: cudf.DataFrame( + { + "a": ["hello", None, "world", "rapids", None, "ai", "nvidia"], + "b": cudf.Series( + [1, 21, None, 11, None, 11, None], dtype="datetime64[s]" + ), + } + ), + ], +) +def test_dataframe_mode(df, numeric_only, dropna): + df = df() + pdf = df.to_pandas() + + expected = pdf.mode(numeric_only=numeric_only, dropna=dropna) + actual = df.mode(numeric_only=numeric_only, dropna=dropna) + if len(actual.columns) == 0: + # pandas < 3.0 returns an Index[object] instead of RangeIndex + actual.columns = expected.columns + assert_eq(expected, actual, check_dtype=False) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_nunique.py b/python/cudf/cudf/tests/dataframe/methods/test_nunique.py new file mode 100644 index 00000000000..b54e8cf1b4f --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_nunique.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "columns", + [ + pd.RangeIndex(2, name="foo"), + pd.MultiIndex.from_arrays([[1, 2], [2, 3]], names=["foo", 1]), + pd.Index([3, 5], dtype=np.int8, name="foo"), + ], +) +def test_nunique_preserve_column_in_index(columns): + df = cudf.DataFrame([[1, 2]], columns=columns) + result = df.nunique().index.to_pandas() + assert_eq(result, columns, exact=True) + + +def test_dataframe_nunique(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [1, 1, 0]}) + pdf = gdf.to_pandas() + + actual = gdf.nunique() + expected = pdf.nunique() + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_pct_change.py b/python/cudf/cudf/tests/dataframe/methods/test_pct_change.py new file mode 100644 index 00000000000..57a68f14d35 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_pct_change.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
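+# A minimal sketch (hypothetical `_demo_pct_change` helper, not collected by
+# pytest) of the quantity under test: pct_change(p)[i] = x[i] / x[i - p] - 1.
+def _demo_pct_change():
+    import cudf
+
+    df = cudf.DataFrame({"a": [2.0, 3.0, 6.0]})
+    # [<NA>, 3/2 - 1 = 0.5, 6/3 - 1 = 1.0]
+    return df.pct_change(periods=1)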
+ + +import numpy as np +import pytest + +import cudf +from cudf.api.extensions import no_default +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import ( + expect_warning_if, +) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="warning not present in older pandas versions", +) +@pytest.mark.parametrize( + "data", + [ + [1, 4, 6, 1], + np.array([1.123, 2.343, 5.890, 0.0]), + {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, + ], +) +@pytest.mark.parametrize("periods", [-5, 0, 2]) +@pytest.mark.parametrize( + "fill_method", ["ffill", "bfill", "pad", "backfill", no_default] +) +def test_dataframe_pct_change(data, periods, fill_method): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + with expect_warning_if(fill_method is not no_default): + actual = gdf.pct_change(periods=periods, fill_method=fill_method) + with expect_warning_if( + fill_method is not no_default or pdf.isna().any().any() + ): + expected = pdf.pct_change(periods=periods, fill_method=fill_method) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_pipe.py b/python/cudf/cudf/tests/dataframe/methods/test_pipe.py new file mode 100644 index 00000000000..f20724da0c5 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_pipe.py @@ -0,0 +1,62 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +def test_dataframe_pipe(): + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + + def add_int_col(df, column): + df[column] = df._constructor_sliced([10, 20, 30, 40]) + return df + + def add_str_col(df, column): + df[column] = df._constructor_sliced(["a", "b", "xyz", "ai"]) + return df + + expected = ( + pdf.pipe(add_int_col, "one") + .pipe(add_int_col, column="two") + .pipe(add_str_col, "three") + ) + actual = ( + gdf.pipe(add_int_col, "one") + .pipe(add_int_col, column="two") + .pipe(add_str_col, "three") + ) + + assert_eq(expected, actual) + + expected = ( + pdf.pipe((add_str_col, "df"), column="one") + .pipe(add_str_col, column="two") + .pipe(add_int_col, "three") + ) + actual = ( + gdf.pipe((add_str_col, "df"), column="one") + .pipe(add_str_col, column="two") + .pipe(add_int_col, "three") + ) + + assert_eq(expected, actual) + + +def test_dataframe_pipe_error(): + pdf = pd.DataFrame() + gdf = cudf.DataFrame() + + def custom_func(df, column): + df[column] = df._constructor_sliced([10, 20, 30, 40]) + return df + + assert_exceptions_equal( + lfunc=pdf.pipe, + rfunc=gdf.pipe, + lfunc_args_and_kwargs=([(custom_func, "columns")], {"columns": "d"}), + rfunc_args_and_kwargs=([(custom_func, "columns")], {"columns": "d"}), + ) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_reductions.py b/python/cudf/cudf/tests/dataframe/methods/test_reductions.py index 9b4134e5b3b..0061115bc08 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_reductions.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_reductions.py @@ -181,3 +181,72 @@ def test_dataframe_axis_0_preserve_column_type_in_index(columns): result = cudf_df.sum(axis=0) expected = pd_df.sum(axis=0) assert_eq(result, expected, check_index_type=True) + + +def test_dataframe_reduction_error(): + gdf = cudf.DataFrame( + { + "a": cudf.Series([1, 2, 3], dtype="float"), + "d": cudf.Series([10, 20, 30], dtype="timedelta64[ns]"), + } + ) + + with 
pytest.raises(TypeError): + gdf.sum() + + +def test_mean_timeseries(numeric_only): + gdf = cudf.datasets.timeseries() + if not numeric_only: + gdf = gdf.select_dtypes(include="number") + pdf = gdf.to_pandas() + + expected = pdf.mean(numeric_only=numeric_only) + actual = gdf.mean(numeric_only=numeric_only) + + assert_eq(expected, actual) + + +def test_std_different_dtypes(numeric_only): + gdf = cudf.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": ["a", "b", "c", "d", "e"], + "c": [1.0, 2.0, 3.0, 4.0, 5.0], + } + ) + if not numeric_only: + gdf = gdf.select_dtypes(include="number") + pdf = gdf.to_pandas() + + expected = pdf.std(numeric_only=numeric_only) + actual = gdf.std(numeric_only=numeric_only) + + assert_eq(expected, actual) + + +def test_empty_numeric_only(): + gdf = cudf.DataFrame( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + } + ) + pdf = gdf.to_pandas() + expected = pdf.prod(numeric_only=True) + actual = gdf.prod(numeric_only=True) + assert_eq(expected, actual, check_dtype=True) + + +@pytest.mark.parametrize( + "op", + ["count", "kurt", "kurtosis", "skew"], +) +def test_dataframe_axis1_unsupported_ops(op): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [8, 9, 10]}) + + with pytest.raises( + NotImplementedError, match="Only axis=0 is currently supported." + ): + getattr(df, op)(axis=1) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_reindex.py b/python/cudf/cudf/tests/dataframe/methods/test_reindex.py index 19efd968a95..bfe6ecf170f 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_reindex.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_reindex.py @@ -6,6 +6,7 @@ import cudf from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal @pytest.mark.parametrize("copy", [True, False]) @@ -189,3 +190,52 @@ def test_reindex_columns_rangeindex_keeps_rangeindex(name, klass): result = df.reindex(columns=new_columns).columns expected = pd.RangeIndex(3, name=exp_name) assert_eq(result, expected) + + +def test_dataframe_duplicate_index_reindex(): + gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1]) + pdf = gdf.to_pandas() + + assert_exceptions_equal( + gdf.reindex, + pdf.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) + + +def test_dataframe_reindex_keep_colname(): + gdf = cudf.DataFrame([1], columns=cudf.Index([1], name="foo")) + result = gdf.reindex(index=[0, 1]) + expected = cudf.DataFrame( + [1, None], columns=cudf.Index([1], name="foo"), index=[0, 1] + ) + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "index_data,name", + [([10, 13], "a"), ([30, 40, 20], "b"), (["ef"], "c"), ([2, 3], "Z")], +) +def test_dataframe_reindex_with_index_names(index_data, name): + gdf = cudf.DataFrame( + { + "a": [10, 12, 13], + "b": [20, 30, 40], + "c": cudf.Series(["ab", "cd", "ef"], dtype="category"), + } + ) + if name in gdf.columns: + gdf = gdf.set_index(name) + pdf = gdf.to_pandas() + + gidx = cudf.Index(index_data, name=name) + actual = gdf.reindex(gidx) + expected = pdf.reindex(gidx.to_pandas()) + + assert_eq(actual, expected) + + actual = gdf.reindex(index_data) + expected = pdf.reindex(index_data) + + assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_rename.py b/python/cudf/cudf/tests/dataframe/methods/test_rename.py index b7e9bf3c7ce..345342574c0 100644 --- 
a/python/cudf/cudf/tests/dataframe/methods/test_rename.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_rename.py @@ -68,3 +68,82 @@ def test_dataframe_column_rename(axis): got = gdf.rename(columns=rename_mapper) assert_eq(expect, got) + + +def test_rename_reset_label_dtype(): + data = {1: [2]} + col_mapping = {1: "a"} + result = cudf.DataFrame(data).rename(columns=col_mapping) + expected = pd.DataFrame(data).rename(columns=col_mapping) + assert_eq(result, expected) + + +def test_dataframe_rename_columns_keep_type(): + gdf = cudf.DataFrame([[1, 2, 3]]) + gdf.columns = cudf.Index([4, 5, 6], dtype=np.int8) + result = gdf.rename({4: 50}, axis="columns").columns + expected = pd.Index([50, 5, 6], dtype=np.int8) + assert_eq(result, expected) + + +def test_dataframe_rename_duplicate_column(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + with pytest.raises( + ValueError, match="Duplicate column names are not allowed" + ): + gdf.rename(columns={"a": "b"}, inplace=True) + + +@pytest.mark.parametrize("level", ["x", 0]) +def test_rename_for_level_MultiIndex_dataframe(level): + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + index = {0: 123, 1: 4, 2: 6} + pdf = pd.DataFrame( + data, + index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]), + ) + pdf.index.names = ["x", "y", "z"] + gdf = cudf.from_pandas(pdf) + + expect = pdf.rename(index=index, level=level) + got = gdf.rename(index=index, level=level) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "columns", + [{"a": "f", "b": "g"}, {1: 3, 2: 4}, lambda s: 2 * s], +) +@pytest.mark.parametrize("level", [0, 1]) +def test_rename_for_level_MultiColumn_dataframe(columns, level): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) + + pdf = gdf.to_pandas() + + expect = pdf.rename(columns=columns, level=level) + got = gdf.rename(columns=columns, level=level) + + assert_eq(expect, got) + + +def test_rename_for_level_RangeIndex_dataframe(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + pdf = gdf.to_pandas() + + expect = pdf.rename(columns={"a": "f"}, index={0: 3, 1: 4}, level=0) + got = gdf.rename(columns={"a": "f"}, index={0: 3, 1: 4}, level=0) + + assert_eq(expect, got) + + +def test_rename_for_level_is_None_MC(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) + pdf = gdf.to_pandas() + + expect = pdf.rename(columns={"a": "f"}, level=None) + got = gdf.rename(columns={"a": "f"}, level=None) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_sample.py b/python/cudf/cudf/tests/dataframe/methods/test_sample.py new file mode 100644 index 00000000000..5d46f8c0d11 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_sample.py @@ -0,0 +1,241 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +import itertools + +import cupy as cp +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +def shape_checker(expected, got): + assert expected.shape == got.shape + + +def exact_checker(expected, got): + assert_eq(expected, got) + + +@pytest.fixture( + params=itertools.product([0, 2, None], [0.3, None]), + ids=lambda arg: f"n={arg[0]}-frac={arg[1]}", +) +def sample_n_frac(request): + """ + Specific to `test_sample*` tests. 
+ """ + n, frac = request.param + if n is not None and frac is not None: + pytest.skip("Cannot specify both n and frac.") + return n, frac + + +@pytest.fixture(params=[None, "builtin_list", "ndarray"]) +def make_weights_axis_0(request): + """Specific to `test_sample*_axis_0` tests. + Only testing weights array that matches type with random state. + """ + + if request.param is None: + return lambda *_: (None, None) + elif request.param == "builtin-list": + return lambda size, _: ([1] * size, [1] * size) + else: + + def wrapped(size, numpy_weights_for_cudf): + # Uniform distribution, non-normalized + if numpy_weights_for_cudf: + return np.ones(size), np.ones(size) + else: + return np.ones(size), cp.ones(size) + + return wrapped + + +@pytest.mark.parametrize( + "make_weights_axis_1", + [lambda _: None, lambda s: [1] * s, lambda s: np.ones(s)], +) +@pytest.mark.parametrize( + "pd_random_state, gd_random_state, checker", + [ + (None, None, shape_checker), + (42, 42, shape_checker), + (np.random.RandomState(42), np.random.RandomState(42), exact_checker), + ], + ids=["None", "IntSeed", "NumpyRandomState"], +) +def test_sample_axis_1( + sample_n_frac, + pd_random_state, + gd_random_state, + checker, + make_weights_axis_1, +): + n, frac = sample_n_frac + + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ) + df = cudf.DataFrame.from_pandas(pdf) + + weights = make_weights_axis_1(len(pdf.columns)) + + expected = pdf.sample( + n=n, + frac=frac, + replace=False, + random_state=pd_random_state, + weights=weights, + axis=1, + ) + got = df.sample( + n=n, + frac=frac, + replace=False, + random_state=gd_random_state, + weights=weights, + axis=1, + ) + checker(expected, got) + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ), + pd.Series([1, 2, 3, 4, 5]), + ], +) +@pytest.mark.parametrize( + "pd_random_state, gd_random_state, checker", + [ + (None, None, shape_checker), + (42, 42, shape_checker), + (np.random.RandomState(42), np.random.RandomState(42), exact_checker), + (np.random.RandomState(42), cp.random.RandomState(42), shape_checker), + ], + ids=["None", "IntSeed", "NumpyRandomState", "CupyRandomState"], +) +@pytest.mark.parametrize("replace", [True, False]) +def test_sample_axis_0( + pdf, + sample_n_frac, + replace, + pd_random_state, + gd_random_state, + checker, + make_weights_axis_0, +): + n, frac = sample_n_frac + + df = cudf.from_pandas(pdf) + + pd_weights, gd_weights = make_weights_axis_0( + len(pdf), isinstance(gd_random_state, np.random.RandomState) + ) + if ( + not replace + and not isinstance(gd_random_state, np.random.RandomState) + and gd_weights is not None + ): + pytest.skip( + "`cupy.random.RandomState` doesn't support weighted sampling " + "without replacement." 
+ ) + + expected = pdf.sample( + n=n, + frac=frac, + replace=replace, + random_state=pd_random_state, + weights=pd_weights, + axis=0, + ) + + got = df.sample( + n=n, + frac=frac, + replace=replace, + random_state=gd_random_state, + weights=gd_weights, + axis=0, + ) + checker(expected, got) + + +@pytest.mark.parametrize("replace", [True, False]) +@pytest.mark.parametrize( + "random_state_lib", [cp.random.RandomState, np.random.RandomState] +) +def test_sample_reproducibility(replace, random_state_lib): + df = cudf.DataFrame({"a": cp.arange(0, 25)}) + + n = 25 + expected = df.sample(n, replace=replace, random_state=random_state_lib(10)) + out = df.sample(n, replace=replace, random_state=random_state_lib(10)) + + assert_eq(expected, out) + + +def test_sample_invalid_n_frac_combo(axis): + n, frac = 2, 0.5 + pdf = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "float": [0.05, 0.2, 0.3, 0.2, 0.25], + "int": [1, 3, 5, 4, 2], + }, + ) + df = cudf.DataFrame.from_pandas(pdf) + + assert_exceptions_equal( + lfunc=pdf.sample, + rfunc=df.sample, + lfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), + rfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), + ) + + +@pytest.mark.parametrize("n, frac", [(100, None), (None, 3)]) +def test_oversample_without_replace(n, frac, axis): + pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}) + df = cudf.DataFrame.from_pandas(pdf) + + assert_exceptions_equal( + lfunc=pdf.sample, + rfunc=df.sample, + lfunc_args_and_kwargs=( + [], + {"n": n, "frac": frac, "axis": axis, "replace": False}, + ), + rfunc_args_and_kwargs=( + [], + {"n": n, "frac": frac, "axis": axis, "replace": False}, + ), + ) + + +@pytest.mark.parametrize("random_state", [None, cp.random.RandomState(42)]) +def test_sample_unsupported_arguments(random_state): + df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]}) + with pytest.raises( + NotImplementedError, + match="Random sampling with cupy does not support these inputs.", + ): + df.sample( + n=2, replace=False, random_state=random_state, weights=[1] * 5 + ) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py index d29a6bbcfaf..7af8a5f94de 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_arrow.py @@ -161,3 +161,71 @@ def test_to_arrow_categorical(): assert isinstance(pa_gs, pa.Array) assert pa.Array.equals(pa_s, pa_gs) + + +@pytest.mark.parametrize( + "data", + [ + {0: [1, 2, 3], 2: [10, 11, 23]}, + {("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]}, + ], +) +def test_non_string_column_name_to_arrow(data): + df = cudf.DataFrame(data) + + expected = df.to_arrow() + actual = pa.Table.from_pandas(df.to_pandas()) + + assert expected.equals(actual) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [{"one": 3, "two": 4, "three": 10}]}, + { + "left-a": [0, 1, 2], + "a": [{"x": 0.23, "y": 43}, None, {"x": 23.9, "y": 4.3}], + "right-a": ["abc", "def", "ghi"], + }, + { + "left-a": [{"a": 1}, None, None], + "a": [ + {"one": 324, "two": 23432, "three": 324}, + None, + {"one": 3.24, "two": 1, "three": 324}, + ], + "right-a": ["abc", "def", "ghi"], + }, + ], +) +def test_dataframe_roundtrip_arrow_struct_dtype(data): + gdf = cudf.DataFrame(data) + table = gdf.to_arrow() + expected = cudf.DataFrame.from_arrow(table) + + assert_eq(gdf, expected) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [[1], [2], [3]]}, + { + "left-a": [0, 1, 2], + "a": [[1], None, [3]], + "right-a": ["abc", "def", "ghi"], 
+ }, + { + "left-a": [[], None, None], + "a": [[1], None, [3]], + "right-a": ["abc", "def", "ghi"], + }, + ], +) +def test_dataframe_roundtrip_arrow_list_dtype(data): + gdf = cudf.DataFrame(data) + table = gdf.to_arrow() + expected = cudf.DataFrame.from_arrow(table) + + assert_eq(gdf, expected) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py b/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py index 739d97c28a5..4fd728d06f1 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_pandas.py @@ -1,9 +1,14 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import datetime +import decimal import numpy as np +import pandas as pd +import pyarrow as pa import pytest import cudf +from cudf.testing import assert_eq def test_to_pandas(): @@ -36,3 +41,191 @@ def test_list_to_pandas_nullable_true(): df = cudf.DataFrame({"a": cudf.Series([[1, 2, 3]])}) with pytest.raises(NotImplementedError): df.to_pandas(nullable=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_dataframe_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + df = cudf.DataFrame({"a": pa_array}) + result = df.to_pandas(arrow_type=True) + expected = pd.DataFrame({"a": pd.arrays.ArrowExtensionArray(pa_array)}) + pd.testing.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_dataframe_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + df = cudf.DataFrame({"a": pa_array}) + with pytest.raises(ValueError): + df.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "df,expected_pdf", + [ + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series([1, 2, None, 3], dtype="uint8"), + "b": cudf.Series([23, None, None, 32], dtype="uint16"), + } + ), + pd.DataFrame( + { + "a": pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()), + "b": pd.Series( + [23, None, None, 32], dtype=pd.UInt16Dtype() + ), + } + ), + ), + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series([None, 123, None, 1], dtype="uint32"), + "b": cudf.Series( + [234, 2323, 23432, None, None, 224], dtype="uint64" + ), + } + ), + pd.DataFrame( + { + "a": pd.Series( + [None, 123, None, 1], dtype=pd.UInt32Dtype() + ), + "b": pd.Series( + [234, 2323, 23432, None, None, 224], + dtype=pd.UInt64Dtype(), + ), + } + ), + ), + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series( + [-10, 1, None, -1, None, 3], dtype="int8" + ), + "b": cudf.Series( + [111, None, 222, None, 13], dtype="int16" + ), + } + ), + pd.DataFrame( + { + "a": pd.Series( + [-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype() + ), + "b": pd.Series( + [111, None, 222, None, 13], dtype=pd.Int16Dtype() + ), + } + ), + ), + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series( + [11, None, 22, 33, None, 2, None, 3], dtype="int32" + ), + "b": cudf.Series( + [32431, None, None, 32322, 0, 10, -32324, None], + dtype="int64", + ), + } + ), + pd.DataFrame( + { + "a": pd.Series( + [11, None, 22, 33, None, 2, None, 3], + dtype=pd.Int32Dtype(), + ), + "b": pd.Series( + [32431, None, None, 32322, 0, 10, -32324, None], + dtype=pd.Int64Dtype(), + ), + } + ), + ), + ( + lambda: cudf.DataFrame( + { + "a": cudf.Series( + [True, None, False, None, False, True, True, False], + 
dtype="bool_", + ), + "b": cudf.Series( + [ + "abc", + "a", + None, + "hello world", + "foo buzz", + "", + None, + "rapids ai", + ], + dtype="object", + ), + "c": cudf.Series( + [0.1, None, 0.2, None, 3, 4, 1000, None], + dtype="float64", + ), + } + ), + pd.DataFrame( + { + "a": pd.Series( + [True, None, False, None, False, True, True, False], + dtype=pd.BooleanDtype(), + ), + "b": pd.Series( + [ + "abc", + "a", + None, + "hello world", + "foo buzz", + "", + None, + "rapids ai", + ], + dtype=pd.StringDtype(), + ), + "c": pd.Series( + [0.1, None, 0.2, None, 3, 4, 1000, None], + dtype=pd.Float64Dtype(), + ), + } + ), + ), + ], +) +def test_dataframe_to_pandas_nullable_dtypes(df, expected_pdf): + actual_pdf = df().to_pandas(nullable=True) + + assert_eq(actual_pdf, expected_pdf) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_update.py b/python/cudf/cudf/tests/dataframe/methods/test_update.py new file mode 100644 index 00000000000..92d7a117403 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_update.py @@ -0,0 +1,91 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize("overwrite", [True, False]) +@pytest.mark.parametrize( + "left_keys,right_keys", + [ + [("a", "b"), ("a", "b")], + [("a", "b"), ("a", "c")], + [("a", "b"), ("d", "e")], + ], +) +@pytest.mark.parametrize( + "data_left,data_right", + [ + [([1, 2, 3], [3, 4, 5]), ([1, 2, 3], [3, 4, 5])], + [ + ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), + ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), + ], + [ + ([True, False, True], [False, False, False]), + ([True, False, True], [False, False, False]), + ], + [ + ([np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]), + ([np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]), + ], + [([1, 2, 3], [3, 4, 5]), ([1, 2, 4], [30, 40, 50])], + [ + ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), + ([1.0, 2.0, 4.0], [30.0, 40.0, 50.0]), + ], + [([1, 2, 3], [3, 4, 5]), ([10, 20, 40], [30, 40, 50])], + [ + ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), + ([10.0, 20.0, 40.0], [30.0, 40.0, 50.0]), + ], + ], +) +def test_update_for_dataframes( + left_keys, right_keys, data_left, data_right, overwrite +): + errors = "ignore" + join = "left" + left = dict(zip(left_keys, data_left, strict=True)) + right = dict(zip(right_keys, data_right, strict=True)) + pdf = pd.DataFrame(left) + gdf = cudf.DataFrame(left, nan_as_null=False) + + other_pd = pd.DataFrame(right) + other_gd = cudf.DataFrame(right, nan_as_null=False) + + pdf.update(other=other_pd, join=join, overwrite=overwrite, errors=errors) + gdf.update(other=other_gd, join=join, overwrite=overwrite, errors=errors) + + assert_eq(pdf, gdf, check_dtype=False) + + +def test_update_for_right_join(): + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + other_gd = cudf.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) + + with pytest.raises( + NotImplementedError, match="Only left join is supported" + ): + gdf.update(other_gd, join="right") + + +def test_update_for_data_overlap(): + errors = "raise" + pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) + + other_pd = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) + other_gd = cudf.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) + + assert_exceptions_equal( + lfunc=pdf.update, + rfunc=gdf.update, + lfunc_args_and_kwargs=([other_pd, errors], {}), 
+        rfunc_args_and_kwargs=([other_gd, errors], {}),
+    )
diff --git a/python/cudf/cudf/tests/dataframe/methods/test_value_counts.py b/python/cudf/cudf/tests/dataframe/methods/test_value_counts.py
new file mode 100644
index 00000000000..26357348182
--- /dev/null
+++ b/python/cudf/cudf/tests/dataframe/methods/test_value_counts.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+def test_value_counts_no_subset():
+    gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [1, 1, 0]})
+    with pytest.raises(KeyError):
+        gdf.value_counts(subset=["not_a_column_name"])
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+        {
+            "first_name": ["John", "Anne", "John", "Beth"],
+            "middle_name": ["Smith", None, None, "Louise"],
+        },
+    ],
+)
+@pytest.mark.parametrize("normalize", [True, False])
+@pytest.mark.parametrize("use_subset", [True, False])
+def test_value_counts(
+    data,
+    sort,
+    ascending,
+    normalize,
+    dropna,
+    use_subset,
+):
+    subset = [next(iter(data.keys()))]
+    gdf = cudf.DataFrame(data)
+    pdf = gdf.to_pandas()
+
+    got = gdf.value_counts(
+        subset=subset if (use_subset) else None,
+        sort=sort,
+        ascending=ascending,
+        normalize=normalize,
+        dropna=dropna,
+    )
+    expected = pdf.value_counts(
+        subset=subset if (use_subset) else None,
+        sort=sort,
+        ascending=ascending,
+        normalize=normalize,
+        dropna=dropna,
+    )
+
+    if not dropna:
+        # Convert the Pandas series to a cuDF one due to the difference
+        # in the handling of NaNs between the two (<NA> in cuDF and
+        # NaN in Pandas) when dropna=False.
+        assert_eq(got.sort_index(), cudf.from_pandas(expected).sort_index())
+    else:
+        assert_eq(got.sort_index(), expected.sort_index())
diff --git a/python/cudf/cudf/tests/dataframe/test_attributes.py b/python/cudf/cudf/tests/dataframe/test_attributes.py
index 3a3c43a15c7..dd0ecbf5d61 100644
--- a/python/cudf/cudf/tests/dataframe/test_attributes.py
+++ b/python/cudf/cudf/tests/dataframe/test_attributes.py
@@ -182,6 +182,135 @@ def test_ndim():
     assert pdf.ndim == gdf.ndim
 
+
+@pytest.mark.parametrize("names", [["abc", "def"], [1, 2], ["abc", 10]])
+def test_dataframe_multiindex_column_names(names):
+    arrays = [["A", "A", "B", "B"], ["one", "two", "one", "two"]]
+    tuples = list(zip(*arrays, strict=True))
+    index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
+
+    pdf = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=index)
+    df = cudf.from_pandas(pdf)
+
+    assert_eq(df, pdf)
+    assert_eq(df.columns.names, pdf.columns.names)
+    pdf.columns.names = names
+    df.columns.names = names
+    assert_eq(df, pdf)
+    assert_eq(df.columns.names, pdf.columns.names)
+
+
+@pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA])
+@pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA])
+@pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]])
+def test_dataframe_contains(name, contains, other_names):
+    column_names = [name, *other_names]
+    gdf = cudf.DataFrame({c: [0] for c in column_names})
+    pdf = pd.DataFrame({c: [0] for c in column_names})
+
+    assert_eq(gdf, pdf)
+
+    if contains is cudf.NA or name is cudf.NA:
+        expectation = contains is cudf.NA and name is cudf.NA
+        assert (contains in pdf) == expectation
+        assert (contains in gdf) == expectation
+    elif gdf.columns.dtype.kind == "f":
+        # In some cases, the columns are converted to an Index[float] based on
+        # the other column names. That casts name values from None to np.nan.
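+        # (e.g. pd.Index([None, 1, 2]) becomes a float64 index holding
+        # [nan, 1.0, 2.0], so membership has to be checked against np.nan).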
+ expectation = contains is np.nan and (name is None or name is np.nan) + assert (contains in pdf) == expectation + assert (contains in gdf) == expectation + else: + expectation = contains == name or ( + contains is np.nan and name is np.nan + ) + assert (contains in pdf) == expectation + assert (contains in gdf) == expectation + + assert (contains in pdf) == (contains in gdf) + + +@pytest.mark.parametrize( + "data", + [ + {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, + {"a": [[{"b": 567}], None] * 10}, + {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, + ], +) +def test_dataframe_values_complex_types(data): + gdf = cudf.DataFrame(data) + with pytest.raises(NotImplementedError): + gdf.values + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame(), + pd.DataFrame(index=[100, 10, 1, 0]), + pd.DataFrame(columns=["a", "b", "c", "d"]), + pd.DataFrame(columns=["a", "b", "c", "d"], index=[100]), + pd.DataFrame( + columns=["a", "b", "c", "d"], index=[100, 10000, 2131, 133] + ), + pd.DataFrame({"a": [1, 2, 3], "b": ["abc", "xyz", "klm"]}), + ], +) +def test_dataframe_size(df): + pdf = df + gdf = cudf.from_pandas(pdf) + + assert_eq(pdf.size, gdf.size) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame(), + pd.DataFrame(index=[100, 10, 1, 0]), + pd.DataFrame(columns=["a", "b", "c", "d"]), + pd.DataFrame(columns=["a", "b", "c", "d"], index=[100]), + pd.DataFrame( + columns=["a", "b", "c", "d"], index=[100, 10000, 2131, 133] + ), + pd.DataFrame({"a": [1, 2, 3], "b": ["abc", "xyz", "klm"]}), + ], +) +def test_dataframe_empty(df): + pdf = df + gdf = cudf.from_pandas(pdf) + + assert_eq(pdf.empty, gdf.empty) + + +def test_cudf_arrow_array_error(): + df = cudf.DataFrame({"a": [1, 2, 3]}) + + with pytest.raises( + TypeError, + match="Implicit conversion to a host PyArrow object via " + "__arrow_array__ is not allowed. Consider using .to_arrow()", + ): + df.__arrow_array__() + + sr = cudf.Series([1, 2, 3]) + + with pytest.raises( + TypeError, + match="Implicit conversion to a host PyArrow object via " + "__arrow_array__ is not allowed. Consider using .to_arrow()", + ): + sr.__arrow_array__() + + sr = cudf.Series(["a", "b", "c"]) + with pytest.raises( + TypeError, + match="Implicit conversion to a host PyArrow object via " + "__arrow_array__ is not allowed. Consider using .to_arrow()", + ): + sr.__arrow_array__() + + @pytest.mark.parametrize( "index", [ diff --git a/python/cudf/cudf/tests/dataframe/test_binops.py b/python/cudf/cudf/tests/dataframe/test_binops.py new file mode 100644 index 00000000000..a8540b4e711 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/test_binops.py @@ -0,0 +1,170 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize( + "expected", + [ + pd.RangeIndex(1, 2, name="a"), + pd.Index([1], dtype=np.int8, name="a"), + pd.MultiIndex.from_arrays([[1]], names=["a"]), + ], +) +@pytest.mark.parametrize("binop", [lambda df: df == df, lambda df: df - 1]) +def test_dataframe_binop_preserves_column_metadata(expected, binop): + df = cudf.DataFrame([1], columns=expected) + result = binop(df).columns + pd.testing.assert_index_equal(result, expected, exact=True) + + +def test_dataframe_series_dot(): + pser = pd.Series(range(2)) + gser = cudf.from_pandas(pser) + + expected = pser @ pser + actual = gser @ gser + + assert_eq(expected, actual) + + pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("ab")) + gdf = cudf.from_pandas(pdf) + + expected = pser @ pdf + actual = gser @ gdf + + assert_eq(expected, actual) + + assert_exceptions_equal( + lfunc=pdf.dot, + rfunc=gdf.dot, + lfunc_args_and_kwargs=([pser], {}), + rfunc_args_and_kwargs=([gser], {}), + ) + + assert_exceptions_equal( + lfunc=pdf.dot, + rfunc=gdf.dot, + lfunc_args_and_kwargs=([pdf], {}), + rfunc_args_and_kwargs=([gdf], {}), + ) + + pser = pd.Series(range(2), index=["a", "k"]) + gser = cudf.from_pandas(pser) + + pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("ab"), index=["a", "k"]) + gdf = cudf.from_pandas(pdf) + + expected = pser @ pdf + actual = gser @ gdf + + assert_eq(expected, actual) + + actual = gdf @ [2, 3] + expected = pdf @ [2, 3] + + assert_eq(expected, actual) + + actual = pser @ [12, 13] + expected = gser @ [12, 13] + + assert_eq(expected, actual) + + +def test_dataframe_binop_with_datetime_index(): + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + rng.random(size=(2, 2)), + columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), + ) + ser = pd.Series( + rng.random(2), + index=pd.Index( + [ + "2000-01-04", + "2000-01-03", + ], + dtype="datetime64[ns]", + ), + ) + gdf = cudf.from_pandas(df) + gser = cudf.from_pandas(ser) + expected = df - ser + got = gdf - gser + assert_eq(expected, got) + + +def test_dataframe_binop_and_where(): + rng = np.random.default_rng(seed=0) + df = pd.DataFrame(rng.random(size=(2, 2)), columns=pd.Index([True, False])) + gdf = cudf.from_pandas(df) + + expected = df > 1 + got = gdf > 1 + + assert_eq(expected, got) + + expected = df[df > 1] + got = gdf[gdf > 1] + + assert_eq(expected, got) + + +def test_dataframe_binop_with_mixed_string_types(): + rng = np.random.default_rng(seed=0) + df1 = pd.DataFrame(rng.random(size=(3, 3)), columns=pd.Index([0, 1, 2])) + df2 = pd.DataFrame( + rng.random(size=(6, 6)), + columns=pd.Index([0, 1, 2, "VhDoHxRaqt", "X0NNHBIPfA", "5FbhPtS0D1"]), + ) + gdf1 = cudf.from_pandas(df1) + gdf2 = cudf.from_pandas(df2) + + expected = df2 + df1 + got = gdf2 + gdf1 + + assert_eq(expected, got) + + +def test_dataframe_binop_with_mixed_date_types(): + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + rng.random(size=(2, 2)), + columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), + ) + ser = pd.Series(rng.random(size=3), index=[0, 1, 2]) + gdf = cudf.from_pandas(df) + gser = cudf.from_pandas(ser) + expected = df - ser + got = gdf - gser + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "df1", + [ + pd.DataFrame({"a": [10, 11, 12]}, index=["a", "b", "z"]), + pd.DataFrame({"z": ["a"]}), + pd.DataFrame({"a": [], "b": []}), + ], +) +@pytest.mark.parametrize( 
+ "df2", + [ + pd.DataFrame(), + pd.DataFrame({"a": ["a", "a", "c", "z", "A"], "z": [1, 2, 3, 4, 5]}), + ], +) +def test_dataframe_error_equality(df1, df2, comparison_op): + gdf1 = cudf.from_pandas(df1) + gdf2 = cudf.from_pandas(df2) + + assert_exceptions_equal( + comparison_op, comparison_op, ([df1, df2],), ([gdf1, gdf2],) + ) diff --git a/python/cudf/cudf/tests/dataframe/test_constructors.py b/python/cudf/cudf/tests/dataframe/test_constructors.py index 28c515757f0..21fb39059e8 100644 --- a/python/cudf/cudf/tests/dataframe/test_constructors.py +++ b/python/cudf/cudf/tests/dataframe/test_constructors.py @@ -13,6 +13,7 @@ import cudf from cudf.core.column.column import as_column from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal def test_init_via_list_of_tuples(): @@ -1356,6 +1357,465 @@ def test_create_interval_df(data1, data2, data3, data4, interval_closed): assert_eq(expect_three, got_three) +def test_roundtrip_dataframe_plc_table(): + pdf = pd.DataFrame( + { + "a": [None, None, np.nan, None], + "b": [np.nan, None, np.nan, None], + } + ) + expect = cudf.DataFrame.from_pandas(pdf) + actual = cudf.DataFrame.from_pylibcudf(*expect.to_pylibcudf()) + assert_eq(expect, actual) + + +def test_dataframe_from_generator(): + pdf = pd.DataFrame((i for i in range(5))) + gdf = cudf.DataFrame((i for i in range(5))) + assert_eq(pdf, gdf) + + +@pytest.mark.parametrize( + "dtype", ["datetime64[ns]", "timedelta64[ns]", "int64", "float32"] +) +def test_dataframe_mixed_dtype_error(dtype): + pdf = pd.Series([1, 2, 3], dtype=dtype).to_frame().astype(object) + with pytest.raises(TypeError): + cudf.from_pandas(pdf) + + +def test_dataframe_from_arrow_slice(): + table = pa.Table.from_pandas( + pd.DataFrame.from_dict( + {"a": ["aa", "bb", "cc"] * 3, "b": [1, 2, 3] * 3} + ) + ) + table_slice = table.slice(3, 7) + + expected = table_slice.to_pandas() + actual = cudf.DataFrame.from_arrow(table_slice) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data,index", + [ + ({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None), + ( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + }, + [10, 11], + ), + ( + { + "a": [1, 2, 3], + "b": ["x", "y", "z"], + }, + [10, 11], + ), + ([[10, 11], [12, 13]], ["a", "b", "c"]), + ], +) +def test_dataframe_init_length_error(data, index): + assert_exceptions_equal( + lfunc=pd.DataFrame, + rfunc=cudf.DataFrame, + lfunc_args_and_kwargs=( + [], + {"data": data, "index": index}, + ), + rfunc_args_and_kwargs=( + [], + {"data": data, "index": index}, + ), + ) + + +def test_complex_types_from_arrow(): + expected = pa.Table.from_arrays( + [ + pa.array([1, 2, 3]), + pa.array([10, 20, 30]), + pa.array([{"a": 9}, {"b": 10}, {"c": 11}]), + pa.array([[{"a": 1}], [{"b": 2}], [{"c": 3}]]), + pa.array([10, 11, 12]).cast(pa.decimal128(21, 2)), + pa.array([{"a": 9}, {"b": 10, "c": {"g": 43}}, {"c": {"a": 10}}]), + ], + names=["a", "b", "c", "d", "e", "f"], + ) + + df = cudf.DataFrame.from_arrow(expected) + actual = df.to_arrow() + + assert expected.equals(actual) + + +def test_dataframe_constructor_column_index_only(): + columns = ["a", "b", "c"] + index = ["r1", "r2", "r3"] + + gdf = cudf.DataFrame(index=index, columns=columns) + assert gdf["a"]._column is not gdf["b"]._column + assert gdf["b"]._column is not gdf["c"]._column + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame( + {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]} + ), + pd.DataFrame(), + ], +) +@pytest.mark.parametrize( + "columns", + [ + None, + ["a"], + ["c", 
"a"], + ["b", "a", "c"], + [], + pd.Index(["c", "a"]), + cudf.Index(["c", "a"]), + ["abc", "a"], + ["column_not_exists1", "column_not_exists2"], + ], +) +@pytest.mark.parametrize("index", [["abc", "def", "ghi"]]) +def test_dataframe_constructor_columns(df, columns, index, request): + def assert_local_eq(actual, df, expected, host_columns): + check_index_type = not expected.empty + if host_columns is not None and any( + col not in df.columns for col in host_columns + ): + assert_eq( + expected, + actual, + check_dtype=False, + check_index_type=check_index_type, + ) + else: + assert_eq( + expected, + actual, + check_index_type=check_index_type, + check_column_type=False, + ) + + gdf = cudf.from_pandas(df) + host_columns = ( + columns.to_pandas() if isinstance(columns, cudf.Index) else columns + ) + + expected = pd.DataFrame(df, columns=host_columns, index=index) + actual = cudf.DataFrame(gdf, columns=columns, index=index) + + assert_local_eq(actual, df, expected, host_columns) + + +def test_dataframe_from_pandas_duplicate_columns(): + pdf = pd.DataFrame(columns=["a", "b", "c", "a"]) + pdf["a"] = [1, 2, 3] + + with pytest.raises( + ValueError, match="Duplicate column names are not allowed" + ): + cudf.from_pandas(pdf) + + +@pytest.mark.parametrize( + "data", + [ + [{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}], + [{"a": 1, "b": 2, "c": None}, {"a": None, "b": 5, "c": 6}], + [{"a": 1, "b": 2}, {"a": 1, "b": 5, "c": 6}], + [{"a": 1, "b": 2}, {"b": 5, "c": 6}], + [{}, {"a": 1, "b": 5, "c": 6}], + [{"a": 1, "b": 2, "c": 3}, {"a": 4.5, "b": 5.5, "c": 6.5}], + ], +) +def test_dataframe_init_from_list_of_dicts(data): + expect = pd.DataFrame(data) + got = cudf.DataFrame(data) + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + None, + [], + [1], + {"a": [10, 11, 12]}, + { + "a": [10, 11, 12], + "another column name": [12, 22, 34], + "xyz": [0, 10, 11], + }, + ], +) +@pytest.mark.parametrize( + "columns", + [["a"], ["another column name"], None, pd.Index(["a"], name="index name")], +) +def test_dataframe_init_with_columns(data, columns): + pdf = pd.DataFrame(data, columns=columns) + gdf = cudf.DataFrame(data, columns=columns) + + assert_eq( + pdf, + gdf, + check_index_type=len(pdf.index) != 0, + check_dtype=not (pdf.empty and len(pdf.columns)), + check_column_type=False, + ) + + +@pytest.mark.parametrize( + "data, ignore_dtype", + [ + ([pd.Series([1, 2, 3])], False), + ([pd.Series(index=[1, 2, 3], dtype="float64")], False), + ([pd.Series(name="empty series name", dtype="float64")], False), + ( + [pd.Series([1]), pd.Series([], dtype="float64"), pd.Series([3])], + False, + ), + ( + [ + pd.Series([1, 0.324234, 32424.323, -1233, 34242]), + pd.Series([], dtype="float64"), + pd.Series([3], name="series that is named"), + ], + False, + ), + ([pd.Series([1, 2, 3], name="hi")] * 10, False), + ([pd.Series([1, 2, 3], name=None, index=[10, 11, 12])] * 10, False), + ( + [ + pd.Series([1, 2, 3], name=None, index=[10, 11, 12]), + pd.Series([1, 2, 30], name=None, index=[13, 144, 15]), + ], + True, + ), + ( + [ + pd.Series([1, 0.324234, 32424.323, -1233, 34242]), + pd.Series([], dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), + ], + False, + ), + ( + [ + pd.Series([1, 0.324234, 32424.323, -1233, 34242]), + pd.Series([], name="abc", dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), + ], + False, + ), + ( + [ + pd.Series([1, 0.324234, 32424.323, -1233, 34242]), + pd.Series([1, -100, 200, -399, 400], name="abc"), + pd.Series([111, 222, 333], index=[10, 11, 
12]), + ], + False, + ), + ], +) +@pytest.mark.parametrize( + "columns", + [ + None, + ["0"], + [0], + ["abc"], + [144, 13], + [2, 1, 0], + pd.Index(["abc"], name="custom_name"), + ], +) +def test_dataframe_init_from_series_list(data, ignore_dtype, columns): + gd_data = [cudf.from_pandas(obj) for obj in data] + + expected = pd.DataFrame(data, columns=columns) + actual = cudf.DataFrame(gd_data, columns=columns) + + if ignore_dtype: + # When a union is performed to generate columns, + # the order is never guaranteed. Hence sort by + # columns before comparison. + if not expected.columns.equals(actual.columns): + expected = expected.sort_index(axis=1) + actual = actual.sort_index(axis=1) + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_index_type=True, + ) + else: + assert_eq( + expected, + actual, + check_index_type=True, + check_column_type=False, + ) + + +@pytest.mark.parametrize( + "data, ignore_dtype, index", + [ + ([pd.Series([1, 2, 3])], False, ["a", "b", "c"]), + ([pd.Series(index=[1, 2, 3], dtype="float64")], False, ["a", "b"]), + ( + [pd.Series(name="empty series name", dtype="float64")], + False, + ["index1"], + ), + ( + [pd.Series([1]), pd.Series([], dtype="float64"), pd.Series([3])], + False, + ["0", "2", "1"], + ), + ( + [ + pd.Series([1, 0.324234, 32424.323, -1233, 34242]), + pd.Series([], dtype="float64"), + pd.Series([3], name="series that is named"), + ], + False, + ["_", "+", "*"], + ), + ([pd.Series([1, 2, 3], name="hi")] * 10, False, ["mean"] * 10), + ( + [pd.Series([1, 2, 3], name=None, index=[10, 11, 12])] * 10, + False, + ["abc"] * 10, + ), + ( + [ + pd.Series([1, 2, 3], name=None, index=[10, 11, 12]), + pd.Series([1, 2, 30], name=None, index=[13, 144, 15]), + ], + True, + ["set_index_a", "set_index_b"], + ), + ( + [ + pd.Series([1, 0.324234, 32424.323, -1233, 34242]), + pd.Series([], dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), + ], + False, + ["a", "b", "c"], + ), + ( + [ + pd.Series([1, 0.324234, 32424.323, -1233, 34242]), + pd.Series([], name="abc", dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), + ], + False, + ["a", "v", "z"], + ), + ( + [ + pd.Series([1, 0.324234, 32424.323, -1233, 34242]), + pd.Series([1, -100, 200, -399, 400], name="abc"), + pd.Series([111, 222, 333], index=[10, 11, 12]), + ], + False, + ["a", "v", "z"], + ), + ], +) +@pytest.mark.parametrize( + "columns", [None, ["0"], [0], ["abc"], [144, 13], [2, 1, 0]] +) +def test_dataframe_init_from_series_list_with_index( + data, + ignore_dtype, + index, + columns, +): + gd_data = [cudf.from_pandas(obj) for obj in data] + + expected = pd.DataFrame(data, columns=columns, index=index) + actual = cudf.DataFrame(gd_data, columns=columns, index=index) + + if ignore_dtype: + # When a union is performed to generate columns, + # the order is never guaranteed. Hence sort by + # columns before comparison. 
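+        # (e.g. the Series indexes [10, 11, 12] and [13, 144, 15] are
+        # unioned into the column labels in no particular order).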
+ if not expected.columns.equals(actual.columns): + expected = expected.sort_index(axis=1) + actual = actual.sort_index(axis=1) + assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) + else: + assert_eq(expected, actual, check_column_type=False) + + +@pytest.mark.parametrize( + "data, index", + [ + ([pd.Series([1, 2]), pd.Series([1, 2])], ["a", "b", "c"]), + ( + [ + pd.Series([1, 0.324234, 32424.323, -1233, 34242]), + pd.Series([], dtype="float64"), + pd.Series([3], name="series that is named"), + ], + ["_", "+"], + ), + ([pd.Series([1, 2, 3], name="hi")] * 10, ["mean"] * 9), + ], +) +def test_dataframe_init_from_series_list_with_index_error(data, index): + gd_data = [cudf.from_pandas(obj) for obj in data] + + assert_exceptions_equal( + pd.DataFrame, + cudf.DataFrame, + ([data], {"index": index}), + ([gd_data], {"index": index}), + ) + + +@pytest.mark.parametrize( + "data", + [ + [pd.Series([1, 2, 3], index=["a", "a", "a"])], + [pd.Series([1, 2, 3], index=["a", "a", "a"])] * 4, + [ + pd.Series([1, 2, 3], index=["a", "b", "a"]), + pd.Series([1, 2, 3], index=["b", "b", "a"]), + ], + [ + pd.Series([1, 2, 3], index=["a", "b", "z"]), + pd.Series([1, 2, 3], index=["u", "b", "a"]), + pd.Series([1, 2, 3], index=["u", "b", "u"]), + ], + ], +) +def test_dataframe_init_from_series_list_duplicate_index_error(data): + gd_data = [cudf.from_pandas(obj) for obj in data] + + assert_exceptions_equal( + lfunc=pd.DataFrame, + rfunc=cudf.DataFrame, + lfunc_args_and_kwargs=([], {"data": data}), + rfunc_args_and_kwargs=([], {"data": gd_data}), + check_exception_type=False, + ) + + def test_from_pandas(): pdf = pd.DataFrame( { diff --git a/python/cudf/cudf/tests/dataframe/test_reductions.py b/python/cudf/cudf/tests/dataframe/test_reductions.py deleted file mode 100644 index 167ba2bd427..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_reductions.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. 
- - -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -def test_single_q(): - q = 0.5 - - pdf = pd.DataFrame({"a": [4, 24, 13, 8, 7]}) - gdf = cudf.from_pandas(pdf) - - pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantile(q, interpolation="nearest", method="table") - - assert_eq(pdf_q, gdf_q, check_index_type=False) - - -def test_with_index(): - q = [0, 0.5, 1] - - pdf = pd.DataFrame({"a": [7, 4, 4, 9, 13]}, index=[0, 4, 3, 2, 7]) - gdf = cudf.from_pandas(pdf) - - pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantile(q, interpolation="nearest", method="table") - - assert_eq(pdf_q, gdf_q, check_index_type=False) - - -def test_with_multiindex(): - q = [0, 0.5, 1] - - pdf = pd.DataFrame( - { - "index_1": [3, 1, 9, 7, 5], - "index_2": [2, 4, 3, 5, 1], - "a": [8, 4, 2, 3, 8], - } - ) - pdf.set_index(["index_1", "index_2"], inplace=True) - - gdf = cudf.from_pandas(pdf) - - pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantile(q, interpolation="nearest", method="table") - - assert_eq(pdf_q, gdf_q, check_index_type=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": [10, 11, 12]}, - {"a": [1, 0, 3], "b": [10, 11, 12]}, - {"a": [1, 2, 3], "b": [10, 11, None]}, - { - "a": [], - }, - {}, - ], -) -@pytest.mark.parametrize("op", ["all", "any"]) -def test_any_all_axis_none(data, op): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expected = getattr(pdf, op)(axis=None) - actual = getattr(gdf, op)(axis=None) - - assert expected == actual diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_memory_usage.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_memory_usage.py new file mode 100644 index 00000000000..5c193c4a290 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_memory_usage.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd + +import cudf + + +def test_memory_usage_multi(): + # We need to sample without replacement to guarantee that the size of the + # levels are always the same. + rng = np.random.default_rng(seed=0) + rows = 10 + df = pd.DataFrame( + { + "A": np.arange(rows, dtype="int32"), + "B": rng.choice( + np.arange(rows, dtype="int64"), rows, replace=False + ), + "C": rng.choice( + np.arange(rows, dtype="float64"), rows, replace=False + ), + } + ).set_index(["B", "C"]) + gdf = cudf.from_pandas(df) + # Assume MultiIndex memory footprint is just that + # of the underlying columns, levels, and codes + expect = rows * 16 # Source Columns + expect += rows * 16 # Codes + expect += rows * 8 # Level 0 + expect += rows * 8 # Level 1 + + assert expect == gdf.index.memory_usage(deep=True) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_nunique.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_nunique.py new file mode 100644 index 00000000000..1683051c9ad --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_nunique.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd + +import cudf + + +def test_nunique(): + gidx = cudf.RangeIndex(5) + pidx = pd.RangeIndex(5) + + actual = gidx.nunique() + expected = pidx.nunique() + + assert actual == expected diff --git a/python/cudf/cudf/tests/reshape/test_concat.py b/python/cudf/cudf/tests/reshape/test_concat.py index e533bfac9df..7cdd40f68bc 100644 --- a/python/cudf/cudf/tests/reshape/test_concat.py +++ b/python/cudf/cudf/tests/reshape/test_concat.py @@ -2217,3 +2217,647 @@ def test_series_concat_existing_buffers(): np.testing.assert_equal( gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) ) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame(), + pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")), + pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[10, 20]), + pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[7, 8]), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + } + ), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[7, 20, 11, 9], + ), + pd.DataFrame({"l": [10]}), + pd.DataFrame({"l": [10]}, index=[100]), + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame( + {"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}, + index=[100, 200, 300, 400, 500, 0], + ), + pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), + ], +) +@pytest.mark.parametrize( + "other", + [ + [[1, 2], [10, 100]], + [[1, 2, 10, 100, 0.1, 0.2, 0.0021]], + [[]], + [[], [], [], []], + [[0.23, 0.00023, -10.00, 100, 200, 1000232, 1232.32323]], + ], +) +def test_dataframe_concat_lists(df, other, sort, ignore_index): + pdf = df + other_pd = [pd.DataFrame(o) for o in other] + + gdf = cudf.from_pandas(df) + other_gd = [cudf.from_pandas(o) for o in other_pd] + + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf, *other_pd], sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf, *other_gd], sort=sort, ignore_index=ignore_index + ) + + if expected.shape != df.shape: + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=not gdf.empty, + ) + else: + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=len(gdf.columns) != 0, + ) + + +def test_dataframe_concat_series_without_name(): + df = cudf.DataFrame({"a": [1, 2, 3]}) + pdf = df.to_pandas() + gs = cudf.Series([1, 2, 3]) + ps = gs.to_pandas() + + assert_eq(pd.concat([pdf, ps]), cudf.concat([df, gs])) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame(), + pd.DataFrame(index=[10, 20, 30]), + pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), + pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")), + pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[10, 20]), + pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[7, 8]), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + } + ), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[7, 20, 11, 9], + ), + pd.DataFrame({"l": [10]}), + pd.DataFrame({"l": [10]}, index=[100]), + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame( + {"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}, + index=[100, 200, 300, 400, 500, 0], + ), + ], +) +@pytest.mark.parametrize( + "other", + [ + [pd.DataFrame([[5, 6], [7, 8]], columns=list("AB"))], + [ 
+ pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), + pd.DataFrame([[5, 6], [7, 8]], columns=list("BD")), + pd.DataFrame([[5, 6], [7, 8]], columns=list("DE")), + ], + [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], + [ + pd.DataFrame( + {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), + ], + [ + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame({"l": [10]}), + pd.DataFrame({"l": [10]}, index=[200]), + ], + [pd.DataFrame([]), pd.DataFrame([], index=[100])], + [ + pd.DataFrame([]), + pd.DataFrame([], index=[100]), + pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), + ], + [ + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + } + ), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[0, 100, 200, 300], + ), + ], + [ + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[0, 100, 200, 300], + ), + ], + [ + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[0, 100, 200, 300], + ), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[0, 100, 200, 300], + ), + ], + [ + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[0, 100, 200, 300], + ), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[0, 100, 200, 300], + ), + pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), + ], + ], +) +def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): + pdf = df + other_pd = other + + gdf = cudf.from_pandas(df) + other_gd = [cudf.from_pandas(o) for o in other] + + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf, *other_pd], sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf, *other_gd], sort=sort, ignore_index=ignore_index + ) + + # In some cases, Pandas creates an empty Index([], dtype="object") for + # columns whereas cudf creates a RangeIndex(0, 0). 
+ check_column_type = ( + False if len(expected.columns) == len(df.columns) == 0 else True + ) + + if expected.shape != df.shape: + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=check_column_type, + ) + else: + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=check_column_type, + ) + + +def test_dataframe_concat_series_mixed_index(): + df = cudf.DataFrame({"first": [], "d": []}) + pdf = df.to_pandas() + + sr = cudf.Series([1, 2, 3, 4]) + psr = sr.to_pandas() + + assert_eq( + cudf.concat([df, sr], ignore_index=True), + pd.concat([pdf, psr], ignore_index=True), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame(), + pd.DataFrame(index=[10, 20, 30]), + pd.DataFrame({12: [], 22: []}), + pd.DataFrame([[1, 2], [3, 4]], columns=[10, 20]), + pd.DataFrame([[1, 2], [3, 4]], columns=[0, 1], index=[10, 20]), + pd.DataFrame([[1, 2], [3, 4]], columns=[1, 0], index=[7, 8]), + pd.DataFrame( + { + 23: [315.3324, 3243.32432, 3232.332, -100.32], + 33: [0.3223, 0.32, 0.0000232, 0.32224], + } + ), + pd.DataFrame( + { + 0: [315.3324, 3243.32432, 3232.332, -100.32], + 1: [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[7, 20, 11, 9], + ), + ], +) +@pytest.mark.parametrize( + "other", + [ + pd.Series([10, 11, 23, 234, 13]), + pd.Series([10, 11, 23, 234, 13], index=[11, 12, 13, 44, 33]), + {1: 1}, + {0: 10, 1: 100, 2: 102}, + ], +) +def test_dataframe_concat_series(df, other, sort): + pdf = df + gdf = cudf.from_pandas(df) + + if isinstance(other, dict): + other_pd = pd.Series(other) + else: + other_pd = other + other_gd = cudf.from_pandas(other_pd) + + expected = pd.concat([pdf, other_pd], ignore_index=True, sort=sort) + actual = cudf.concat([gdf, other_gd], ignore_index=True, sort=sort) + + if expected.shape != df.shape: + # Ignore the column type comparison because pandas incorrectly + # returns pd.Index([1, 2, 3], dtype="object") instead + # of pd.Index([1, 2, 3], dtype="int64") + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=False, + check_index_type=True, + ) + else: + assert_eq(expected, actual, check_index_type=not gdf.empty) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame(), + pd.DataFrame(index=[10, 20, 30]), + pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), + pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")), + pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[10, 20]), + pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[7, 8]), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + } + ), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[7, 20, 11, 9], + ), + pd.DataFrame({"l": [10]}), + pd.DataFrame({"l": [10]}, index=[100]), + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame( + {"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}, + index=[100, 200, 300, 400, 500, 0], + ), + ], +) +@pytest.mark.parametrize( + "other", + [ + pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), + pd.DataFrame([[5, 6], [7, 8]], columns=list("BD")), + pd.DataFrame([[5, 6], [7, 8]], columns=list("DE")), + pd.DataFrame(), + pd.DataFrame( + {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] + ), + pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), + pd.DataFrame({"l": [10]}), + pd.DataFrame({"l": [10]}, index=[200]), + 
pd.DataFrame([]), + pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), + pd.DataFrame([], index=[100]), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + } + ), + pd.DataFrame( + { + "a": [315.3324, 3243.32432, 3232.332, -100.32], + "z": [0.3223, 0.32, 0.0000232, 0.32224], + }, + index=[0, 100, 200, 300], + ), + ], +) +def test_dataframe_concat_dataframe(df, other, sort, ignore_index): + pdf = df + other_pd = other + + gdf = cudf.from_pandas(df) + other_gd = cudf.from_pandas(other) + + with _hide_concat_empty_dtype_warning(): + expected = pd.concat( + [pdf, other_pd], sort=sort, ignore_index=ignore_index + ) + actual = cudf.concat( + [gdf, other_gd], sort=sort, ignore_index=ignore_index + ) + + # In empty dataframe cases, Pandas & cudf differ in columns + # creation, pandas creates RangeIndex(0, 0) + # whereas cudf creates an empty Index([], dtype="object"). + check_column_type = ( + False if len(expected.columns) == len(df.columns) == 0 else True + ) + + if expected.shape != df.shape: + assert_eq( + expected.fillna(-1), + actual.fillna(-1), + check_dtype=False, + check_column_type=check_column_type, + ) + else: + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=check_column_type, + ) + + +@pytest.mark.parametrize("df_1_data", [{"a": [1, 2], "b": [1, 3]}, {}]) +@pytest.mark.parametrize("df_2_data", [{"a": [], "b": []}, {}]) +def test_concat_empty_dataframe(df_1_data, df_2_data): + df_1 = cudf.DataFrame(df_1_data) + df_2 = cudf.DataFrame(df_2_data) + with _hide_concat_empty_dtype_warning(): + got = cudf.concat([df_1, df_2]) + expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) + + # ignoring dtypes as pandas upcasts int to float + # on concatenation with empty dataframes + + assert_eq(got, expect, check_dtype=False, check_index_type=True) + + +@pytest.mark.parametrize( + "df1_d", + [ + {"a": [1, 2], "b": [1, 2], "c": ["s1", "s2"], "d": [1.0, 2.0]}, + {"b": [1.9, 10.9], "c": ["s1", "s2"]}, + {"c": ["s1"], "b": pd.Series([None], dtype="float"), "a": [False]}, + ], +) +@pytest.mark.parametrize( + "df2_d", + [ + {"a": [1, 2, 3]}, + {"a": [1, None, 3], "b": [True, True, False], "c": ["s3", None, "s4"]}, + {"a": [], "b": []}, + {}, + ], +) +def test_concat_different_column_dataframe(df1_d, df2_d): + with _hide_concat_empty_dtype_warning(): + got = cudf.concat( + [ + cudf.DataFrame(df1_d), + cudf.DataFrame(df2_d), + cudf.DataFrame(df1_d), + ], + sort=False, + ) + + pdf1 = pd.DataFrame(df1_d) + pdf2 = pd.DataFrame(df2_d) + + expect = pd.concat([pdf1, pdf2, pdf1], sort=False) + + # numerical columns are upcasted to float in cudf.DataFrame.to_pandas() + # casts nan to 0 in non-float numerical columns + + numeric_cols = got.dtypes[got.dtypes != "object"].index + for col in numeric_cols: + got[col] = got[col].astype(np.float64).fillna(np.nan) + + assert_eq(got, expect, check_dtype=False, check_index_type=True) + + +@pytest.mark.parametrize( + "ser_1", [pd.Series([1, 2, 3]), pd.Series([], dtype="float64")] +) +def test_concat_empty_series(ser_1): + ser_2 = pd.Series([], dtype="float64") + with _hide_concat_empty_dtype_warning(): + got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) + expect = pd.concat([ser_1, ser_2]) + + assert_eq(got, expect, check_index_type=True) + + +def test_dataframe_concat_different_numerical_columns( + numeric_and_temporal_types_as_str, numeric_and_temporal_types_as_str2 +): + df1 = pd.DataFrame( + dict( + 
x=pd.Series(np.arange(5)).astype(numeric_and_temporal_types_as_str) + ) + ) + df2 = pd.DataFrame( + dict( + x=pd.Series(np.arange(5)).astype( + numeric_and_temporal_types_as_str2 + ) + ) + ) + if ( + numeric_and_temporal_types_as_str != numeric_and_temporal_types_as_str2 + and "datetime" in numeric_and_temporal_types_as_str + or "datetime" in numeric_and_temporal_types_as_str2 + ): + with pytest.raises(TypeError): + cudf.concat([df1, df2]) + else: + pres = pd.concat([df1, df2]) + gres = cudf.concat([cudf.from_pandas(df1), cudf.from_pandas(df2)]) + assert_eq(pres, gres, check_dtype=False, check_index_type=True) + + +def test_dataframe_concat_different_column_types(): + df1 = cudf.Series([42], dtype=np.float64) + df2 = cudf.Series(["a"], dtype="category") + with pytest.raises(ValueError): + cudf.concat([df1, df2]) + + df2 = cudf.Series(["a string"]) + with pytest.raises(TypeError): + cudf.concat([df1, df2]) + + +@pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]]) +@pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]]) +def test_concat_index(a, b): + ser_a = pd.Series(a) + ser_b = pd.Series(b) + + gser_a = cudf.Series(a) + gser_b = cudf.Series(b) + + expected = pd.concat([ser_a, ser_b]) + actual = cudf.concat([gser_a, gser_b]) + + assert len(expected) == len(actual) + assert_eq(expected.index, actual.index) + + expected = pd.concat([ser_a, ser_b], ignore_index=True) + actual = cudf.concat([gser_a, gser_b], ignore_index=True) + + assert len(expected) == len(actual) + assert_eq(expected.index, actual.index) + + +def test_concat_with_axis(): + df1 = pd.DataFrame(dict(x=np.arange(5), y=np.arange(5))) + df2 = pd.DataFrame(dict(a=np.arange(5), b=np.arange(5))) + + concat_df = pd.concat([df1, df2], axis=1) + cdf1 = cudf.from_pandas(df1) + cdf2 = cudf.from_pandas(df2) + + # concat only dataframes + concat_cdf = cudf.concat([cdf1, cdf2], axis=1) + assert_eq(concat_cdf, concat_df, check_index_type=True) + + # concat only series + concat_s = pd.concat([df1.x, df1.y], axis=1) + cs1 = cudf.Series.from_pandas(df1.x) + cs2 = cudf.Series.from_pandas(df1.y) + concat_cdf_s = cudf.concat([cs1, cs2], axis=1) + + assert_eq(concat_cdf_s, concat_s, check_index_type=True) + + rng = np.random.default_rng(seed=0) + # concat series and dataframes + s3 = pd.Series(rng.random(5)) + cs3 = cudf.Series.from_pandas(s3) + + concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1) + concat_df_all = pd.concat([df1, s3, df2], axis=1) + assert_eq(concat_cdf_all, concat_df_all, check_index_type=True) + + # concat manual multi index + midf1 = cudf.from_pandas(df1) + midf1.index = cudf.MultiIndex( + levels=[[0, 1, 2, 3], [0, 1]], codes=[[0, 1, 2, 3, 2], [0, 1, 0, 1, 0]] + ) + midf2 = midf1[2:] + midf2.index = cudf.MultiIndex( + levels=[[3, 4, 5], [2, 0]], codes=[[0, 1, 2], [1, 0, 1]] + ) + mipdf1 = midf1.to_pandas() + mipdf2 = midf2.to_pandas() + + assert_eq( + cudf.concat([midf1, midf2]), + pd.concat([mipdf1, mipdf2]), + check_index_type=True, + ) + assert_eq( + cudf.concat([midf2, midf1]), + pd.concat([mipdf2, mipdf1]), + check_index_type=True, + ) + assert_eq( + cudf.concat([midf1, midf2, midf1]), + pd.concat([mipdf1, mipdf2, mipdf1]), + check_index_type=True, + ) + + rng = np.random.default_rng(seed=0) + # concat groupby multi index + gdf1 = cudf.DataFrame( + { + "x": rng.integers(0, 10, 10), + "y": rng.integers(0, 10, 10), + "z": rng.integers(0, 10, 10), + "v": rng.integers(0, 10, 10), + } + ) + gdf2 = gdf1[5:] + gdg1 = gdf1.groupby(["x", "y"]).min() + gdg2 = gdf2.groupby(["x", "y"]).min() + pdg1 = gdg1.to_pandas() + 
pdg2 = gdg2.to_pandas() + + assert_eq( + cudf.concat([gdg1, gdg2]), + pd.concat([pdg1, pdg2]), + check_index_type=True, + ) + assert_eq( + cudf.concat([gdg2, gdg1]), + pd.concat([pdg2, pdg1]), + check_index_type=True, + ) + + # series multi index concat + gdgz1 = gdg1.z + gdgz2 = gdg2.z + pdgz1 = gdgz1.to_pandas() + pdgz2 = gdgz2.to_pandas() + + assert_eq( + cudf.concat([gdgz1, gdgz2]), + pd.concat([pdgz1, pdgz2]), + check_index_type=True, + ) + assert_eq( + cudf.concat([gdgz2, gdgz1]), + pd.concat([pdgz2, pdgz1]), + check_index_type=True, + ) diff --git a/python/cudf/cudf/tests/series/methods/test_keys.py b/python/cudf/cudf/tests/series/methods/test_keys.py new file mode 100644 index 00000000000..6f9c78ab008 --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_keys.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "ps", + [ + pd.Series([1, 2, 3, 4, 5, 10, 11, 12, 33, 55, 19]), + pd.Series(["abc", "def", "ghi", "xyz", "pqr", "abc"]), + pd.Series( + [1, 2, 3, 4, 5, 10], + index=["abc", "def", "ghi", "xyz", "pqr", "abc"], + ), + pd.Series( + ["abc", "def", "ghi", "xyz", "pqr", "abc"], + index=[1, 2, 3, 4, 5, 10], + ), + pd.Series(index=["a", "b", "c", "d", "e", "f"], dtype="float64"), + pd.Series(index=[10, 11, 12], dtype="float64"), + pd.Series(dtype="float64"), + pd.Series([], dtype="float64"), + ], +) +def test_series_keys(ps): + gds = cudf.from_pandas(ps) + + assert_eq(ps.keys(), gds.keys()) diff --git a/python/cudf/cudf/tests/series/test_attributes.py b/python/cudf/cudf/tests/series/test_attributes.py index 65ba67dce90..677a791e1d9 100644 --- a/python/cudf/cudf/tests/series/test_attributes.py +++ b/python/cudf/cudf/tests/series/test_attributes.py @@ -206,6 +206,23 @@ def test_axes(data): assert_eq(e, a) +@pytest.mark.parametrize( + "ps", + [ + pd.Series(dtype="float64"), + pd.Series(index=[100, 10, 1, 0], dtype="float64"), + pd.Series([], dtype="float64"), + pd.Series(["a", "b", "c", "d"]), + pd.Series(["a", "b", "c", "d"], index=[0, 1, 10, 11]), + ], +) +def test_series_empty(ps): + ps = ps + gs = cudf.from_pandas(ps) + + assert_eq(ps.empty, gs.empty) + + @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index ecd68f46e46..d0ad73de09c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,14 +1,8 @@ # Copyright (c) 2018-2025, NVIDIA CORPORATION. 
import array as arr -import datetime -import decimal -import io import operator -import re import textwrap -import warnings -from contextlib import contextmanager from copy import copy import cupy @@ -19,14 +13,13 @@ from packaging import version import cudf -from cudf.api.extensions import no_default from cudf.core._compat import ( PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION, ) from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column.column import as_column -from cudf.testing import _utils as utils, assert_eq +from cudf.testing import assert_eq from cudf.testing._utils import ( ALL_TYPES, DATETIME_TYPES, @@ -54,96 +47,6 @@ pytest_xfail = pytest.mark.skipif -@contextmanager -def _hide_ufunc_warnings(eval_str): - # pandas raises warnings for some inputs to the following ufuncs: - if any( - x in eval_str - for x in { - "arctanh", - "log", - } - ): - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "invalid value encountered in", - category=RuntimeWarning, - ) - warnings.filterwarnings( - "ignore", - "divide by zero encountered in", - category=RuntimeWarning, - ) - yield - else: - yield - - -@contextmanager -def _hide_concat_empty_dtype_warning(): - with warnings.catch_warnings(): - # Ignoring warnings in this test as warnings are - # being caught and validated in other tests. - warnings.filterwarnings( - "ignore", - "The behavior of array concatenation with empty " - "entries is deprecated.", - category=FutureWarning, - ) - yield - - -@pytest.fixture( - params=[ - pd.DataFrame( - { - "a": [0, 1, 2, np.nan, 4, None, 6], - "b": [np.nan, None, "u", "h", "d", "a", "m"], - }, - index=["q", "w", "e", "r", "t", "y", "u"], - ), - pd.DataFrame({"a": [0, 1, 2, 3, 4], "b": ["a", "b", "u", "h", "d"]}), - pd.DataFrame( - { - "a": [None, None, np.nan, None], - "b": [np.nan, None, np.nan, None], - } - ), - pd.DataFrame({"a": []}), - pd.DataFrame({"a": [np.nan], "b": [None]}), - pd.DataFrame({"a": ["a", "b", "c", None, "e"]}), - pd.DataFrame({"a": ["a", "b", "c", "d", "e"]}), - ] -) -def na_data(request): - return request.param - - -@pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]]) -@pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]]) -def test_concat_index(a, b): - df = pd.DataFrame() - df["a"] = a - df["b"] = b - - gdf = cudf.DataFrame() - gdf["a"] = a - gdf["b"] = b - - expected = pd.concat([df.a, df.b]) - actual = cudf.concat([gdf.a, gdf.b]) - - assert len(expected) == len(actual) - assert_eq(expected.index, actual.index) - - expected = pd.concat([df.a, df.b], ignore_index=True) - actual = cudf.concat([gdf.a, gdf.b], ignore_index=True) - - assert len(expected) == len(actual) - assert_eq(expected.index, actual.index) - - def test_dataframe_basic(): rng = np.random.default_rng(seed=0) df = cudf.DataFrame() @@ -571,203 +474,6 @@ def test_empty_dataframe_setitem_df(): assert_eq(gdf1, gdf2) -@pytest.mark.parametrize("dtype1", utils.supported_numpy_dtypes) -@pytest.mark.parametrize("dtype2", utils.supported_numpy_dtypes) -def test_dataframe_concat_different_numerical_columns(dtype1, dtype2): - df1 = pd.DataFrame(dict(x=pd.Series(np.arange(5)).astype(dtype1))) - df2 = pd.DataFrame(dict(x=pd.Series(np.arange(5)).astype(dtype2))) - if dtype1 != dtype2 and "datetime" in dtype1 or "datetime" in dtype2: - with pytest.raises(TypeError): - cudf.concat([df1, df2]) - else: - pres = pd.concat([df1, df2]) - gres = cudf.concat([cudf.from_pandas(df1), cudf.from_pandas(df2)]) - assert_eq(pres, gres, check_dtype=False, check_index_type=True) - - -def 
test_dataframe_concat_different_column_types(): - df1 = cudf.Series([42], dtype=np.float64) - df2 = cudf.Series(["a"], dtype="category") - with pytest.raises(ValueError): - cudf.concat([df1, df2]) - - df2 = cudf.Series(["a string"]) - with pytest.raises(TypeError): - cudf.concat([df1, df2]) - - -@pytest.mark.parametrize("df_1_data", [{"a": [1, 2], "b": [1, 3]}, {}]) -@pytest.mark.parametrize("df_2_data", [{"a": [], "b": []}, {}]) -def test_concat_empty_dataframe(df_1_data, df_2_data): - df_1 = cudf.DataFrame(df_1_data) - df_2 = cudf.DataFrame(df_2_data) - with _hide_concat_empty_dtype_warning(): - got = cudf.concat([df_1, df_2]) - expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) - - # ignoring dtypes as pandas upcasts int to float - # on concatenation with empty dataframes - - assert_eq(got, expect, check_dtype=False, check_index_type=True) - - -@pytest.mark.parametrize( - "df1_d", - [ - {"a": [1, 2], "b": [1, 2], "c": ["s1", "s2"], "d": [1.0, 2.0]}, - {"b": [1.9, 10.9], "c": ["s1", "s2"]}, - {"c": ["s1"], "b": pd.Series([None], dtype="float"), "a": [False]}, - ], -) -@pytest.mark.parametrize( - "df2_d", - [ - {"a": [1, 2, 3]}, - {"a": [1, None, 3], "b": [True, True, False], "c": ["s3", None, "s4"]}, - {"a": [], "b": []}, - {}, - ], -) -def test_concat_different_column_dataframe(df1_d, df2_d): - with _hide_concat_empty_dtype_warning(): - got = cudf.concat( - [ - cudf.DataFrame(df1_d), - cudf.DataFrame(df2_d), - cudf.DataFrame(df1_d), - ], - sort=False, - ) - - pdf1 = pd.DataFrame(df1_d) - pdf2 = pd.DataFrame(df2_d) - - expect = pd.concat([pdf1, pdf2, pdf1], sort=False) - - # numerical columns are upcasted to float in cudf.DataFrame.to_pandas() - # casts nan to 0 in non-float numerical columns - - numeric_cols = got.dtypes[got.dtypes != "object"].index - for col in numeric_cols: - got[col] = got[col].astype(np.float64).fillna(np.nan) - - assert_eq(got, expect, check_dtype=False, check_index_type=True) - - -@pytest.mark.parametrize( - "ser_1", [pd.Series([1, 2, 3]), pd.Series([], dtype="float64")] -) -def test_concat_empty_series(ser_1): - ser_2 = pd.Series([], dtype="float64") - with _hide_concat_empty_dtype_warning(): - got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) - expect = pd.concat([ser_1, ser_2]) - - assert_eq(got, expect, check_index_type=True) - - -def test_concat_with_axis(): - df1 = pd.DataFrame(dict(x=np.arange(5), y=np.arange(5))) - df2 = pd.DataFrame(dict(a=np.arange(5), b=np.arange(5))) - - concat_df = pd.concat([df1, df2], axis=1) - cdf1 = cudf.from_pandas(df1) - cdf2 = cudf.from_pandas(df2) - - # concat only dataframes - concat_cdf = cudf.concat([cdf1, cdf2], axis=1) - assert_eq(concat_cdf, concat_df, check_index_type=True) - - # concat only series - concat_s = pd.concat([df1.x, df1.y], axis=1) - cs1 = cudf.Series.from_pandas(df1.x) - cs2 = cudf.Series.from_pandas(df1.y) - concat_cdf_s = cudf.concat([cs1, cs2], axis=1) - - assert_eq(concat_cdf_s, concat_s, check_index_type=True) - - rng = np.random.default_rng(seed=0) - # concat series and dataframes - s3 = pd.Series(rng.random(5)) - cs3 = cudf.Series.from_pandas(s3) - - concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1) - concat_df_all = pd.concat([df1, s3, df2], axis=1) - assert_eq(concat_cdf_all, concat_df_all, check_index_type=True) - - # concat manual multi index - midf1 = cudf.from_pandas(df1) - midf1.index = cudf.MultiIndex( - levels=[[0, 1, 2, 3], [0, 1]], codes=[[0, 1, 2, 3, 2], [0, 1, 0, 1, 0]] - ) - midf2 = midf1[2:] - midf2.index = cudf.MultiIndex( - levels=[[3, 4, 5], [2, 
0]], codes=[[0, 1, 2], [1, 0, 1]] - ) - mipdf1 = midf1.to_pandas() - mipdf2 = midf2.to_pandas() - - assert_eq( - cudf.concat([midf1, midf2]), - pd.concat([mipdf1, mipdf2]), - check_index_type=True, - ) - assert_eq( - cudf.concat([midf2, midf1]), - pd.concat([mipdf2, mipdf1]), - check_index_type=True, - ) - assert_eq( - cudf.concat([midf1, midf2, midf1]), - pd.concat([mipdf1, mipdf2, mipdf1]), - check_index_type=True, - ) - - rng = np.random.default_rng(seed=0) - # concat groupby multi index - gdf1 = cudf.DataFrame( - { - "x": rng.integers(0, 10, 10), - "y": rng.integers(0, 10, 10), - "z": rng.integers(0, 10, 10), - "v": rng.integers(0, 10, 10), - } - ) - gdf2 = gdf1[5:] - gdg1 = gdf1.groupby(["x", "y"]).min() - gdg2 = gdf2.groupby(["x", "y"]).min() - pdg1 = gdg1.to_pandas() - pdg2 = gdg2.to_pandas() - - assert_eq( - cudf.concat([gdg1, gdg2]), - pd.concat([pdg1, pdg2]), - check_index_type=True, - ) - assert_eq( - cudf.concat([gdg2, gdg1]), - pd.concat([pdg2, pdg1]), - check_index_type=True, - ) - - # series multi index concat - gdgz1 = gdg1.z - gdgz2 = gdg2.z - pdgz1 = gdgz1.to_pandas() - pdgz2 = gdgz2.to_pandas() - - assert_eq( - cudf.concat([gdgz1, gdgz2]), - pd.concat([pdgz1, pdgz2]), - check_index_type=True, - ) - assert_eq( - cudf.concat([gdgz2, gdgz1]), - pd.concat([pdgz2, pdgz1]), - check_index_type=True, - ) - - @pytest.mark.parametrize("nrows", [0, 3]) def test_nonmatching_index_setitem(nrows): rng = np.random.default_rng(seed=0) @@ -2636,147 +2342,6 @@ def test_df_sr_binop_col_order(op): assert_eq(expect, got) -@pytest.mark.parametrize("set_index", [None, "A", "C", "D"]) -@pytest.mark.parametrize("index", [True, False]) -@pytest.mark.parametrize("deep", [True, False]) -def test_memory_usage(deep, index, set_index): - # Testing numerical/datetime by comparing with pandas - # (string and categorical columns will be different) - rows = 100 - df = pd.DataFrame( - { - "A": np.arange(rows, dtype="int64"), - "B": np.arange(rows, dtype="int32"), - "C": np.arange(rows, dtype="float64"), - } - ) - df["D"] = pd.to_datetime(df.A) - if set_index: - df = df.set_index(set_index) - - gdf = cudf.from_pandas(df) - - if index and set_index is None: - # Special Case: Assume RangeIndex size == 0 - with expect_warning_if(deep, UserWarning): - assert gdf.index.memory_usage(deep=deep) == 0 - - else: - # Check for Series only - assert df["B"].memory_usage(index=index, deep=deep) == gdf[ - "B" - ].memory_usage(index=index, deep=deep) - - # Check for entire DataFrame - assert_eq( - df.memory_usage(index=index, deep=deep).sort_index(), - gdf.memory_usage(index=index, deep=deep).sort_index(), - ) - - -@pytest_xfail -def test_memory_usage_string(): - rows = 100 - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "A": np.arange(rows, dtype="int32"), - "B": rng.choice(["apple", "banana", "orange"], rows), - } - ) - gdf = cudf.from_pandas(df) - - # Check deep=False (should match pandas) - assert gdf.B.memory_usage(deep=False, index=False) == df.B.memory_usage( - deep=False, index=False - ) - - # Check string column - assert gdf.B.memory_usage(deep=True, index=False) == df.B.memory_usage( - deep=True, index=False - ) - - # Check string index - assert gdf.set_index("B").index.memory_usage( - deep=True - ) == df.B.memory_usage(deep=True, index=False) - - -def test_memory_usage_cat(): - rows = 100 - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "A": np.arange(rows, dtype="int32"), - "B": rng.choice(["apple", "banana", "orange"], rows), - } - ) - df["B"] = df.B.astype("category") - gdf = 
cudf.from_pandas(df) - - expected = ( - gdf.B._column.categories.memory_usage - + gdf.B._column.codes.memory_usage - ) - - # Check cat column - assert gdf.B.memory_usage(deep=True, index=False) == expected - - # Check cat index - assert gdf.set_index("B").index.memory_usage(deep=True) == expected - - -def test_memory_usage_list(): - df = cudf.DataFrame({"A": [[0, 1, 2, 3], [4, 5, 6], [7, 8], [9]]}) - expected = ( - df.A._column.offsets.memory_usage + df.A._column.elements.memory_usage - ) - assert expected == df.A.memory_usage() - - -def test_memory_usage_multi(): - # We need to sample without replacement to guarantee that the size of the - # levels are always the same. - rng = np.random.default_rng(seed=0) - rows = 10 - df = pd.DataFrame( - { - "A": np.arange(rows, dtype="int32"), - "B": rng.choice( - np.arange(rows, dtype="int64"), rows, replace=False - ), - "C": rng.choice( - np.arange(rows, dtype="float64"), rows, replace=False - ), - } - ).set_index(["B", "C"]) - gdf = cudf.from_pandas(df) - # Assume MultiIndex memory footprint is just that - # of the underlying columns, levels, and codes - expect = rows * 16 # Source Columns - expect += rows * 16 # Codes - expect += rows * 8 # Level 0 - expect += rows * 8 # Level 1 - - assert expect == gdf.index.memory_usage(deep=True) - - -@pytest.mark.parametrize("index", [False, True]) -def test_memory_usage_index_preserve_types(index): - data = [[1, 2, 3]] - columns = pd.Index(np.array([1, 2, 3], dtype=np.int8), name="a") - result = ( - cudf.DataFrame(data, columns=columns).memory_usage(index=index).index - ) - expected = ( - pd.DataFrame(data, columns=columns).memory_usage(index=index).index - ) - if index: - # pandas returns an Index[object] with int and string elements - expected = expected.astype(str) - assert_eq(result, expected) - - @pytest.mark.parametrize( "list_input", [ @@ -2978,3090 +2543,3 @@ def test_dataframe_assign_scalar_with_scalar_cols(col_data, assign_val): gdf["b"] = assign_val assert_eq(pdf, gdf) - - -def test_dataframe_info_basic(): - buffer = io.StringIO() - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - Index: 10 entries, a to 1111 - Data columns (total 10 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 0 10 non-null float64 - 1 1 10 non-null float64 - 2 2 10 non-null float64 - 3 3 10 non-null float64 - 4 4 10 non-null float64 - 5 5 10 non-null float64 - 6 6 10 non-null float64 - 7 7 10 non-null float64 - 8 8 10 non-null float64 - 9 9 10 non-null float64 - dtypes: float64(10) - memory usage: 859.0+ bytes - """ - ) - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - rng.standard_normal(size=(10, 10)), - index=["a", "2", "3", "4", "5", "6", "7", "8", "100", "1111"], - ) - cudf.from_pandas(df).info(buf=buffer, verbose=True) - s = buffer.getvalue() - assert str_cmp == s - - -def test_dataframe_info_verbose_mem_usage(): - buffer = io.StringIO() - df = pd.DataFrame({"a": [1, 2, 3], "b": ["safdas", "assa", "asdasd"]}) - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 3 entries, 0 to 2 - Data columns (total 2 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 a 3 non-null int64 - 1 b 3 non-null object - dtypes: int64(1), object(1) - memory usage: 56.0+ bytes - """ - ) - cudf.from_pandas(df).info(buf=buffer, verbose=True) - s = buffer.getvalue() - assert str_cmp == s - - buffer.truncate(0) - buffer.seek(0) - - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 3 entries, 0 to 2 - Columns: 2 entries, a to b - dtypes: int64(1), object(1) - memory usage: 56.0+ bytes - """ -
) - cudf.from_pandas(df).info(buf=buffer, verbose=False) - s = buffer.getvalue() - assert str_cmp == s - - buffer.truncate(0) - buffer.seek(0) - - df = pd.DataFrame( - {"a": [1, 2, 3], "b": ["safdas", "assa", "asdasd"]}, - index=["sdfdsf", "sdfsdfds", "dsfdf"], - ) - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - Index: 3 entries, sdfdsf to dsfdf - Data columns (total 2 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 a 3 non-null int64 - 1 b 3 non-null object - dtypes: int64(1), object(1) - memory usage: 91.0 bytes - """ - ) - cudf.from_pandas(df).info(buf=buffer, verbose=True, memory_usage="deep") - s = buffer.getvalue() - assert str_cmp == s - - buffer.truncate(0) - buffer.seek(0) - - int_values = [1, 2, 3, 4, 5] - text_values = ["alpha", "beta", "gamma", "delta", "epsilon"] - float_values = [0.0, 0.25, 0.5, 0.75, 1.0] - - df = cudf.DataFrame( - { - "int_col": int_values, - "text_col": text_values, - "float_col": float_values, - } - ) - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 5 entries, 0 to 4 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 - dtypes: float64(1), int64(1), object(1) - memory usage: 130.0 bytes - """ - ) - df.info(buf=buffer, verbose=True, memory_usage="deep") - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - -def test_dataframe_info_null_counts(): - int_values = [1, 2, 3, 4, 5] - text_values = ["alpha", "beta", "gamma", "delta", "epsilon"] - float_values = [0.0, 0.25, 0.5, 0.75, 1.0] - - df = cudf.DataFrame( - { - "int_col": int_values, - "text_col": text_values, - "float_col": float_values, - } - ) - buffer = io.StringIO() - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 5 entries, 0 to 4 - Data columns (total 3 columns): - # Column Dtype - --- ------ ----- - 0 int_col int64 - 1 text_col object - 2 float_col float64 - dtypes: float64(1), int64(1), object(1) - memory usage: 130.0+ bytes - """ - ) - df.info(buf=buffer, verbose=True, null_counts=False) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - df.info(buf=buffer, verbose=True, max_cols=0) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - df = cudf.DataFrame() - - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 0 entries - Empty DataFrame""" - ) - df.info(buf=buffer, verbose=True) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - df = cudf.DataFrame( - { - "a": [1, 2, 3, None, 10, 11, 12, None], - "b": ["a", "b", "c", "sd", "sdf", "sd", None, None], - } - ) - - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 8 entries, 0 to 7 - Data columns (total 2 columns): - # Column Dtype - --- ------ ----- - 0 a int64 - 1 b object - dtypes: int64(1), object(1) - memory usage: 238.0+ bytes - """ - ) - pd.options.display.max_info_rows = 2 - df.info(buf=buffer, max_cols=2, null_counts=None) - pd.reset_option("display.max_info_rows") - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 8 entries, 0 to 7 - Data columns (total 2 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 a 6 non-null int64 - 1 b 6 non-null object - dtypes: int64(1), object(1) - memory
usage: 238.0+ bytes - """ - ) - - df.info(buf=buffer, max_cols=2, null_counts=None) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - df.info(buf=buffer, null_counts=True) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame({"a": [1, 2, 3, 4, 5, 10, 11, 12, 33, 55, 19]}), - pd.DataFrame( - { - "one": [1, 2, 3, 4, 5, 10], - "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], - } - ), - pd.DataFrame( - { - "one": [1, 2, 3, 4, 5, 10], - "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], - }, - index=[10, 20, 30, 40, 50, 60], - ), - pd.DataFrame( - { - "one": [1, 2, 3, 4, 5, 10], - "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], - }, - index=["a", "b", "c", "d", "e", "f"], - ), - pd.DataFrame(index=["a", "b", "c", "d", "e", "f"]), - pd.DataFrame(columns=["a", "b", "c", "d", "e", "f"]), - pd.DataFrame(index=[10, 11, 12]), - pd.DataFrame(columns=[10, 11, 12]), - pd.DataFrame(), - pd.DataFrame({"one": [], "two": []}), - pd.DataFrame({2: [], 1: []}), - pd.DataFrame( - { - 0: [1, 2, 3, 4, 5, 10], - 1: ["abc", "def", "ghi", "xyz", "pqr", "abc"], - 100: ["a", "b", "b", "x", "z", "a"], - }, - index=[10, 20, 30, 40, 50, 60], - ), - ], -) -def test_dataframe_keys(df): - gdf = cudf.from_pandas(df) - - assert_eq( - df.keys(), - gdf.keys(), - ) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series([1, 2, 3, 4, 5, 10, 11, 12, 33, 55, 19]), - pd.Series(["abc", "def", "ghi", "xyz", "pqr", "abc"]), - pd.Series( - [1, 2, 3, 4, 5, 10], - index=["abc", "def", "ghi", "xyz", "pqr", "abc"], - ), - pd.Series( - ["abc", "def", "ghi", "xyz", "pqr", "abc"], - index=[1, 2, 3, 4, 5, 10], - ), - pd.Series(index=["a", "b", "c", "d", "e", "f"], dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - pd.Series(dtype="float64"), - pd.Series([], dtype="float64"), - ], -) -def test_series_keys(ps): - gds = cudf.from_pandas(ps) - - assert_eq(ps.keys(), gds.keys()) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[7, 8]), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[7, 20, 11, 9], - ), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[100]), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame( - {"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}, - index=[100, 200, 300, 400, 500, 0], - ), - ], -) -@pytest.mark.parametrize( - "other", - [ - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame([[5, 6], [7, 8]], columns=list("BD")), - pd.DataFrame([[5, 6], [7, 8]], columns=list("DE")), - pd.DataFrame(), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame([]), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - pd.DataFrame([], index=[100]), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - 
"z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - ], -) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_concat_dataframe(df, other, sort, ignore_index): - pdf = df - other_pd = other - - gdf = cudf.from_pandas(df) - other_gd = cudf.from_pandas(other) - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf, other_pd], sort=sort, ignore_index=ignore_index - ) - actual = cudf.concat( - [gdf, other_gd], sort=sort, ignore_index=ignore_index - ) - - # In empty dataframe cases, Pandas & cudf differ in columns - # creation, pandas creates RangeIndex(0, 0) - # whereas cudf creates an empty Index([], dtype="object"). - check_column_type = ( - False if len(expected.columns) == len(df.columns) == 0 else True - ) - - if expected.shape != df.shape: - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_column_type=check_column_type, - ) - else: - assert_eq( - expected, - actual, - check_index_type=not gdf.empty, - check_column_type=check_column_type, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame({12: [], 22: []}), - pd.DataFrame([[1, 2], [3, 4]], columns=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=[0, 1], index=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=[1, 0], index=[7, 8]), - pd.DataFrame( - { - 23: [315.3324, 3243.32432, 3232.332, -100.32], - 33: [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - 0: [315.3324, 3243.32432, 3232.332, -100.32], - 1: [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[7, 20, 11, 9], - ), - ], -) -@pytest.mark.parametrize( - "other", - [ - pd.Series([10, 11, 23, 234, 13]), - pd.Series([10, 11, 23, 234, 13], index=[11, 12, 13, 44, 33]), - {1: 1}, - {0: 10, 1: 100, 2: 102}, - ], -) -@pytest.mark.parametrize("sort", [False, True]) -def test_dataframe_concat_series(df, other, sort): - pdf = df - gdf = cudf.from_pandas(df) - - if isinstance(other, dict): - other_pd = pd.Series(other) - else: - other_pd = other - other_gd = cudf.from_pandas(other_pd) - - expected = pd.concat([pdf, other_pd], ignore_index=True, sort=sort) - actual = cudf.concat([gdf, other_gd], ignore_index=True, sort=sort) - - if expected.shape != df.shape: - # Ignore the column type comparison because pandas incorrectly - # returns pd.Index([1, 2, 3], dtype="object") instead - # of pd.Index([1, 2, 3], dtype="int64") - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_column_type=False, - check_index_type=True, - ) - else: - assert_eq(expected, actual, check_index_type=not gdf.empty) - - -def test_dataframe_concat_series_mixed_index(): - df = cudf.DataFrame({"first": [], "d": []}) - pdf = df.to_pandas() - - sr = cudf.Series([1, 2, 3, 4]) - psr = sr.to_pandas() - - assert_eq( - cudf.concat([df, sr], ignore_index=True), - pd.concat([pdf, psr], ignore_index=True), - check_dtype=False, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[7, 8]), - pd.DataFrame( 
- { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[7, 20, 11, 9], - ), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[100]), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame( - {"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}, - index=[100, 200, 300, 400, 500, 0], - ), - ], -) -@pytest.mark.parametrize( - "other", - [ - [pd.DataFrame([[5, 6], [7, 8]], columns=list("AB"))], - [ - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame([[5, 6], [7, 8]], columns=list("BD")), - pd.DataFrame([[5, 6], [7, 8]], columns=list("DE")), - ], - [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], - [ - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - ], - [ - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - ], - [pd.DataFrame([]), pd.DataFrame([], index=[100])], - [ - pd.DataFrame([]), - pd.DataFrame([], index=[100]), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - ], - [ - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - ], - [ - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - ], - [ - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - ], - [ - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - ], - ], -) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): - pdf = df - other_pd = other - - gdf = cudf.from_pandas(df) - other_gd = [cudf.from_pandas(o) for o in other] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf, *other_pd], sort=sort, ignore_index=ignore_index - ) - actual = cudf.concat( - [gdf, *other_gd], sort=sort, ignore_index=ignore_index - ) - - # In some cases, Pandas creates an empty Index([], dtype="object") for - # columns whereas cudf creates a RangeIndex(0, 0). 
- check_column_type = ( - False if len(expected.columns) == len(df.columns) == 0 else True - ) - - if expected.shape != df.shape: - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_column_type=check_column_type, - ) - else: - assert_eq( - expected, - actual, - check_index_type=not gdf.empty, - check_column_type=check_column_type, - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame({"A": [1, 2, 3, np.nan, None, 6]}), - pd.Series([1, 2, 3, None, np.nan, 5, 6, np.nan]), - ], -) -@pytest.mark.parametrize("alias", ["bfill", "backfill"]) -def test_dataframe_bfill(df, alias): - gdf = cudf.from_pandas(df) - - with expect_warning_if(alias == "backfill"): - actual = getattr(df, alias)() - with expect_warning_if(alias == "backfill"): - expected = getattr(gdf, alias)() - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame({"A": [1, 2, 3, np.nan, None, 6]}), - pd.Series([1, 2, 3, None, np.nan, 5, 6, np.nan]), - ], -) -@pytest.mark.parametrize("alias", ["ffill", "pad"]) -def test_dataframe_ffill(df, alias): - gdf = cudf.from_pandas(df) - - with expect_warning_if(alias == "pad"): - actual = getattr(df, alias)() - with expect_warning_if(alias == "pad"): - expected = getattr(gdf, alias)() - assert_eq(expected, actual) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[7, 8]), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[7, 20, 11, 9], - ), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[100]), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame( - {"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}, - index=[100, 200, 300, 400, 500, 0], - ), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - ], -) -@pytest.mark.parametrize( - "other", - [ - [[1, 2], [10, 100]], - [[1, 2, 10, 100, 0.1, 0.2, 0.0021]], - [[]], - [[], [], [], []], - [[0.23, 0.00023, -10.00, 100, 200, 1000232, 1232.32323]], - ], -) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_concat_lists(df, other, sort, ignore_index): - pdf = df - other_pd = [pd.DataFrame(o) for o in other] - - gdf = cudf.from_pandas(df) - other_gd = [cudf.from_pandas(o) for o in other_pd] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf, *other_pd], sort=sort, ignore_index=ignore_index - ) - actual = cudf.concat( - [gdf, *other_gd], sort=sort, ignore_index=ignore_index - ) - - if expected.shape != df.shape: - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_column_type=not gdf.empty, - ) - else: - assert_eq( - expected, - actual, - check_index_type=not gdf.empty, - check_column_type=len(gdf.columns) != 0, - ) - - -def test_dataframe_concat_series_without_name(): - df = cudf.DataFrame({"a": [1, 2, 3]}) - pdf = df.to_pandas() - gs = cudf.Series([1, 2, 3]) - ps = gs.to_pandas() - - assert_eq(pd.concat([pdf, ps]), cudf.concat([df, gs])) - - -def test_cudf_arrow_array_error(): - df = cudf.DataFrame({"a": [1, 2, 3]}) - - with pytest.raises( - TypeError, - match="Implicit conversion to a 
host PyArrow object via " - "__arrow_array__ is not allowed. Consider using .to_arrow()", - ): - df.__arrow_array__() - - sr = cudf.Series([1, 2, 3]) - - with pytest.raises( - TypeError, - match="Implicit conversion to a host PyArrow object via " - "__arrow_array__ is not allowed. Consider using .to_arrow()", - ): - sr.__arrow_array__() - - sr = cudf.Series(["a", "b", "c"]) - with pytest.raises( - TypeError, - match="Implicit conversion to a host PyArrow object via " - "__arrow_array__ is not allowed. Consider using .to_arrow()", - ): - sr.__arrow_array__() - - -@pytest.mark.parametrize( - "make_weights_axis_1", - [lambda _: None, lambda s: [1] * s, lambda s: np.ones(s)], -) -def test_sample_axis_1( - sample_n_frac, random_state_tuple_axis_1, make_weights_axis_1 -): - n, frac = sample_n_frac - pd_random_state, gd_random_state, checker = random_state_tuple_axis_1 - - pdf = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "float": [0.05, 0.2, 0.3, 0.2, 0.25], - "int": [1, 3, 5, 4, 2], - }, - ) - df = cudf.DataFrame.from_pandas(pdf) - - weights = make_weights_axis_1(len(pdf.columns)) - - expected = pdf.sample( - n=n, - frac=frac, - replace=False, - random_state=pd_random_state, - weights=weights, - axis=1, - ) - got = df.sample( - n=n, - frac=frac, - replace=False, - random_state=gd_random_state, - weights=weights, - axis=1, - ) - checker(expected, got) - - -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "float": [0.05, 0.2, 0.3, 0.2, 0.25], - "int": [1, 3, 5, 4, 2], - }, - ), - pd.Series([1, 2, 3, 4, 5]), - ], -) -@pytest.mark.parametrize("replace", [True, False]) -def test_sample_axis_0( - pdf, sample_n_frac, replace, random_state_tuple_axis_0, make_weights_axis_0 -): - n, frac = sample_n_frac - pd_random_state, gd_random_state, checker = random_state_tuple_axis_0 - - df = cudf.from_pandas(pdf) - - pd_weights, gd_weights = make_weights_axis_0( - len(pdf), isinstance(gd_random_state, np.random.RandomState) - ) - if ( - not replace - and not isinstance(gd_random_state, np.random.RandomState) - and gd_weights is not None - ): - pytest.skip( - "`cupy.random.RandomState` doesn't support weighted sampling " - "without replacement." 
- ) - - expected = pdf.sample( - n=n, - frac=frac, - replace=replace, - random_state=pd_random_state, - weights=pd_weights, - axis=0, - ) - - got = df.sample( - n=n, - frac=frac, - replace=replace, - random_state=gd_random_state, - weights=gd_weights, - axis=0, - ) - checker(expected, got) - - -@pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize( - "random_state_lib", [cupy.random.RandomState, np.random.RandomState] -) -def test_sample_reproducibility(replace, random_state_lib): - df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) - - n = 1024 - expected = df.sample(n, replace=replace, random_state=random_state_lib(10)) - out = df.sample(n, replace=replace, random_state=random_state_lib(10)) - - assert_eq(expected, out) - - -@pytest.mark.parametrize("axis", [0, 1]) -def test_sample_invalid_n_frac_combo(axis): - n, frac = 2, 0.5 - pdf = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "float": [0.05, 0.2, 0.3, 0.2, 0.25], - "int": [1, 3, 5, 4, 2], - }, - ) - df = cudf.DataFrame.from_pandas(pdf) - - assert_exceptions_equal( - lfunc=pdf.sample, - rfunc=df.sample, - lfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), - rfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), - ) - - -@pytest.mark.parametrize("n, frac", [(100, None), (None, 3)]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_oversample_without_replace(n, frac, axis): - pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}) - df = cudf.DataFrame.from_pandas(pdf) - - assert_exceptions_equal( - lfunc=pdf.sample, - rfunc=df.sample, - lfunc_args_and_kwargs=( - [], - {"n": n, "frac": frac, "axis": axis, "replace": False}, - ), - rfunc_args_and_kwargs=( - [], - {"n": n, "frac": frac, "axis": axis, "replace": False}, - ), - ) - - -@pytest.mark.parametrize("random_state", [None, cupy.random.RandomState(42)]) -def test_sample_unsupported_arguments(random_state): - df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]}) - with pytest.raises( - NotImplementedError, - match="Random sampling with cupy does not support these inputs.", - ): - df.sample( - n=2, replace=False, random_state=random_state, weights=[1] * 5 - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[100, 10, 1, 0]), - pd.DataFrame(columns=["a", "b", "c", "d"]), - pd.DataFrame(columns=["a", "b", "c", "d"], index=[100]), - pd.DataFrame( - columns=["a", "b", "c", "d"], index=[100, 10000, 2131, 133] - ), - pd.DataFrame({"a": [1, 2, 3], "b": ["abc", "xyz", "klm"]}), - ], -) -def test_dataframe_empty(df): - pdf = df - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf.empty, gdf.empty) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[100, 10, 1, 0]), - pd.DataFrame(columns=["a", "b", "c", "d"]), - pd.DataFrame(columns=["a", "b", "c", "d"], index=[100]), - pd.DataFrame( - columns=["a", "b", "c", "d"], index=[100, 10000, 2131, 133] - ), - pd.DataFrame({"a": [1, 2, 3], "b": ["abc", "xyz", "klm"]}), - ], -) -def test_dataframe_size(df): - pdf = df - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf.size, gdf.size) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series(dtype="float64"), - pd.Series(index=[100, 10, 1, 0], dtype="float64"), - pd.Series([], dtype="float64"), - pd.Series(["a", "b", "c", "d"]), - pd.Series(["a", "b", "c", "d"], index=[0, 1, 10, 11]), - ], -) -def test_series_empty(ps): - ps = ps - gs = cudf.from_pandas(ps) - - assert_eq(ps.empty, gs.empty) - - -@pytest.mark.parametrize( - "data", - [ - None, - [], - [1], - {"a": [10, 11, 12]}, - { - "a": [10, 11, 12], - 
"another column name": [12, 22, 34], - "xyz": [0, 10, 11], - }, - ], -) -@pytest.mark.parametrize( - "columns", - [["a"], ["another column name"], None, pd.Index(["a"], name="index name")], -) -def test_dataframe_init_with_columns(data, columns): - pdf = pd.DataFrame(data, columns=columns) - gdf = cudf.DataFrame(data, columns=columns) - - assert_eq( - pdf, - gdf, - check_index_type=len(pdf.index) != 0, - check_dtype=not (pdf.empty and len(pdf.columns)), - check_column_type=False, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data, ignore_dtype", - [ - ([pd.Series([1, 2, 3])], False), - ([pd.Series(index=[1, 2, 3], dtype="float64")], False), - ([pd.Series(name="empty series name", dtype="float64")], False), - ( - [pd.Series([1]), pd.Series([], dtype="float64"), pd.Series([3])], - False, - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series([3], name="series that is named"), - ], - False, - ), - ([pd.Series([1, 2, 3], name="hi")] * 10, False), - ([pd.Series([1, 2, 3], name=None, index=[10, 11, 12])] * 10, False), - ( - [ - pd.Series([1, 2, 3], name=None, index=[10, 11, 12]), - pd.Series([1, 2, 30], name=None, index=[13, 144, 15]), - ], - True, - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - ], - False, - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], name="abc", dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - ], - False, - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([1, -100, 200, -399, 400], name="abc"), - pd.Series([111, 222, 333], index=[10, 11, 12]), - ], - False, - ), - ], -) -@pytest.mark.parametrize( - "columns", - [ - None, - ["0"], - [0], - ["abc"], - [144, 13], - [2, 1, 0], - pd.Index(["abc"], name="custom_name"), - ], -) -def test_dataframe_init_from_series_list(data, ignore_dtype, columns): - gd_data = [cudf.from_pandas(obj) for obj in data] - - expected = pd.DataFrame(data, columns=columns) - actual = cudf.DataFrame(gd_data, columns=columns) - - if ignore_dtype: - # When a union is performed to generate columns, - # the order is never guaranteed. Hence sort by - # columns before comparison. 
- if not expected.columns.equals(actual.columns): - expected = expected.sort_index(axis=1) - actual = actual.sort_index(axis=1) - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_index_type=True, - ) - else: - assert_eq( - expected, - actual, - check_index_type=True, - check_column_type=False, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data, ignore_dtype, index", - [ - ([pd.Series([1, 2, 3])], False, ["a", "b", "c"]), - ([pd.Series(index=[1, 2, 3], dtype="float64")], False, ["a", "b"]), - ( - [pd.Series(name="empty series name", dtype="float64")], - False, - ["index1"], - ), - ( - [pd.Series([1]), pd.Series([], dtype="float64"), pd.Series([3])], - False, - ["0", "2", "1"], - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series([3], name="series that is named"), - ], - False, - ["_", "+", "*"], - ), - ([pd.Series([1, 2, 3], name="hi")] * 10, False, ["mean"] * 10), - ( - [pd.Series([1, 2, 3], name=None, index=[10, 11, 12])] * 10, - False, - ["abc"] * 10, - ), - ( - [ - pd.Series([1, 2, 3], name=None, index=[10, 11, 12]), - pd.Series([1, 2, 30], name=None, index=[13, 144, 15]), - ], - True, - ["set_index_a", "set_index_b"], - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - ], - False, - ["a", "b", "c"], - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], name="abc", dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - ], - False, - ["a", "v", "z"], - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([1, -100, 200, -399, 400], name="abc"), - pd.Series([111, 222, 333], index=[10, 11, 12]), - ], - False, - ["a", "v", "z"], - ), - ], -) -@pytest.mark.parametrize( - "columns", [None, ["0"], [0], ["abc"], [144, 13], [2, 1, 0]] -) -def test_dataframe_init_from_series_list_with_index( - data, - ignore_dtype, - index, - columns, -): - gd_data = [cudf.from_pandas(obj) for obj in data] - - expected = pd.DataFrame(data, columns=columns, index=index) - actual = cudf.DataFrame(gd_data, columns=columns, index=index) - - if ignore_dtype: - # When a union is performed to generate columns, - # the order is never guaranteed. Hence sort by - # columns before comparison. 
- if not expected.columns.equals(actual.columns): - expected = expected.sort_index(axis=1) - actual = actual.sort_index(axis=1) - assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) - else: - assert_eq(expected, actual, check_column_type=False) - - -@pytest.mark.parametrize( - "data, index", - [ - ([pd.Series([1, 2]), pd.Series([1, 2])], ["a", "b", "c"]), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series([3], name="series that is named"), - ], - ["_", "+"], - ), - ([pd.Series([1, 2, 3], name="hi")] * 10, ["mean"] * 9), - ], -) -def test_dataframe_init_from_series_list_with_index_error(data, index): - gd_data = [cudf.from_pandas(obj) for obj in data] - - assert_exceptions_equal( - pd.DataFrame, - cudf.DataFrame, - ([data], {"index": index}), - ([gd_data], {"index": index}), - ) - - -@pytest.mark.parametrize( - "data", - [ - [pd.Series([1, 2, 3], index=["a", "a", "a"])], - [pd.Series([1, 2, 3], index=["a", "a", "a"])] * 4, - [ - pd.Series([1, 2, 3], index=["a", "b", "a"]), - pd.Series([1, 2, 3], index=["b", "b", "a"]), - ], - [ - pd.Series([1, 2, 3], index=["a", "b", "z"]), - pd.Series([1, 2, 3], index=["u", "b", "a"]), - pd.Series([1, 2, 3], index=["u", "b", "u"]), - ], - ], -) -def test_dataframe_init_from_series_list_duplicate_index_error(data): - gd_data = [cudf.from_pandas(obj) for obj in data] - - assert_exceptions_equal( - lfunc=pd.DataFrame, - rfunc=cudf.DataFrame, - lfunc_args_and_kwargs=([], {"data": data}), - rfunc_args_and_kwargs=([], {"data": gd_data}), - check_exception_type=False, - ) - - -def test_dataframe_iterrows_itertuples(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - - with pytest.raises( - TypeError, - match=re.escape( - "cuDF does not support iteration of DataFrame " - "via itertuples. Consider using " - "`.to_pandas().itertuples()` " - "if you wish to iterate over namedtuples." - ), - ): - df.itertuples() - - with pytest.raises( - TypeError, - match=re.escape( - "cuDF does not support iteration of DataFrame " - "via iterrows. Consider using " - "`.to_pandas().iterrows()` " - "if you wish to iterate over each row." 
- ), - ): - df.iterrows() - - -@pytest.mark.parametrize( - "df", - [ - lambda: cudf.DataFrame({"a": [1, 2, 3]}), - lambda: cudf.DataFrame( - {"a": [1, 2, 3], "b": ["a", "z", "c"]}, index=["a", "z", "x"] - ), - lambda: cudf.DataFrame( - { - "a": [1, 2, 3, None, 2, 1, None], - "b": ["a", "z", "c", "a", "v", "z", "z"], - } - ), - lambda: cudf.DataFrame({"a": [], "b": []}), - lambda: cudf.DataFrame({"a": [None, None], "b": [None, None]}), - lambda: cudf.DataFrame( - { - "a": ["hello", "world", "rapids", "ai", "nvidia"], - "b": cudf.Series( - [1, 21, 21, 11, 11], - dtype="timedelta64[s]", - index=["a", "b", "c", "d", " e"], - ), - }, - index=["a", "b", "c", "d", " e"], - ), - lambda: cudf.DataFrame( - { - "a": ["hello", None, "world", "rapids", None, "ai", "nvidia"], - "b": cudf.Series( - [1, 21, None, 11, None, 11, None], dtype="datetime64[s]" - ), - } - ), - ], -) -@pytest.mark.parametrize("numeric_only", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -def test_dataframe_mode(df, numeric_only, dropna): - df = df() - pdf = df.to_pandas() - - expected = pdf.mode(numeric_only=numeric_only, dropna=dropna) - actual = df.mode(numeric_only=numeric_only, dropna=dropna) - if len(actual.columns) == 0: - # pandas < 3.0 returns an Index[object] instead of RangeIndex - actual.columns = expected.columns - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "lhs, rhs", [("a", "a"), ("a", "b"), (1, 1.0), (None, None), (None, "a")] -) -def test_equals_names(lhs, rhs): - lhs = cudf.DataFrame({lhs: [1, 2]}) - rhs = cudf.DataFrame({rhs: [1, 2]}) - - got = lhs.equals(rhs) - expect = lhs.to_pandas().equals(rhs.to_pandas()) - - assert_eq(expect, got) - - -def test_equals_dtypes(): - lhs = cudf.DataFrame({"a": [1, 2.0]}) - rhs = cudf.DataFrame({"a": [1, 2]}) - - got = lhs.equals(rhs) - expect = lhs.to_pandas().equals(rhs.to_pandas()) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "df1", - [ - pd.DataFrame({"a": [10, 11, 12]}, index=["a", "b", "z"]), - pd.DataFrame({"z": ["a"]}), - pd.DataFrame({"a": [], "b": []}), - ], -) -@pytest.mark.parametrize( - "df2", - [ - pd.DataFrame(), - pd.DataFrame({"a": ["a", "a", "c", "z", "A"], "z": [1, 2, 3, 4, 5]}), - ], -) -@pytest.mark.parametrize( - "op", - [ - operator.eq, - operator.ne, - operator.lt, - operator.gt, - operator.le, - operator.ge, - ], -) -def test_dataframe_error_equality(df1, df2, op): - gdf1 = cudf.from_pandas(df1) - gdf2 = cudf.from_pandas(df2) - - assert_exceptions_equal(op, op, ([df1, df2],), ([gdf1, gdf2],)) - - -@pytest.mark.parametrize( - "df,expected_pdf", - [ - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series([1, 2, None, 3], dtype="uint8"), - "b": cudf.Series([23, None, None, 32], dtype="uint16"), - } - ), - pd.DataFrame( - { - "a": pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()), - "b": pd.Series( - [23, None, None, 32], dtype=pd.UInt16Dtype() - ), - } - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series([None, 123, None, 1], dtype="uint32"), - "b": cudf.Series( - [234, 2323, 23432, None, None, 224], dtype="uint64" - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - [None, 123, None, 1], dtype=pd.UInt32Dtype() - ), - "b": pd.Series( - [234, 2323, 23432, None, None, 224], - dtype=pd.UInt64Dtype(), - ), - } - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series( - [-10, 1, None, -1, None, 3], dtype="int8" - ), - "b": cudf.Series( - [111, None, 222, None, 13], dtype="int16" - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - [-10, 1, None, -1, None, 3], 
dtype=pd.Int8Dtype() - ), - "b": pd.Series( - [111, None, 222, None, 13], dtype=pd.Int16Dtype() - ), - } - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series( - [11, None, 22, 33, None, 2, None, 3], dtype="int32" - ), - "b": cudf.Series( - [32431, None, None, 32322, 0, 10, -32324, None], - dtype="int64", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - [11, None, 22, 33, None, 2, None, 3], - dtype=pd.Int32Dtype(), - ), - "b": pd.Series( - [32431, None, None, 32322, 0, 10, -32324, None], - dtype=pd.Int64Dtype(), - ), - } - ), - ), - ( - lambda: cudf.DataFrame( - { - "a": cudf.Series( - [True, None, False, None, False, True, True, False], - dtype="bool_", - ), - "b": cudf.Series( - [ - "abc", - "a", - None, - "hello world", - "foo buzz", - "", - None, - "rapids ai", - ], - dtype="object", - ), - "c": cudf.Series( - [0.1, None, 0.2, None, 3, 4, 1000, None], - dtype="float64", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - [True, None, False, None, False, True, True, False], - dtype=pd.BooleanDtype(), - ), - "b": pd.Series( - [ - "abc", - "a", - None, - "hello world", - "foo buzz", - "", - None, - "rapids ai", - ], - dtype=pd.StringDtype(), - ), - "c": pd.Series( - [0.1, None, 0.2, None, 3, 4, 1000, None], - dtype=pd.Float64Dtype(), - ), - } - ), - ), - ], -) -def test_dataframe_to_pandas_nullable_dtypes(df, expected_pdf): - actual_pdf = df().to_pandas(nullable=True) - - assert_eq(actual_pdf, expected_pdf) - - -@pytest.mark.parametrize( - "data", - [ - [{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}], - [{"a": 1, "b": 2, "c": None}, {"a": None, "b": 5, "c": 6}], - [{"a": 1, "b": 2}, {"a": 1, "b": 5, "c": 6}], - [{"a": 1, "b": 2}, {"b": 5, "c": 6}], - [{}, {"a": 1, "b": 5, "c": 6}], - [{"a": 1, "b": 2, "c": 3}, {"a": 4.5, "b": 5.5, "c": 6.5}], - ], -) -def test_dataframe_init_from_list_of_dicts(data): - expect = pd.DataFrame(data) - got = cudf.DataFrame(data) - - assert_eq(expect, got) - - -def test_dataframe_pipe(): - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - - def add_int_col(df, column): - df[column] = df._constructor_sliced([10, 20, 30, 40]) - return df - - def add_str_col(df, column): - df[column] = df._constructor_sliced(["a", "b", "xyz", "ai"]) - return df - - expected = ( - pdf.pipe(add_int_col, "one") - .pipe(add_int_col, column="two") - .pipe(add_str_col, "three") - ) - actual = ( - gdf.pipe(add_int_col, "one") - .pipe(add_int_col, column="two") - .pipe(add_str_col, "three") - ) - - assert_eq(expected, actual) - - expected = ( - pdf.pipe((add_str_col, "df"), column="one") - .pipe(add_str_col, column="two") - .pipe(add_int_col, "three") - ) - actual = ( - gdf.pipe((add_str_col, "df"), column="one") - .pipe(add_str_col, column="two") - .pipe(add_int_col, "three") - ) - - assert_eq(expected, actual) - - -def test_dataframe_pipe_error(): - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - - def custom_func(df, column): - df[column] = df._constructor_sliced([10, 20, 30, 40]) - return df - - assert_exceptions_equal( - lfunc=pdf.pipe, - rfunc=gdf.pipe, - lfunc_args_and_kwargs=([(custom_func, "columns")], {"columns": "d"}), - rfunc_args_and_kwargs=([(custom_func, "columns")], {"columns": "d"}), - ) - - -@pytest.mark.parametrize( - "op", - ["count", "kurt", "kurtosis", "skew"], -) -def test_dataframe_axis1_unsupported_ops(op): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [8, 9, 10]}) - - with pytest.raises( - NotImplementedError, match="Only axis=0 is currently supported." 
- ): - getattr(df, op)(axis=1) - - -def test_dataframe_from_pandas_duplicate_columns(): - pdf = pd.DataFrame(columns=["a", "b", "c", "a"]) - pdf["a"] = [1, 2, 3] - - with pytest.raises( - ValueError, match="Duplicate column names are not allowed" - ): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame( - {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]} - ), - pd.DataFrame(), - ], -) -@pytest.mark.parametrize( - "columns", - [ - None, - ["a"], - ["c", "a"], - ["b", "a", "c"], - [], - pd.Index(["c", "a"]), - cudf.Index(["c", "a"]), - ["abc", "a"], - ["column_not_exists1", "column_not_exists2"], - ], -) -@pytest.mark.parametrize("index", [["abc", "def", "ghi"]]) -def test_dataframe_constructor_columns(df, columns, index, request): - def assert_local_eq(actual, df, expected, host_columns): - check_index_type = not expected.empty - if host_columns is not None and any( - col not in df.columns for col in host_columns - ): - assert_eq( - expected, - actual, - check_dtype=False, - check_index_type=check_index_type, - ) - else: - assert_eq( - expected, - actual, - check_index_type=check_index_type, - check_column_type=False, - ) - - gdf = cudf.from_pandas(df) - host_columns = ( - columns.to_pandas() if isinstance(columns, cudf.Index) else columns - ) - - expected = pd.DataFrame(df, columns=host_columns, index=index) - actual = cudf.DataFrame(gdf, columns=columns, index=index) - - assert_local_eq(actual, df, expected, host_columns) - - -def test_dataframe_constructor_column_index_only(): - columns = ["a", "b", "c"] - index = ["r1", "r2", "r3"] - - gdf = cudf.DataFrame(index=index, columns=columns) - assert not id(gdf["a"]._column) == id(gdf["b"]._column) and not id( - gdf["b"]._column - ) == id(gdf["c"]._column) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2.5, 3], "b": [3, 4.5, 5], "c": [2.0, 3.0, 4.0]}, - {"a": [1, 2.2, 3], "b": [2.0, 3.0, 4.0], "c": [5.0, 6.0, 4.0]}, - ], -) -@pytest.mark.parametrize( - "aggs", - [ - ["min", "sum", "max"], - ("min", "sum", "max"), - {"min", "sum", "max"}, - "sum", - {"a": "sum", "b": "min", "c": "max"}, - {"a": ["sum"], "b": ["min"], "c": ["max"]}, - {"a": ("sum"), "b": ("min"), "c": ("max")}, - {"a": {"sum"}, "b": {"min"}, "c": {"max"}}, - {"a": ["sum", "min"], "b": ["sum", "max"], "c": ["min", "max"]}, - {"a": ("sum", "min"), "b": ("sum", "max"), "c": ("min", "max")}, - {"a": {"sum", "min"}, "b": {"sum", "max"}, "c": {"min", "max"}}, - ], -) -def test_agg_for_dataframes(data, aggs): - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - expect = pdf.agg(aggs).sort_index() - got = gdf.agg(aggs).sort_index() - - assert_eq(expect, got, check_dtype=True) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]}, - {"a": [1, 2, 3], "b": [True, True, False], "c": [False, True, False]}, - ], -) -@pytest.mark.parametrize( - "aggs", - [ - ["min", "sum", "max"], - "sum", - {"a": "sum", "b": "min", "c": "max"}, - ], -) -def test_agg_for_dataframes_error(data, aggs): - gdf = cudf.DataFrame(data) - - with pytest.raises(TypeError): - gdf.agg(aggs) - - -def test_agg_for_unsupported_function(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - - with pytest.raises(NotImplementedError): - gdf.agg({"a": np.sum, "b": np.min, "c": np.max}) - - -def test_agg_for_dataframe_with_invalid_function(): - aggs = "asdf" - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - - with pytest.raises( - 
AttributeError, - match=f"{aggs} is not a valid function for 'DataFrame' object", - ): - gdf.agg(aggs) - - -def test_agg_for_series_with_invalid_function(): - aggs = {"a": "asdf"} - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - - with pytest.raises( - AttributeError, - match=f"{aggs['a']} is not a valid function for 'Series' object", - ): - gdf.agg(aggs) - - -@pytest.mark.parametrize( - "aggs", - [ - "sum", - ["min", "sum", "max"], - {"a": {"sum", "min"}, "b": {"sum", "max"}, "c": {"min", "max"}}, - ], -) -def test_agg_for_dataframe_with_string_columns(aggs): - gdf = cudf.DataFrame( - {"a": ["m", "n", "o"], "b": ["t", "u", "v"], "c": ["x", "y", "z"]}, - index=["a", "b", "c"], - ) - - with pytest.raises( - NotImplementedError, - match=re.escape( - "DataFrame.agg() is not supported for " - "frames containing string columns" - ), - ): - gdf.agg(aggs) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("overwrite", [True, False]) -@pytest.mark.parametrize( - "left_keys,right_keys", - [ - [("a", "b"), ("a", "b")], - [("a", "b"), ("a", "c")], - [("a", "b"), ("d", "e")], - ], -) -@pytest.mark.parametrize( - "data_left,data_right", - [ - [([1, 2, 3], [3, 4, 5]), ([1, 2, 3], [3, 4, 5])], - [ - ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), - ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), - ], - [ - ([True, False, True], [False, False, False]), - ([True, False, True], [False, False, False]), - ], - [ - ([np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]), - ([np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]), - ], - [([1, 2, 3], [3, 4, 5]), ([1, 2, 4], [30, 40, 50])], - [ - ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), - ([1.0, 2.0, 4.0], [30.0, 40.0, 50.0]), - ], - [([1, 2, 3], [3, 4, 5]), ([10, 20, 40], [30, 40, 50])], - [ - ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), - ([10.0, 20.0, 40.0], [30.0, 40.0, 50.0]), - ], - ], -) -def test_update_for_dataframes( - left_keys, right_keys, data_left, data_right, overwrite -): - errors = "ignore" - join = "left" - left = dict(zip(left_keys, data_left, strict=True)) - right = dict(zip(right_keys, data_right, strict=True)) - pdf = pd.DataFrame(left) - gdf = cudf.DataFrame(left, nan_as_null=False) - - other_pd = pd.DataFrame(right) - other_gd = cudf.DataFrame(right, nan_as_null=False) - - pdf.update(other=other_pd, join=join, overwrite=overwrite, errors=errors) - gdf.update(other=other_gd, join=join, overwrite=overwrite, errors=errors) - - assert_eq(pdf, gdf, check_dtype=False) - - -def test_update_for_right_join(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - other_gd = cudf.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) - - with pytest.raises( - NotImplementedError, match="Only left join is supported" - ): - gdf.update(other_gd, join="right") - - -def test_update_for_data_overlap(): - errors = "raise" - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - - other_pd = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) - other_gd = cudf.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) - - assert_exceptions_equal( - lfunc=pdf.update, - rfunc=gdf.update, - lfunc_args_and_kwargs=([other_pd, errors], {}), - rfunc_args_and_kwargs=([other_gd, errors], {}), - ) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [[1], [2], [3]]}, - { - "left-a": [0, 1, 2], - "a": [[1], None, [3]], - "right-a": ["abc", "def", "ghi"], - }, - { - "left-a": [[], None, None], - "a": [[1], None, [3]], - "right-a": ["abc", "def", "ghi"], - }, - ], -) -def 
test_dataframe_roundtrip_arrow_list_dtype(data): - gdf = cudf.DataFrame(data) - table = gdf.to_arrow() - expected = cudf.DataFrame.from_arrow(table) - - assert_eq(gdf, expected) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [{"one": 3, "two": 4, "three": 10}]}, - { - "left-a": [0, 1, 2], - "a": [{"x": 0.23, "y": 43}, None, {"x": 23.9, "y": 4.3}], - "right-a": ["abc", "def", "ghi"], - }, - { - "left-a": [{"a": 1}, None, None], - "a": [ - {"one": 324, "two": 23432, "three": 324}, - None, - {"one": 3.24, "two": 1, "three": 324}, - ], - "right-a": ["abc", "def", "ghi"], - }, - ], -) -def test_dataframe_roundtrip_arrow_struct_dtype(data): - gdf = cudf.DataFrame(data) - table = gdf.to_arrow() - expected = cudf.DataFrame.from_arrow(table) - - assert_eq(gdf, expected) - - -def test_dataframe_setitem_cupy_array(): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame(rng.standard_normal(size=(10, 2))) - gdf = cudf.from_pandas(pdf) - - gpu_array = cupy.array([True, False] * 5) - pdf[gpu_array.get()] = 1.5 - gdf[gpu_array] = 1.5 - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("level", ["x", 0]) -def test_rename_for_level_MultiIndex_dataframe(level): - data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} - index = {0: 123, 1: 4, 2: 6} - pdf = pd.DataFrame( - data, - index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]), - ) - pdf.index.names = ["x", "y", "z"] - gdf = cudf.from_pandas(pdf) - - expect = pdf.rename(index=index, level=level) - got = gdf.rename(index=index, level=level) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "columns", - [{"a": "f", "b": "g"}, {1: 3, 2: 4}, lambda s: 2 * s], -) -@pytest.mark.parametrize("level", [0, 1]) -def test_rename_for_level_MultiColumn_dataframe(columns, level): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) - - pdf = gdf.to_pandas() - - expect = pdf.rename(columns=columns, level=level) - got = gdf.rename(columns=columns, level=level) - - assert_eq(expect, got) - - -def test_rename_for_level_RangeIndex_dataframe(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - pdf = gdf.to_pandas() - - expect = pdf.rename(columns={"a": "f"}, index={0: 3, 1: 4}, level=0) - got = gdf.rename(columns={"a": "f"}, index={0: 3, 1: 4}, level=0) - - assert_eq(expect, got) - - -def test_rename_for_level_is_None_MC(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) - pdf = gdf.to_pandas() - - expect = pdf.rename(columns={"a": "f"}, level=None) - got = gdf.rename(columns={"a": "f"}, level=None) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - [[1, 2, 3], 11, "a"], - [None, 22, "e"], - [[4], 33, "i"], - [[], 44, "o"], - [[5, 6], 55, "u"], - ], # nested - [ - [1, 11, "a"], - [2, 22, "e"], - [3, 33, "i"], - [4, 44, "o"], - [5, 55, "u"], - ], # non-nested - ], -) -@pytest.mark.parametrize( - ("labels", "label_to_explode"), - [ - (None, 0), - (pd.Index(["a", "b", "c"]), "a"), - ( - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] - ), - (0, "a"), - ), - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize( - "p_index", - [ - None, - ["ia", "ib", "ic", "id", "ie"], - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] - ), - ], -) -def test_explode(data, labels, ignore_index, p_index, 
label_to_explode): - pdf = pd.DataFrame(data, index=p_index, columns=labels) - gdf = cudf.from_pandas(pdf) - - expect = pdf.explode(label_to_explode, ignore_index) - got = gdf.explode(label_to_explode, ignore_index) - - assert_eq(expect, got, check_dtype=False) - - -def test_explode_preserve_categorical(): - gdf = cudf.DataFrame( - { - "A": [[1, 2], None, [2, 3]], - "B": cudf.Series([0, 1, 2], dtype="category"), - } - ) - result = gdf.explode("A") - expected = cudf.DataFrame( - { - "A": [1, 2, None, 2, 3], - "B": cudf.Series([0, 0, 1, 2, 2], dtype="category"), - } - ) - expected.index = cudf.Index([0, 0, 1, 2, 2]) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "data,ascending,expected_data", - [ - ( - {"a": [10, 0, 2], "b": [-10, 10, 1]}, - True, - [1, 2, 0], - ), - ( - {"a": [10, 0, 2], "b": [-10, 10, 1]}, - False, - [0, 2, 1], - ), - ], -) -def test_dataframe_argsort(data, ascending, expected_data): - actual = cudf.DataFrame(data).argsort(ascending=ascending) - expected = cupy.array(expected_data, dtype="int32") - - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data", - [ - np.random.RandomState(seed=10).randint(-50, 50, (25, 30)), - np.random.RandomState(seed=10).random_sample((4, 4)), - np.array([1.123, 2.343, 5.890, 0.0]), - [True, False, True, False, False], - {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, - ], -) -@pytest.mark.parametrize("periods", (-5, -1, 0, 1, 5)) -def test_diff_numeric_dtypes(data, periods): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - actual = gdf.diff(periods=periods, axis=0) - expected = pdf.diff(periods=periods, axis=0) - - assert_eq( - expected, - actual, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - ("precision", "scale"), - [(5, 2), (8, 5)], -) -@pytest.mark.parametrize( - "dtype", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype], -) -def test_diff_decimal_dtypes(precision, scale, dtype): - gdf = cudf.DataFrame( - np.random.default_rng(seed=42).uniform(10.5, 75.5, (10, 6)), - dtype=dtype(precision=precision, scale=scale), - ) - pdf = gdf.to_pandas() - - actual = gdf.diff() - expected = pdf.diff() - - assert_eq( - expected, - actual, - check_dtype=False, - ) - - -def test_diff_invalid_axis(): - gdf = cudf.DataFrame(np.array([1.123, 2.343, 5.890, 0.0])) - with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): - gdf.diff(periods=1, axis=1) - - -@pytest.mark.parametrize( - "data", - [ - { - "int_col": [1, 2, 3, 4, 5], - "float_col": [1.0, 2.0, 3.0, 4.0, 5.0], - "string_col": ["a", "b", "c", "d", "e"], - }, - ["a", "b", "c", "d", "e"], - ], -) -def test_diff_unsupported_dtypes(data): - gdf = cudf.DataFrame(data) - with pytest.raises( - TypeError, - match=r"unsupported operand type\(s\)", - ): - gdf.diff() - - -def test_diff_many_dtypes(): - pdf = pd.DataFrame( - { - "dates": pd.date_range("2020-01-01", "2020-01-06", freq="D"), - "bools": [True, True, True, False, True, True], - "floats": [1.0, 2.0, 3.5, np.nan, 5.0, -1.7], - "ints": [1, 2, 3, 3, 4, 5], - "nans_nulls": [np.nan, None, None, np.nan, np.nan, None], - } - ) - gdf = cudf.from_pandas(pdf) - assert_eq(pdf.diff(), gdf.diff()) - assert_eq(pdf.diff(periods=2), gdf.diff(periods=2)) - - -def test_dataframe_assign_cp_np_array(): - m, n = 5, 3 - cp_ndarray = cupy.random.randn(m, n) - pdf = pd.DataFrame({f"f_{i}": range(m) for i in range(n)}) - gdf = cudf.DataFrame({f"f_{i}": range(m) for i in range(n)}) - pdf[[f"f_{i}" for i in range(n)]] = cupy.asnumpy(cp_ndarray) - gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray - 
- assert_eq(pdf, gdf) - - -def test_dataframe_nunique(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [1, 1, 0]}) - pdf = gdf.to_pandas() - - actual = gdf.nunique() - expected = pdf.nunique() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "columns", - [ - pd.RangeIndex(2, name="foo"), - pd.MultiIndex.from_arrays([[1, 2], [2, 3]], names=["foo", 1]), - pd.Index([3, 5], dtype=np.int8, name="foo"), - ], -) -def test_nunique_preserve_column_in_index(columns): - df = cudf.DataFrame([[1, 2]], columns=columns) - result = df.nunique().index.to_pandas() - assert_eq(result, columns, exact=True) - - -def test_dataframe_nunique_index(): - gdf = cudf.DataFrame( - {"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]} - ) - pdf = gdf.to_pandas() - - actual = gdf.index.nunique() - expected = pdf.index.nunique() - - assert_eq(expected, actual) - - -def test_dataframe_rename_duplicate_column(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) - with pytest.raises( - ValueError, match="Duplicate column names are not allowed" - ): - gdf.rename(columns={"a": "b"}, inplace=True) - - -def test_dataframe_rename_columns_keep_type(): - gdf = cudf.DataFrame([[1, 2, 3]]) - gdf.columns = cudf.Index([4, 5, 6], dtype=np.int8) - result = gdf.rename({4: 50}, axis="columns").columns - expected = pd.Index([50, 5, 6], dtype=np.int8) - assert_eq(result, expected) - - -@pytest_unmark_spilling -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - np.random.RandomState(seed=10).randint(-50, 50, (10, 10)), - np.random.RandomState(seed=10).random_sample((4, 4)), - np.array([1.123, 2.343, 5.890, 0.0]), - {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, - ], -) -@pytest.mark.parametrize("periods", [-5, 0, 2]) -@pytest.mark.parametrize( - "fill_method", ["ffill", "bfill", "pad", "backfill", no_default] -) -def test_dataframe_pct_change(data, periods, fill_method): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - with expect_warning_if(fill_method is not no_default): - actual = gdf.pct_change(periods=periods, fill_method=fill_method) - with expect_warning_if( - fill_method is not no_default or pdf.isna().any().any() - ): - expected = pdf.pct_change(periods=periods, fill_method=fill_method) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_mean_timeseries(numeric_only): - gdf = cudf.datasets.timeseries() - if not numeric_only: - gdf = gdf.select_dtypes(include="number") - pdf = gdf.to_pandas() - - expected = pdf.mean(numeric_only=numeric_only) - actual = gdf.mean(numeric_only=numeric_only) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_std_different_dtypes(numeric_only): - gdf = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": ["a", "b", "c", "d", "e"], - "c": [1.0, 2.0, 3.0, 4.0, 5.0], - } - ) - if not numeric_only: - gdf = gdf.select_dtypes(include="number") - pdf = gdf.to_pandas() - - expected = pdf.std(numeric_only=numeric_only) - actual = gdf.std(numeric_only=numeric_only) - - assert_eq(expected, actual) - - -def test_empty_numeric_only(): - gdf = cudf.DataFrame( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], - "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], - } - ) - pdf = gdf.to_pandas() - expected = pdf.prod(numeric_only=True) - actual = gdf.prod(numeric_only=True) - 
assert_eq(expected, actual, check_dtype=True)
-
-
-# Note that for now expressions do not automatically handle casting, so inputs
-# need to be cast appropriately
-@pytest.mark.parametrize(
-    "expr, dtype",
-    [
-        ("a", int),
-        ("+a", int),
-        ("a + b", int),
-        ("a == b", int),
-        ("a / b", float),
-        ("a * b", int),
-        ("a > b", int),
-        ("a >= b", int),
-        ("a > b > c", int),
-        ("a > b < c", int),
-        ("a & b", int),
-        ("a & b | c", int),
-        ("sin(a)", float),
-        ("exp(sin(abs(a)))", float),
-        ("sqrt(floor(a))", float),
-        ("ceil(arctanh(a))", float),
-        ("(a + b) - (c * d)", int),
-        ("~a", int),
-        ("(a > b) and (c > d)", int),
-        ("(a > b) or (c > d)", int),
-        ("not (a > b)", int),
-        ("a + 1", int),
-        ("a + 1.0", float),
-        ("-a + 1", int),
-        ("+a + 1", int),
-        ("e = a + 1", int),
-        (
-            """
-            e = log(cos(a)) + 1.0
-            f = abs(c) - exp(d)
-            """,
-            float,
-        ),
-        ("a_b_are_equal = (a == b)", int),
-        ("a > b", str),
-        ("a < '1'", str),
-        ('a == "1"', str),
-    ],
-)
-@pytest.mark.parametrize("nrows", [0, 10])
-def test_dataframe_eval(nrows, expr, dtype):
-    arr = np.ones(nrows)
-    df_eval = cudf.DataFrame({"a": arr, "b": arr, "c": arr, "d": arr})
-    df_eval = df_eval.astype(dtype)
-    with _hide_ufunc_warnings(expr):
-        expect = df_eval.to_pandas().eval(expr)
-    got = df_eval.eval(expr)
-    # In the specific case where the evaluated expression is a unary function
-    # of a single column with no nesting, pandas will retain the name. This
-    # level of compatibility is out of scope for now.
-    assert_eq(expect, got, check_names=False)
-
-    # Test inplace
-    if re.search("[^=><]=[^=]", expr) is not None:
-        pdf_eval = df_eval.to_pandas()
-        with _hide_ufunc_warnings(expr):
-            pdf_eval.eval(expr, inplace=True)
-        df_eval.eval(expr, inplace=True)
-        assert_eq(pdf_eval, df_eval)
-
-
-@pytest.mark.parametrize(
-    "expr",
-    [
-        """
-        e = a + b
-        a == b
-        """,
-        "a_b_are_equal = (a == b) = c",
-    ],
-)
-def test_dataframe_eval_errors(expr):
-    df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 3, 4, 5]})
-    with pytest.raises(ValueError):
-        df.eval(expr)
-
-
-def test_dataframe_eval_misc():
-    df = cudf.DataFrame({"a": [1, 2, 3, None, 5]})
-    got = df.eval("isnull(a)")
-    assert_eq(got, cudf.Series.isnull(df["a"]), check_names=False)
-
-    df.eval("c = isnull(1)", inplace=True)
-    assert_eq(df["c"], cudf.Series([False] * len(df), name="c"))
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
-        {
-            "first_name": ["John", "Anne", "John", "Beth"],
-            "middle_name": ["Smith", None, None, "Louise"],
-        },
-    ],
-)
-@pytest.mark.parametrize("sort", [True, False])
-@pytest.mark.parametrize("ascending", [True, False])
-@pytest.mark.parametrize("normalize", [True, False])
-@pytest.mark.parametrize("dropna", [True, False])
-@pytest.mark.parametrize("use_subset", [True, False])
-def test_value_counts(
-    data,
-    sort,
-    ascending,
-    normalize,
-    dropna,
-    use_subset,
-):
-    subset = [next(iter(data.keys()))]
-    gdf = cudf.DataFrame(data)
-    pdf = gdf.to_pandas()
-
-    got = gdf.value_counts(
-        subset=subset if (use_subset) else None,
-        sort=sort,
-        ascending=ascending,
-        normalize=normalize,
-        dropna=dropna,
-    )
-    expected = pdf.value_counts(
-        subset=subset if (use_subset) else None,
-        sort=sort,
-        ascending=ascending,
-        normalize=normalize,
-        dropna=dropna,
-    )
-
-    if not dropna:
-        # Convert the Pandas series to a cuDF one due to difference
-        # in the handling of NaNs between the two (<NA> in cuDF and
-        # NaN in Pandas) when dropna=False.
- assert_eq(got.sort_index(), cudf.from_pandas(expected).sort_index()) - else: - assert_eq(got.sort_index(), expected.sort_index()) - - -def test_value_counts_no_subset(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [1, 1, 0]}) - with pytest.raises(KeyError): - gdf.value_counts(subset=["not_a_column_name"]) - - -def test_multiindex_wildcard_selection_all(): - midx = cudf.MultiIndex.from_tuples( - [(c1, c2) for c1 in "abc" for c2 in "ab"] - ) - df = cudf.DataFrame({f"{i}": [i] for i in range(6)}) - df.columns = midx - expect = df.to_pandas().loc[:, (slice(None), "b")] - got = df.loc[:, (slice(None), "b")] - assert_eq(expect, got) - - -@pytest_xfail(reason="Not yet properly supported.") -def test_multiindex_wildcard_selection_partial(): - midx = cudf.MultiIndex.from_tuples( - [(c1, c2) for c1 in "abc" for c2 in "ab"] - ) - df = cudf.DataFrame({f"{i}": [i] for i in range(6)}) - df.columns = midx - expect = df.to_pandas().loc[:, (slice("a", "b"), "b")] - got = df.loc[:, (slice("a", "b"), "b")] - assert_eq(expect, got) - - -@pytest_xfail(reason="Not yet properly supported.") -def test_multiindex_wildcard_selection_three_level_all(): - midx = cudf.MultiIndex.from_tuples( - [(c1, c2, c3) for c1 in "abcd" for c2 in "abc" for c3 in "ab"] - ) - df = cudf.DataFrame({f"{i}": [i] for i in range(24)}) - df.columns = midx - - expect = df.to_pandas().loc[:, (slice("a", "c"), slice("a", "b"), "b")] - got = df.loc[:, (slice(None), "b")] - assert_eq(expect, got) - - -def test_dataframe_assign_scalar_to_empty_series(): - expected = pd.DataFrame({"a": []}) - actual = cudf.DataFrame({"a": []}) - expected.a = 0 - actual.a = 0 - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {0: [1, 2, 3], 2: [10, 11, 23]}, - {("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]}, - ], -) -def test_non_string_column_name_to_arrow(data): - df = cudf.DataFrame(data) - - expected = df.to_arrow() - actual = pa.Table.from_pandas(df.to_pandas()) - - assert expected.equals(actual) - - -def test_complex_types_from_arrow(): - expected = pa.Table.from_arrays( - [ - pa.array([1, 2, 3]), - pa.array([10, 20, 30]), - pa.array([{"a": 9}, {"b": 10}, {"c": 11}]), - pa.array([[{"a": 1}], [{"b": 2}], [{"c": 3}]]), - pa.array([10, 11, 12]).cast(pa.decimal128(21, 2)), - pa.array([{"a": 9}, {"b": 10, "c": {"g": 43}}, {"c": {"a": 10}}]), - ], - names=["a", "b", "c", "d", "e", "f"], - ) - - df = cudf.DataFrame.from_arrow(expected) - actual = df.to_arrow() - - assert expected.equals(actual) - - -@pytest.mark.parametrize( - "data", - [ - { - "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], - "style": ["cup", "cup", "cup", "pack", "pack"], - "rating": [4, 4, 3.5, 15, 5], - }, - { - "brand": ["Indomie", "Yum Yum", "Indomie", "Indomie", "Indomie"], - "style": ["cup", "cup", "cup", "cup", "pack"], - "rating": [4, 4, 3.5, 4, 5], - }, - ], -) -@pytest.mark.parametrize( - "subset", [None, ["brand"], ["rating"], ["style", "rating"]] -) -@pytest.mark.parametrize("keep", ["first", "last", False]) -def test_dataframe_duplicated(data, subset, keep): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expected = pdf.duplicated(subset=subset, keep=keep) - actual = gdf.duplicated(subset=subset, keep=keep) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, - {"a": [[{"b": 567}], None] * 10}, - {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, - ], -) -def test_dataframe_values_complex_types(data): - gdf = cudf.DataFrame(data) 
- with pytest.raises(NotImplementedError): - gdf.values - - -def test_dataframe_from_arrow_slice(): - table = pa.Table.from_pandas( - pd.DataFrame.from_dict( - {"a": ["aa", "bb", "cc"] * 3, "b": [1, 2, 3] * 3} - ) - ) - table_slice = table.slice(3, 7) - - expected = table_slice.to_pandas() - actual = cudf.DataFrame.from_arrow(table_slice) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,index", - [ - ({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None), - ( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - }, - [10, 11], - ), - ( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - }, - [10, 11], - ), - ([[10, 11], [12, 13]], ["a", "b", "c"]), - ], -) -def test_dataframe_init_length_error(data, index): - assert_exceptions_equal( - lfunc=pd.DataFrame, - rfunc=cudf.DataFrame, - lfunc_args_and_kwargs=( - [], - {"data": data, "index": index}, - ), - rfunc_args_and_kwargs=( - [], - {"data": data, "index": index}, - ), - ) - - -def test_dataframe_binop_with_mixed_date_types(): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - rng.random(size=(2, 2)), - columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), - ) - ser = pd.Series(rng.random(size=3), index=[0, 1, 2]) - gdf = cudf.from_pandas(df) - gser = cudf.from_pandas(ser) - expected = df - ser - got = gdf - gser - assert_eq(expected, got) - - -def test_dataframe_binop_with_mixed_string_types(): - rng = np.random.default_rng(seed=0) - df1 = pd.DataFrame(rng.random(size=(3, 3)), columns=pd.Index([0, 1, 2])) - df2 = pd.DataFrame( - rng.random(size=(6, 6)), - columns=pd.Index([0, 1, 2, "VhDoHxRaqt", "X0NNHBIPfA", "5FbhPtS0D1"]), - ) - gdf1 = cudf.from_pandas(df1) - gdf2 = cudf.from_pandas(df2) - - expected = df2 + df1 - got = gdf2 + gdf1 - - assert_eq(expected, got) - - -def test_dataframe_binop_and_where(): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame(rng.random(size=(2, 2)), columns=pd.Index([True, False])) - gdf = cudf.from_pandas(df) - - expected = df > 1 - got = gdf > 1 - - assert_eq(expected, got) - - expected = df[df > 1] - got = gdf[gdf > 1] - - assert_eq(expected, got) - - -def test_dataframe_binop_with_datetime_index(): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - rng.random(size=(2, 2)), - columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), - ) - ser = pd.Series( - rng.random(2), - index=pd.Index( - [ - "2000-01-04", - "2000-01-03", - ], - dtype="datetime64[ns]", - ), - ) - gdf = cudf.from_pandas(df) - gser = cudf.from_pandas(ser) - expected = df - ser - got = gdf - gser - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "dtype", ["datetime64[ns]", "timedelta64[ns]", "int64", "float32"] -) -def test_dataframe_mixed_dtype_error(dtype): - pdf = pd.Series([1, 2, 3], dtype=dtype).to_frame().astype(object) - with pytest.raises(TypeError): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "index_data,name", - [([10, 13], "a"), ([30, 40, 20], "b"), (["ef"], "c"), ([2, 3], "Z")], -) -def test_dataframe_reindex_with_index_names(index_data, name): - gdf = cudf.DataFrame( - { - "a": [10, 12, 13], - "b": [20, 30, 40], - "c": cudf.Series(["ab", "cd", "ef"], dtype="category"), - } - ) - if name in gdf.columns: - gdf = gdf.set_index(name) - pdf = gdf.to_pandas() - - gidx = cudf.Index(index_data, name=name) - actual = gdf.reindex(gidx) - expected = pdf.reindex(gidx.to_pandas()) - - assert_eq(actual, expected) - - actual = gdf.reindex(index_data) - expected = pdf.reindex(index_data) - - assert_eq(actual, expected) - - -def 
test_dataframe_reduction_error(): - gdf = cudf.DataFrame( - { - "a": cudf.Series([1, 2, 3], dtype="float"), - "d": cudf.Series([10, 20, 30], dtype="timedelta64[ns]"), - } - ) - - with pytest.raises(TypeError): - gdf.sum() - - -def test_dataframe_from_generator(): - pdf = pd.DataFrame((i for i in range(5))) - gdf = cudf.DataFrame((i for i in range(5))) - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA]) -@pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA]) -@pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]]) -def test_dataframe_contains(name, contains, other_names): - column_names = [name, *other_names] - gdf = cudf.DataFrame({c: [0] for c in column_names}) - pdf = pd.DataFrame({c: [0] for c in column_names}) - - assert_eq(gdf, pdf) - - if contains is cudf.NA or name is cudf.NA: - expectation = contains is cudf.NA and name is cudf.NA - assert (contains in pdf) == expectation - assert (contains in gdf) == expectation - elif gdf.columns.dtype.kind == "f": - # In some cases, the columns are converted to an Index[float] based on - # the other column names. That casts name values from None to np.nan. - expectation = contains is np.nan and (name is None or name is np.nan) - assert (contains in pdf) == expectation - assert (contains in gdf) == expectation - else: - expectation = contains == name or ( - contains is np.nan and name is np.nan - ) - assert (contains in pdf) == expectation - assert (contains in gdf) == expectation - - assert (contains in pdf) == (contains in gdf) - - -def test_dataframe_series_dot(): - pser = pd.Series(range(2)) - gser = cudf.from_pandas(pser) - - expected = pser @ pser - actual = gser @ gser - - assert_eq(expected, actual) - - pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("ab")) - gdf = cudf.from_pandas(pdf) - - expected = pser @ pdf - actual = gser @ gdf - - assert_eq(expected, actual) - - assert_exceptions_equal( - lfunc=pdf.dot, - rfunc=gdf.dot, - lfunc_args_and_kwargs=([pser], {}), - rfunc_args_and_kwargs=([gser], {}), - ) - - assert_exceptions_equal( - lfunc=pdf.dot, - rfunc=gdf.dot, - lfunc_args_and_kwargs=([pdf], {}), - rfunc_args_and_kwargs=([gdf], {}), - ) - - pser = pd.Series(range(2), index=["a", "k"]) - gser = cudf.from_pandas(pser) - - pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("ab"), index=["a", "k"]) - gdf = cudf.from_pandas(pdf) - - expected = pser @ pdf - actual = gser @ gdf - - assert_eq(expected, actual) - - actual = gdf @ [2, 3] - expected = pdf @ [2, 3] - - assert_eq(expected, actual) - - actual = pser @ [12, 13] - expected = gser @ [12, 13] - - assert_eq(expected, actual) - - -def test_dataframe_reindex_keep_colname(): - gdf = cudf.DataFrame([1], columns=cudf.Index([1], name="foo")) - result = gdf.reindex(index=[0, 1]) - expected = cudf.DataFrame( - [1, None], columns=cudf.Index([1], name="foo"), index=[0, 1] - ) - assert_eq(result, expected) - - -def test_dataframe_duplicate_index_reindex(): - gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1]) - pdf = gdf.to_pandas() - - assert_exceptions_equal( - gdf.reindex, - pdf.reindex, - lfunc_args_and_kwargs=([10, 11, 12, 13], {}), - rfunc_args_and_kwargs=([10, 11, 12, 13], {}), - ) - - -@pytest.mark.parametrize( - "expected", - [ - pd.RangeIndex(1, 2, name="a"), - pd.Index([1], dtype=np.int8, name="a"), - pd.MultiIndex.from_arrays([[1]], names=["a"]), - ], -) -@pytest.mark.parametrize("binop", [lambda df: df == df, lambda df: df - 1]) -def test_dataframe_binop_preserves_column_metadata(expected, binop): - df = 
cudf.DataFrame([1], columns=expected) - result = binop(df).columns - pd.testing.assert_index_equal(result, expected, exact=True) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - [1], - decimal.Decimal("1.0"), - ], -) -def test_dataframe_to_pandas_arrow_type_nullable_raises(scalar): - pa_array = pa.array([scalar, None]) - df = cudf.DataFrame({"a": pa_array}) - with pytest.raises(ValueError): - df.to_pandas(nullable=True, arrow_type=True) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - [1], - decimal.Decimal("1.0"), - ], -) -def test_dataframe_to_pandas_arrow_type(scalar): - pa_array = pa.array([scalar, None]) - df = cudf.DataFrame({"a": pa_array}) - result = df.to_pandas(arrow_type=True) - expected = pd.DataFrame({"a": pd.arrays.ArrowExtensionArray(pa_array)}) - pd.testing.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dtype1", ["int16", "float32"]) -@pytest.mark.parametrize("dtype2", ["int16", "float32"]) -def test_dataframe_loc_int_float(dtype1, dtype2): - df = cudf.DataFrame( - {"a": [10, 11, 12, 13, 14]}, - index=cudf.Index([1, 2, 3, 4, 5], dtype=dtype1), - ) - pdf = df.to_pandas() - - gidx = cudf.Index([2, 3, 4], dtype=dtype2) - pidx = gidx.to_pandas() - - actual = df.loc[gidx] - expected = pdf.loc[pidx] - - assert_eq(actual, expected, check_index_type=True, check_dtype=True) - - -@pytest.mark.parametrize("names", [["abc", "def"], [1, 2], ["abc", 10]]) -def test_dataframe_multiindex_column_names(names): - arrays = [["A", "A", "B", "B"], ["one", "two", "one", "two"]] - tuples = list(zip(*arrays, strict=True)) - index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) - - pdf = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=index) - df = cudf.from_pandas(pdf) - - assert_eq(df, pdf) - assert_eq(df.columns.names, pdf.columns.names) - pdf.columns.names = names - df.columns.names = names - assert_eq(df, pdf) - assert_eq(df.columns.names, pdf.columns.names) - - -def test_roundtrip_dataframe_plc_table(na_data): - pdf = na_data - expect = cudf.DataFrame.from_pandas(pdf) - actual = cudf.DataFrame.from_pylibcudf(*expect.to_pylibcudf()) - assert_eq(expect, actual) - - -def test_dataframe_midx_columns_loc(): - idx_1 = ["Hi", "Lo"] - idx_2 = ["I", "II", "III"] - idx = cudf.MultiIndex.from_product([idx_1, idx_2]) - - data_rand = ( - np.random.default_rng(seed=0) - .uniform(0, 1, 3 * len(idx)) - .reshape(3, -1) - ) - df = cudf.DataFrame(data_rand, index=["A", "B", "C"], columns=idx) - pdf = df.to_pandas() - - assert_eq(df.shape, pdf.shape) - - expected = pdf.loc[["A", "B"]] - actual = df.loc[["A", "B"]] - - assert_eq(expected, actual) - assert_eq(df, pdf) - - -def test_rename_reset_label_dtype(): - data = {1: [2]} - col_mapping = {1: "a"} - result = cudf.DataFrame(data).rename(columns=col_mapping) - expected = pd.DataFrame(data).rename(columns=col_mapping) - assert_eq(result, expected) - - -def test_insert_reset_label_dtype(): - result = cudf.DataFrame({1: [2]}) - expected = pd.DataFrame({1: [2]}) - result.insert(1, "a", [2]) - expected.insert(1, "a", [2]) - assert_eq(result, expected) - - -def test_setitem_reset_label_dtype(): - result = cudf.DataFrame({1: [2]}) - expected = pd.DataFrame({1: [2]}) - result["a"] = [2] - expected["a"] = [2] - assert_eq(result, expected) - - -def test_dataframe_midx_cols_getitem(): - df = cudf.DataFrame( - { - "a": ["a", "b", "c"], - "b": ["b", "", ""], - 
"c": [10, 11, 12], - } - ) - df.columns = df.set_index(["a", "b"]).index - pdf = df.to_pandas() - - expected = df["c"] - actual = pdf["c"] - assert_eq(expected, actual) - df = cudf.DataFrame( - [[1, 0], [0, 1]], - columns=[ - ["foo", "foo"], - ["location", "location"], - ["x", "y"], - ], - ) - df = df.assign(bools=cudf.Series([True, False], dtype="bool")) - assert_eq(df["bools"], df.to_pandas()["bools"]) From fe33fdbe2f454b53ebb2ac2d9b8d05a97d6f10d6 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 3 Sep 2025 20:06:12 -0500 Subject: [PATCH 252/366] use 'nvidia-ml-py' package for 'pynvml' module (#19862) Contributes to https://github.com/rapidsai/build-infra/issues/293 The `pynvml` *package* (https://github.com/gpuopenanalytics/pynvml) provides 2 things: * the `pynvml` **Python module**, transitively via a dependency on the `nvidia-ml-py` package * the `pynvml_utils` Python module This project doesn't need the `pynvml_utils` Python module, so this PR proposes dropping the dependency on the `pynvml` package in favor of `nvidia-ml-py`. ## Notes for Reviewers # Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19862 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-130_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-130_arch-x86_64.yaml | 2 +- conda/recipes/dask-cudf/recipe.yaml | 4 +++- dependencies.yaml | 4 +++- python/dask_cudf/pyproject.toml | 2 +- 7 files changed, 11 insertions(+), 7 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index a4a81ebaa66..03505a99545 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -60,6 +60,7 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py +- nvidia-ml-py>=12.560.30 - nvtx>=0.2.1 - openpyxl - packaging @@ -70,7 +71,6 @@ dependencies: - pre-commit - pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 -- pynvml>=12.0.0,<13.0.0a0 - pytest - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 0fa1e5f3ddc..63ec249d456 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -61,6 +61,7 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py +- nvidia-ml-py>=12.560.30 - nvtx>=0.2.1 - openpyxl - packaging @@ -71,7 +72,6 @@ dependencies: - pre-commit - pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 -- pynvml>=12.0.0,<13.0.0a0 - pytest - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml index c64e9000a08..454a4053fdf 100644 --- a/conda/environments/all_cuda-130_arch-aarch64.yaml +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -60,6 +60,7 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py +- nvidia-ml-py>=12.560.30 - nvtx>=0.2.1 - openpyxl - packaging @@ -70,7 +71,6 @@ dependencies: - pre-commit - pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 -- pynvml>=12.0.0,<13.0.0a0 - pytest - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml index 7ae24161e60..580fc9e4b1b 100644 --- 
a/conda/environments/all_cuda-130_arch-x86_64.yaml +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -61,6 +61,7 @@ dependencies: - numpy>=1.23,<3.0a0 - numpydoc - nvidia-ml-py +- nvidia-ml-py>=12.560.30 - nvtx>=0.2.1 - openpyxl - packaging @@ -71,7 +72,6 @@ dependencies: - pre-commit - pyarrow>=15.0.0 - pydata-sphinx-theme>=0.15.4 -- pynvml>=12.0.0,<13.0.0a0 - pytest - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/recipes/dask-cudf/recipe.yaml b/conda/recipes/dask-cudf/recipe.yaml index fc9e20f4192..077195dfb97 100644 --- a/conda/recipes/dask-cudf/recipe.yaml +++ b/conda/recipes/dask-cudf/recipe.yaml @@ -34,7 +34,9 @@ requirements: run: - python - cudf =${{ version }} - - pynvml >=12.0.0,<13.0.0a0 + # 'nvidia-ml-py' provides the 'pynvml' module, since v12.560.30 + # ref: https://github.com/conda-forge/nvidia-ml-py-feedstock/pull/24 + - nvidia-ml-py>=12.560.30 - rapids-dask-dependency =${{ minor_version }} - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} diff --git a/dependencies.yaml b/dependencies.yaml index 67e627aa718..fdb33201aa3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -734,7 +734,9 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - pynvml>=12.0.0,<13.0.0a0 + # 'nvidia-ml-py' provides the 'pynvml' module, since v12.560.30 + # ref: https://github.com/conda-forge/nvidia-ml-py-feedstock/pull/24 + - nvidia-ml-py>=12.560.30 - rapids-dask-dependency==25.10.*,>=0.0.0a0 run_custreamz: common: diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 3b5f7a1d2e7..ef0a7d9376f 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -23,8 +23,8 @@ dependencies = [ "cupy-cuda13x>=13.6.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", + "nvidia-ml-py>=12.560.30", "pandas>=2.0,<2.4.0dev0", - "pynvml>=12.0.0,<13.0.0a0", "rapids-dask-dependency==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
 classifiers = [

From 86509db03cb17e95c0011c910db16e3bf29e6d87 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 3 Sep 2025 18:10:55 -0700
Subject: [PATCH 253/366] Move test_categorical/dask/serialize.py to new cuDF classic test directory structure (#19877)

Towards https://github.com/rapidsai/cudf/issues/9999
Towards https://github.com/rapidsai/cudf/issues/15723

Also consolidates `test_version.py` into `test_api.py`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19877
---
 python/cudf/cudf/tests/dask/__init__.py       |   0
 .../test_dataframe_utils.py}                  |   0
 .../cudf/tests/{ => dask}/test_serialize.py   |  35 +-
 .../tests/dataframe/methods/test_assign.py    |  47 +
 .../cudf/tests/dataframe/methods/test_copy.py |  24 +-
 .../tests/dataframe/methods/test_set_index.py |  15 +
 .../dataframe/methods/test_sort_index.py      |  14 +
 .../tests/dtypes/test_categoricaldtype.py     |  29 +
 python/cudf/cudf/tests/groupby/test_fillna.py |  21 +
 .../categoricalindex/methods/__init__.py      |   0
 .../methods/test_add_categories.py            |  15 +
 .../methods/test_as_ordered.py                |  18 +
 .../methods/test_remove_categories.py         |  15 +
 .../methods/test_reorder_categories.py        |  17 +
 .../methods/test_set_categories.py            |  17 +
 .../categoricalindex/test_attributes.py       |  23 +-
 .../tests/indexes/index/test_constructor.py   |  10 +
 .../cudf/tests/series/accessors/test_cat.py   | 349 +++++++
 .../tests/series/indexing/test_getitem.py     |  27 +
 .../tests/series/indexing/test_setitem.py     |  12 +
 .../cudf/tests/series/methods/test_astype.py  |  73 ++
 .../cudf/tests/series/methods/test_nunique.py |  21 +-
 .../tests/series/methods/test_reductions.py   |  19 +
 .../cudf/tests/series/methods/test_unique.py  |  23 +-
 python/cudf/cudf/tests/series/test_binops.py  |  26 +
 .../cudf/tests/series/test_constructors.py    |  78 ++
 python/cudf/cudf/tests/test_api.py            |   9 +
 python/cudf/cudf/tests/test_categorical.py    | 895 ------------------
 python/cudf/cudf/tests/test_version.py        |  12 -
 29 files changed, 916 insertions(+), 928 deletions(-)
 create mode 100644 python/cudf/cudf/tests/dask/__init__.py
 rename python/cudf/cudf/tests/{test_dask.py => dask/test_dataframe_utils.py} (100%)
 rename python/cudf/cudf/tests/{ => dask}/test_serialize.py (94%)
 create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/methods/__init__.py
 create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/methods/test_add_categories.py
 create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/methods/test_as_ordered.py
 create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/methods/test_remove_categories.py
 create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/methods/test_reorder_categories.py
 create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/methods/test_set_categories.py
 create mode 100644 python/cudf/cudf/tests/series/accessors/test_cat.py
 delete mode 100644 python/cudf/cudf/tests/test_categorical.py
 delete mode 100644 python/cudf/cudf/tests/test_version.py

diff --git a/python/cudf/cudf/tests/dask/__init__.py b/python/cudf/cudf/tests/dask/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/tests/test_dask.py b/python/cudf/cudf/tests/dask/test_dataframe_utils.py
similarity index 100%
rename from python/cudf/cudf/tests/test_dask.py
rename to python/cudf/cudf/tests/dask/test_dataframe_utils.py
diff --git a/python/cudf/cudf/tests/test_serialize.py
b/python/cudf/cudf/tests/dask/test_serialize.py similarity index 94% rename from python/cudf/cudf/tests/test_serialize.py rename to python/cudf/cudf/tests/dask/test_serialize.py index 59609158728..132ac6d5006 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/dask/test_serialize.py @@ -6,12 +6,12 @@ import msgpack import numpy as np import pandas as pd +import pyarrow as pa import pytest from packaging import version import cudf -from cudf.core.column import as_column -from cudf.testing import _utils as utils, assert_eq +from cudf.testing import assert_eq @pytest.mark.parametrize( @@ -230,14 +230,7 @@ def test_serialize_multi_index(): def test_serialize_masked_series(): - nelem = 50 - rng = np.random.default_rng(seed=0) - data = rng.random(nelem) - mask = utils.random_bitmask(nelem) - bitmask = utils.expand_bits_to_bytes(mask)[:nelem] - null_count = utils.count_zero(bitmask) - assert null_count >= 0 - sr = cudf.Series._from_column(as_column(data).set_mask(mask)) + sr = cudf.Series(pa.array([1, None, 2])) outsr = cudf.Series.deserialize(*sr.serialize()) assert_eq(sr, outsr) @@ -470,3 +463,25 @@ def test_serialize_decimal_columns(data): ) recreated = df.__class__.deserialize(*df.serialize()) assert_eq(recreated, df) + + +@pytest.mark.parametrize( + "data", + [ + {"a": pd.Series(["a", "b", "c", "a", "c", "b"]).astype("category")}, + { + "a": pd.Series(["a", "a", "b", "b"]).astype("category"), + "b": pd.Series(["b", "b", "c", "c"]).astype("category"), + "c": pd.Series(["c", "c", "a", "a"]).astype("category"), + }, + { + "a": pd.Series(["a", None, "b", "b"]).astype("category"), + "b": pd.Series(["b", "b", None, "c"]).astype("category"), + "c": pd.Series(["c", "c", "a", None]).astype("category"), + }, + ], +) +def test_serialize_categorical_columns(data): + df = cudf.DataFrame(data) + recreated = df.__class__.deserialize(*df.serialize()) + assert_eq(recreated, df) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_assign.py b/python/cudf/cudf/tests/dataframe/methods/test_assign.py index 9cbd740a4ef..02159a31e63 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_assign.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_assign.py @@ -34,3 +34,50 @@ def test_assign_callable(mapping): expect = df.assign(**mapping) actual = cdf.assign(**mapping) assert_eq(expect, actual) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4], + ["a", "1", "2", "1", "a"], + pd.Series(["a", "1", "22", "1", "aa"]), + pd.Series(["a", "1", "22", "1", "aa"], dtype="category"), + pd.Series([1, 2, 3, 4], dtype="int64"), + pd.Series([1, 2.3, 3, 4], dtype="float"), + [None, 1, None, 2, None], + ["a"], + ], +) +@pytest.mark.parametrize( + "categories", + [ + ["aa", "bb", "cc"], + [2, 4, 10, 100], + ["a", "b", "c"], + ["22", "b", "c"], + ["a"], + ], +) +def test_categorical_assignment(data, categories): + cat_dtype = pd.CategoricalDtype(categories) + pd_df = pd.DataFrame({"a": np.ones(len(data))}) + cd_df = cudf.from_pandas(pd_df) + + pd_cat_series = pd.Series(data, dtype=cat_dtype) + # assign categorical series + pd_df.assign(cat_col=pd_cat_series) + cd_df.assign(cat_col=pd_cat_series) + assert_eq(pd_df, cd_df) + + # assign categorical array + # needed for dask_cudf support for including file name + # as a categorical column + # see issue: https://github.com/rapidsai/cudf/issues/2269 + pd_df = pd.DataFrame({"a": np.ones(len(data))}) + cd_df = cudf.from_pandas(pd_df) + + pd_categorical = pd.Categorical(data, dtype=cat_dtype) + pd_df.assign(cat_col=pd_categorical) + 
cd_df.assign(cat_col=pd_categorical) + assert_eq(pd_df, cd_df) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_copy.py b/python/cudf/cudf/tests/dataframe/methods/test_copy.py index 7910451db57..787e2a7f0af 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_copy.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_copy.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf.core.dataframe import DataFrame +import cudf from cudf.testing import assert_eq, assert_neq """ @@ -43,7 +43,7 @@ def test_dataframe_deep_copy(copy_fn): pdf = pd.DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"] ) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) copy_pdf = copy_fn(pdf) copy_gdf = copy_fn(gdf) copy_pdf["b"] = [0, 0, 0] @@ -67,7 +67,7 @@ def test_cudf_dataframe_copy(copy_fn, ncols, all_supported_types_as_str): for i in range(ncols) } ) - df = DataFrame.from_pandas(pdf) + df = cudf.DataFrame.from_pandas(pdf) copy_df = copy_fn(df) assert_eq(df, copy_df) @@ -85,7 +85,7 @@ def test_cudf_dataframe_copy_then_insert( for i in range(ncols) } ) - df = DataFrame.from_pandas(pdf) + df = cudf.DataFrame.from_pandas(pdf) copy_df = copy_fn(df) copy_pdf = copy_fn(pdf) copy_df["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype( @@ -102,7 +102,7 @@ def test_deep_copy_write_in_place(): pdf = pd.DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"] ) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) cdf = gdf.copy(deep=True) sr = gdf["b"] @@ -117,7 +117,7 @@ def test_shallow_copy_write_in_place(): pdf = pd.DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"] ) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) cdf = gdf.copy(deep=False) sr = gdf["a"] @@ -133,10 +133,20 @@ def test_dataframe_copy_shallow(): pdf = pd.DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"] ) - gdf = DataFrame.from_pandas(pdf) + gdf = cudf.DataFrame.from_pandas(pdf) copy_pdf = pdf.copy(deep=False) copy_gdf = gdf.copy(deep=False) copy_pdf["b"] = [0, 0, 0] copy_gdf["b"] = [0, 0, 0] assert_eq(pdf["b"], copy_pdf["b"]) assert_eq(gdf["b"], copy_gdf["b"]) + + +def test_categorical_dataframe_slice_copy(): + pdf = pd.DataFrame({"g": pd.Series(["a", "b", "z"], dtype="category")}) + gdf = cudf.from_pandas(pdf) + + exp = pdf[1:].copy() + gdf = gdf[1:].copy() + + assert_eq(exp, gdf) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_set_index.py b/python/cudf/cudf/tests/dataframe/methods/test_set_index.py index e1a99df4119..b9727941e0f 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_set_index.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_set_index.py @@ -103,3 +103,18 @@ def test_set_index_multi(drop): df.set_index(["b", "d", "e"], drop=drop), gdf.set_index(["b", "d", "e"], drop=drop), ) + + +def test_df_cat_set_index(): + df = cudf.DataFrame( + { + "a": pd.Categorical(list("aababcabbc"), categories=list("abc")), + "b": np.arange(10), + } + ) + got = df.set_index("a") + + pddf = df.to_pandas() + expect = pddf.set_index("a") + + assert_eq(got, expect) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py b/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py index 905c4622ade..8e58755eff9 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py @@ -145,3 +145,17 @@ def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names(): gdf = 
cudf.DataFrame([[1, 2, 3]], columns=["b", "a", "c"]) result = gdf.sort_index(axis=1, ignore_index=True) assert result._data.names == tuple(result._data.keys()) + + +def test_df_cat_sort_index(): + df = cudf.DataFrame( + { + "a": pd.Categorical(list("aababcabbc"), categories=list("abc")), + "b": np.arange(10), + } + ) + + got = df.set_index("a").sort_index() + expect = df.to_pandas().set_index("a").sort_index() + + assert_eq(got, expect) diff --git a/python/cudf/cudf/tests/dtypes/test_categoricaldtype.py b/python/cudf/cudf/tests/dtypes/test_categoricaldtype.py index 93311fd3c02..3205f54dea8 100644 --- a/python/cudf/cudf/tests/dtypes/test_categoricaldtype.py +++ b/python/cudf/cudf/tests/dtypes/test_categoricaldtype.py @@ -4,6 +4,7 @@ import pytest import cudf +from cudf.testing import assert_eq @pytest.mark.parametrize( @@ -34,3 +35,31 @@ def test_cdf_to_pandas(data, categorical_ordered): categories=data, ordered=categorical_ordered ).to_pandas() ) + + +@pytest.mark.parametrize( + "categories", + [ + [], + [1, 2, 3], + pd.Series(["a", "c", "b"], dtype="category"), + pd.Series([1, 2, 3, 4, -100], dtype="category"), + ], +) +def test_categorical_dtype(categories, categorical_ordered): + expected = pd.CategoricalDtype( + categories=categories, ordered=categorical_ordered + ) + got = cudf.CategoricalDtype( + categories=categories, ordered=categorical_ordered + ) + assert_eq(expected, got) + + expected = pd.CategoricalDtype(categories=categories) + got = cudf.CategoricalDtype(categories=categories) + assert_eq(expected, got) + + +def test_categorical_dtype_ordered_not_settable(): + with pytest.raises(AttributeError): + cudf.CategoricalDtype().ordered = False diff --git a/python/cudf/cudf/tests/groupby/test_fillna.py b/python/cudf/cudf/tests/groupby/test_fillna.py index c8f45818357..7a3b85b66be 100644 --- a/python/cudf/cudf/tests/groupby/test_fillna.py +++ b/python/cudf/cudf/tests/groupby/test_fillna.py @@ -10,6 +10,7 @@ PANDAS_VERSION, ) from cudf.testing import assert_groupby_results_equal +from cudf.testing._utils import assert_exceptions_equal from cudf.testing.dataset_generator import rand_dataframe @@ -189,3 +190,23 @@ def test_groupby_fillna_method(method): assert_groupby_results_equal( expect[value_cols], got[value_cols], sort=False ) + + +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) +def test_cat_groupby_fillna(): + ps = pd.Series(["a", "b", "c"], dtype="category") + gs = cudf.from_pandas(ps) + + with pytest.warns(FutureWarning): + pg = ps.groupby(ps) + gg = gs.groupby(gs) + + assert_exceptions_equal( + lfunc=pg.fillna, + rfunc=gg.fillna, + lfunc_args_and_kwargs=(("d",), {}), + rfunc_args_and_kwargs=(("d",), {}), + ) diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/methods/__init__.py b/python/cudf/cudf/tests/indexes/categoricalindex/methods/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_add_categories.py b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_add_categories.py new file mode 100644 index 00000000000..09ae5580001 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_add_categories.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_index_add_categories(): + pd_ci = pd.CategoricalIndex([1, 2, 3]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.add_categories([4]) + result = cudf_ci.add_categories([4]) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_as_ordered.py b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_as_ordered.py new file mode 100644 index 00000000000..1a327555913 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_as_ordered.py @@ -0,0 +1,18 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"]) +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_as_ordered(method, ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = getattr(pd_ci, method)() + result = getattr(cudf_ci, method)() + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_remove_categories.py b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_remove_categories.py new file mode 100644 index 00000000000..7457368cebf --- /dev/null +++ b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_remove_categories.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_index_remove_categories(): + pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.remove_categories([4]) + result = cudf_ci.remove_categories([4]) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_reorder_categories.py b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_reorder_categories.py new file mode 100644 index 00000000000..b1c31be1232 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_reorder_categories.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_reorder_categories(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 3, 2, 4]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) + result = cudf_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_set_categories.py b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_set_categories.py new file mode 100644 index 00000000000..4616e666e71 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_set_categories.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_set_categories(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered) + result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/test_attributes.py b/python/cudf/cudf/tests/indexes/categoricalindex/test_attributes.py index cfdf26877b6..d563bc332f6 100644 --- a/python/cudf/cudf/tests/indexes/categoricalindex/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/categoricalindex/test_attributes.py @@ -3,7 +3,8 @@ import pandas as pd import pytest -from cudf.core.index import CategoricalIndex +import cudf +from cudf.testing import assert_eq @pytest.mark.parametrize( @@ -12,9 +13,27 @@ def test_categorical_index_is_unique_monotonic(testlist): # Assuming unordered categorical data cannot be "monotonic" raw_cat = pd.Categorical(testlist, ordered=True) - index = CategoricalIndex(raw_cat) + index = cudf.CategoricalIndex(raw_cat) index_pd = pd.CategoricalIndex(raw_cat) assert index.is_unique == index_pd.is_unique assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing + + +@pytest.mark.parametrize( + "data", [["$ 1", "$ 2", "hello"], ["($) 1", "( 2", "hello", "^1$"]] +) +@pytest.mark.parametrize("value", ["$ 1", "hello", "$", "^1$"]) +def test_categorical_string_index_contains(data, value): + idx = cudf.CategoricalIndex(data) + pidx = idx.to_pandas() + + assert_eq(value in idx, value in pidx) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_ordered(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) + cudf_ci = cudf.from_pandas(pd_ci) + assert pd_ci.ordered == cudf_ci.ordered diff --git a/python/cudf/cudf/tests/indexes/index/test_constructor.py b/python/cudf/cudf/tests/indexes/index/test_constructor.py index 6c8abdec247..a46c4fec49e 100644 --- a/python/cudf/cudf/tests/indexes/index/test_constructor.py +++ b/python/cudf/cudf/tests/indexes/index/test_constructor.py @@ -47,3 +47,13 @@ def test_infer_timedelta_index(data, timedelta_types_as_str): pdi = gdi.to_pandas() assert_eq(pdi, gdi) + + +def test_categorical_index_with_dtype(): + dtype = cudf.CategoricalDtype(categories=["a", "z", "c"]) + gi = cudf.Index(["z", "c", "a"], dtype=dtype) + pi = pd.Index(["z", "c", "a"], dtype=dtype.to_pandas()) + + assert_eq(gi, pi) + assert_eq(gi.dtype, pi.dtype) + assert_eq(gi.dtype.categories, pi.dtype.categories) diff --git a/python/cudf/cudf/tests/series/accessors/test_cat.py b/python/cudf/cudf/tests/series/accessors/test_cat.py new file mode 100644 index 00000000000..50f01173e5a --- /dev/null +++ b/python/cudf/cudf/tests/series/accessors/test_cat.py @@ -0,0 +1,349 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+
+from textwrap import dedent
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
+
+
+def test_categorical_basic():
+    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
+    cudf_cat = cudf.Index(cat)
+    assert_eq(cat.codes, cudf_cat.codes.to_numpy())
+
+    pdsr = pd.Series(cat, index=["p", "q", "r", "s", "t"])
+    sr = cudf.Series(cat, index=["p", "q", "r", "s", "t"])
+    assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False)
+
+    # Test attributes
+    assert_eq(pdsr.cat.categories, sr.cat.categories)
+    assert pdsr.cat.ordered == sr.cat.ordered
+
+    np.testing.assert_array_equal(
+        pdsr.cat.codes.values, sr.cat.codes.to_numpy()
+    )
+
+    assert str(sr) == str(pdsr)
+
+
+def test_categorical_integer():
+    cat = pd.Categorical(["a", "_", "_", "c", "a"], categories=["a", "b", "c"])
+    pdsr = pd.Series(cat)
+    sr = cudf.Series(cat)
+    np.testing.assert_array_equal(
+        cat.codes, sr.cat.codes.astype(cat.codes.dtype).fillna(-1).to_numpy()
+    )
+    assert sr.null_count == 2
+
+    np.testing.assert_array_equal(
+        pdsr.cat.codes.values,
+        sr.cat.codes.astype(pdsr.cat.codes.dtype).fillna(-1).to_numpy(),
+    )
+
+    expect_str = dedent(
+        """\
+        0       a
+        1    <NA>
+        2    <NA>
+        3       c
+        4       a
+        dtype: category
+        Categories (3, object): ['a', 'b', 'c']"""
+    )
+    assert str(sr) == expect_str
+
+
+def test_categorical_element_indexing():
+    """
+    Element indexing to a cat column must give the underlying object
+    not the numerical index.
+    """
+    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
+    pdsr = pd.Series(cat)
+    sr = cudf.Series(cat)
+    assert_eq(pdsr, sr)
+    assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False)
+
+
+def test_categorical_empty():
+    cat = pd.Categorical([])
+    pdsr = pd.Series(cat)
+    sr = cudf.Series(cat)
+    np.testing.assert_array_equal(cat.codes, sr.cat.codes.to_numpy())
+
+    # Test attributes
+    assert_eq(pdsr.cat.categories, sr.cat.categories)
+    assert pdsr.cat.ordered == sr.cat.ordered
+
+    np.testing.assert_array_equal(
+        pdsr.cat.codes.values, sr.cat.codes.to_numpy()
+    )
+
+
+def test_categorical_set_categories():
+    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
+    psr = pd.Series(cat)
+    sr = cudf.Series.from_pandas(cat)
+
+    # adding category
+    expect = psr.cat.set_categories(["a", "b", "c", "d"])
+    got = sr.cat.set_categories(["a", "b", "c", "d"])
+    assert_eq(expect, got)
+
+    # removing category
+    expect = psr.cat.set_categories(["a", "b"])
+    got = sr.cat.set_categories(["a", "b"])
+    assert_eq(expect, got)
+
+
+def test_categorical_set_categories_preserves_order():
+    series = pd.Series([1, 0, 0, 0, 2]).astype("category")
+    # reassigning categories should preserve element ordering
+    assert_eq(
+        series.cat.set_categories([1, 2]),
+        cudf.Series(series).cat.set_categories([1, 2]),
+    )
+
+
+def test_categorical_as_ordered():
+    categories = list("abc")
+    codes = [0, 0, 1, 0, 1, 2, 0, 1, 1, 2]
+    pd_str_cat = pd.Categorical.from_codes(codes, categories=categories)
+    pd_sr = pd.Series(pd_str_cat.set_ordered(False))
+    cd_sr = cudf.Series(pd_str_cat.set_ordered(False))
+
+    assert cd_sr.cat.ordered is False
+    assert cd_sr.cat.ordered == pd_sr.cat.ordered
+
+    pd_sr_1 = pd_sr.cat.as_ordered()
+    cd_sr_1 = cd_sr.cat.as_ordered()
+
+    assert cd_sr_1.cat.ordered is True
+    assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered
+    assert str(cd_sr_1) == str(pd_sr_1)
+
+
+def test_categorical_as_unordered():
+    categories = list("abc")
+    codes = [0, 0, 1, 0, 1,
2, 0, 1, 1, 2] + pd_str_cat = pd.Categorical.from_codes(codes, categories=categories) + pd_sr = pd.Series(pd_str_cat.set_ordered(True)) + cd_sr = cudf.Series(pd_str_cat.set_ordered(True)) + + assert cd_sr.cat.ordered is True + assert cd_sr.cat.ordered == pd_sr.cat.ordered + + pd_sr_1 = pd_sr.cat.as_unordered() + cd_sr_1 = cd_sr.cat.as_unordered() + + assert cd_sr_1.cat.ordered is False + assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered + assert str(cd_sr_1) == str(pd_sr_1) + + +@pytest.mark.parametrize("from_ordered", [True, False]) +@pytest.mark.parametrize("to_ordered", [True, False]) +def test_categorical_reorder_categories(from_ordered, to_ordered): + categories = list("abc") + codes = [0, 0, 1, 0, 1, 2, 0, 1, 1, 2] + pd_str_cat = pd.Categorical.from_codes(codes, categories=categories) + pd_sr = pd.Series(pd_str_cat.set_ordered(from_ordered)) + cd_sr = cudf.Series(pd_str_cat.set_ordered(from_ordered)) + + assert_eq(pd_sr, cd_sr) + + assert str(pd_sr) == str(cd_sr) + + pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), ordered=to_ordered) + cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), ordered=to_ordered) + + assert_eq(pd_sr_1, cd_sr_1) + + assert str(cd_sr_1) == str(pd_sr_1) + + +def test_categorical_add_categories(): + categories = list("abc") + codes = [0, 0, 1, 0, 1, 2, 0, 1, 1, 2] + pd_str_cat = pd.Categorical.from_codes(codes, categories=categories) + pd_sr = pd.Series(pd_str_cat) + cd_sr = cudf.Series(pd_str_cat) + + assert_eq(pd_sr, cd_sr) + + assert str(pd_sr) == str(cd_sr) + + pd_sr_1 = pd_sr.cat.add_categories(["d"]) + cd_sr_1 = cd_sr.cat.add_categories(["d"]) + + assert "d" in pd_sr_1.cat.categories.to_list() + assert "d" in cd_sr_1.cat.categories.to_pandas().to_list() + + assert_eq(pd_sr_1, cd_sr_1) + + +def test_categorical_remove_categories(): + categories = list("abc") + codes = [0, 0, 1, 0, 1, 2, 0, 1, 1, 2] + pd_str_cat = pd.Categorical.from_codes(codes, categories=categories) + pd_sr = pd.Series(pd_str_cat) + cd_sr = cudf.Series(pd_str_cat) + + assert_eq(pd_sr, cd_sr) + + assert str(pd_sr) == str(cd_sr) + + pd_sr_1 = pd_sr.cat.remove_categories(["a"]) + cd_sr_1 = cd_sr.cat.remove_categories(["a"]) + + assert "a" not in pd_sr_1.cat.categories.to_list() + assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list() + + assert_eq(pd_sr_1, cd_sr_1) + + # test using ordered operators + assert_exceptions_equal( + lfunc=cd_sr.to_pandas().cat.remove_categories, + rfunc=cd_sr.cat.remove_categories, + lfunc_args_and_kwargs=([["a", "d"]], {}), + rfunc_args_and_kwargs=([["a", "d"]], {}), + ) + + +@pytest.mark.filterwarnings("ignore:Can't safely cast column:UserWarning") +@pytest.mark.parametrize( + "data", + [ + pd.Series([1, 2, 3, 89]), + pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), + pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), + pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), + pd.Series([1, 2, 3, 89], dtype="float64"), + pd.Series([1, 2.5, 3.001, 89], dtype="float64"), + pd.Series([None, None, None]), + pd.Series([], dtype="float64"), + ], +) +@pytest.mark.parametrize( + "new_categories", + [ + ["aa", "bb", "cc"], + [2, 4, 10, 100], + ["a", "b", "c"], + [], + pd.Series(["a", "b", "c"]), + pd.Series(["a", "b", "c"], dtype="category"), + pd.Series([-100, 10, 11, 0, 1, 2], dtype="category"), + ], +) +def test_categorical_set_categories_categoricals(data, new_categories): + pd_data = data.astype("category") + gd_data = cudf.from_pandas(pd_data) + + expected = pd_data.cat.set_categories(new_categories=new_categories) + actual = 
gd_data.cat.set_categories(new_categories=new_categories)
+
+    assert_eq(expected, actual)
+
+    expected = pd_data.cat.set_categories(
+        new_categories=pd.Series(new_categories, dtype="category")
+    )
+    actual = gd_data.cat.set_categories(
+        new_categories=cudf.Series(new_categories, dtype="category")
+    )
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.filterwarnings("ignore:Can't safely cast column:UserWarning")
+@pytest.mark.parametrize(
+    "data,add",
+    [
+        ([1, 2, 3], [100, 11, 12]),
+        ([1, 2, 3], [0.01, 9.7, 15.0]),
+        ([0.0, 6.7, 10.0], [100, 11, 12]),
+        ([0.0, 6.7, 10.0], [0.01, 9.7, 15.0]),
+        (["a", "bd", "ef"], ["asdfsdf", "bddf", "eff"]),
+        ([1, 2, 3], []),
+        ([0.0, 6.7, 10.0], []),
+        (["a", "bd", "ef"], []),
+    ],
+)
+def test_add_categories(data, add):
+    pds = pd.Series(data, dtype="category")
+    gds = cudf.Series(data, dtype="category")
+
+    expected = pds.cat.add_categories(add)
+    actual = gds.cat.add_categories(add)
+
+    assert_eq(
+        expected.cat.codes, actual.cat.codes.astype(expected.cat.codes.dtype)
+    )
+
+    # Need to type-cast pandas object to str due to mixed-type
+    # support in "object"
+    assert_eq(
+        expected.cat.categories.astype("str")
+        if (expected.cat.categories.dtype == "object")
+        else expected.cat.categories,
+        actual.cat.categories,
+    )
+
+
+@pytest.mark.parametrize(
+    "data,add",
+    [
+        ([1, 2, 3], [1, 3, 11]),
+        ([0.0, 6.7, 10.0], [1, 2, 0.0]),
+        (["a", "bd", "ef"], ["a", "bd", "a"]),
+    ],
+)
+def test_add_categories_error(data, add):
+    pds = pd.Series(data, dtype="category")
+    gds = cudf.Series(data, dtype="category")
+
+    assert_exceptions_equal(
+        pds.cat.add_categories,
+        gds.cat.add_categories,
+        ([add],),
+        ([add],),
+    )
+
+
+def test_add_categories_mixed_error():
+    gds = cudf.Series(["a", "bd", "ef"], dtype="category")
+
+    with pytest.raises(TypeError):
+        gds.cat.add_categories([1, 2, 3])
+
+    gds = cudf.Series([1, 2, 3], dtype="category")
+
+    with pytest.raises(TypeError):
+        gds.cat.add_categories(["a", "bd", "ef"])
+
+
+def test_categorical_allow_nan():
+    gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False)
+    gs = gs.astype("category")
+    expected_codes = cudf.Series([0, 1, 3, 2, 3, None], dtype="uint8")
+    assert_eq(expected_codes, gs.cat.codes)
+
+    expected_categories = cudf.Index([1.0, 2.0, 10.0, np.nan], dtype="float64")
+    assert_eq(expected_categories, gs.cat.categories)
+
+    actual_ps = gs.to_pandas()
+    expected_ps = pd.Series(
+        [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category"
+    )
+    assert_eq(actual_ps, expected_ps)
+
+
+def test_cat_iterate_error():
+    s = cudf.Series([1, 2, 3], dtype="category")
+    with pytest.raises(TypeError):
+        iter(s.cat)
diff --git a/python/cudf/cudf/tests/series/indexing/test_getitem.py b/python/cudf/cudf/tests/series/indexing/test_getitem.py
index 37e0c10618f..2909980bb0e 100644
--- a/python/cudf/cudf/tests/series/indexing/test_getitem.py
+++ b/python/cudf/cudf/tests/series/indexing/test_getitem.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.
 
+import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
@@ -238,3 +239,29 @@ def test_string_slice_with_mask():
 
     assert_eq(actual._column.null_count, expected._column.null_count)
     assert_eq(actual, expected)
+
+
+def test_categorical_masking():
+    """
+    Test common operation for getting all rows that match a certain
+    category.
+ """ + cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) + pdsr = pd.Series(cat) + sr = cudf.Series(cat) + + # check scalar comparison + expect_matches = pdsr == "a" + got_matches = sr == "a" + + np.testing.assert_array_equal( + expect_matches.values, got_matches.to_numpy() + ) + + # mask series + expect_masked = pdsr[expect_matches] + got_masked = sr[got_matches] + + assert len(expect_masked) == len(got_masked) + assert got_masked.null_count == 0 + assert_eq(got_masked, expect_masked) diff --git a/python/cudf/cudf/tests/series/indexing/test_setitem.py b/python/cudf/cudf/tests/series/indexing/test_setitem.py index 2468477bdb7..5024adbe73f 100644 --- a/python/cudf/cudf/tests/series/indexing/test_setitem.py +++ b/python/cudf/cudf/tests/series/indexing/test_setitem.py @@ -850,3 +850,15 @@ def test_series_setitem_decimal(data, dtype, item, to, expect): expect = cudf.Series([decimal.Decimal(x) for x in expect], dtype=dtype) data[to] = item assert_eq(data, expect) + + +def test_categorical_setitem_with_nan(): + gs = cudf.Series( + [1, 2, np.nan, 10, np.nan, None], nan_as_null=False + ).astype("category") + gs[[1, 3]] = np.nan + + expected_series = cudf.Series( + [1, np.nan, np.nan, np.nan, np.nan, None], nan_as_null=False + ).astype(gs.dtype) + assert_eq(gs, expected_series) diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py b/python/cudf/cudf/tests/series/methods/test_astype.py index 8d51d6226bc..daf5388e272 100644 --- a/python/cudf/cudf/tests/series/methods/test_astype.py +++ b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -1246,3 +1246,76 @@ def test_typecast_from_decimal(precision, scale, signed_integer_types_as_str): assert_eq(got, expected) assert_eq(got.dtype, expected.dtype) + + +@pytest.mark.parametrize( + "data", + [ + pd.Series([1, 2, 3, 89]), + pd.Series([1, 2, 3, 89, 3, 1, 89], dtype="category"), + pd.Series(["1", "2", "3", "4", "5"], dtype="category"), + pd.Series(["1.0", "2.5", "3.001", "9"], dtype="category"), + pd.Series(["1", "2", "3", None, "4", "5"], dtype="category"), + pd.Series(["1.0", "2.5", "3.001", None, "9"], dtype="category"), + pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), + pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), + pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), + pd.Series([1, 2, 3, 89], dtype="float64"), + pd.Series([1, 2.5, 3.001, 89], dtype="float64"), + pd.Series([None, None, None]), + pd.Series([], dtype="float64"), + ], +) +@pytest.mark.parametrize( + "categories", + [ + ["aa", "bb", "cc"], + [2, 4, 10, 100], + ["aa", "bb", "c"], + ["a", "bb", "c"], + ["a", "b", "c"], + ["1", "2", "3", "4"], + ["1.0", "2.5", "3.001", "9"], + [], + ], +) +def test_categorical_typecast(data, categories): + pd_data = data + gd_data = cudf.from_pandas(data) + cat_type = pd.CategoricalDtype(categories) + + assert_eq(pd_data.astype(cat_type), gd_data.astype(cat_type)) + + +@pytest.mark.parametrize( + ("values", "expected"), + [ + ([1], np.uint8), + ([1, None], np.uint8), + (np.arange(np.iinfo(np.int8).max), np.uint8), + (np.append(np.arange(np.iinfo(np.int8).max), [None]), np.uint8), + (np.arange(np.iinfo(np.int16).max), np.uint16), + (np.append(np.arange(np.iinfo(np.int16).max), [None]), np.uint16), + (np.arange(np.iinfo(np.uint8).max), np.uint8), + (np.append(np.arange(np.iinfo(np.uint8).max), [None]), np.uint8), + (np.arange(np.iinfo(np.uint16).max), np.uint16), + (np.append(np.arange(np.iinfo(np.uint16).max), [None]), np.uint16), + ], +) +def test_astype_dtype(values, expected): + 
data = cudf.Series(values) + got = data.astype("category").cat.codes.dtype + np.testing.assert_equal(got, expected) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_empty_series_category_cast(ordered): + dtype = cudf.CategoricalDtype(ordered=ordered) + ps = pd.Series([], dtype="str") + gs = cudf.from_pandas(ps) + + expected = ps.astype(dtype.to_pandas()) + actual = gs.astype(dtype) + + assert_eq(expected, actual) + assert_eq(expected.dtype.ordered, actual.dtype.ordered) diff --git a/python/cudf/cudf/tests/series/methods/test_nunique.py b/python/cudf/cudf/tests/series/methods/test_nunique.py index c645fa401f4..3cdc8435f79 100644 --- a/python/cudf/cudf/tests/series/methods/test_nunique.py +++ b/python/cudf/cudf/tests/series/methods/test_nunique.py @@ -1,5 +1,5 @@ # Copyright (c) 2025, NVIDIA CORPORATION. - +import string import numpy as np import pandas as pd @@ -50,3 +50,22 @@ def test_datetime_nunique(data, nulls): expected = psr.nunique() got = gsr.nunique() assert_eq(got, expected) + + +def test_categorical_nunique(): + nelem = 20 + rng = np.random.default_rng(seed=0) + pd_cat = pd.Categorical( + pd.Series( + rng.choice(list(string.ascii_letters + string.digits), nelem), + dtype="category", + ) + ) + + gser = cudf.Series(pd_cat) + gdf_unique_count = gser.nunique() + + pser = pd.Series(pd_cat) + pdf_unique = pser.unique() + + assert gdf_unique_count == len(pdf_unique) diff --git a/python/cudf/cudf/tests/series/methods/test_reductions.py b/python/cudf/cudf/tests/series/methods/test_reductions.py index 3b636476651..001563b34db 100644 --- a/python/cudf/cudf/tests/series/methods/test_reductions.py +++ b/python/cudf/cudf/tests/series/methods/test_reductions.py @@ -1039,3 +1039,22 @@ def test_datetime_reductions(data, reduction_methods, datetime_types_as_str): assert True else: assert_eq(expected, actual) + + +@pytest.mark.parametrize("op", ["min", "max"]) +def test_categorical_maxima(op): + ser = cudf.Series( + ["a", "d", "c", "z", "g"], + dtype=cudf.CategoricalDtype(["z", "c", "g", "d", "a"], ordered=False), + ) + assert not ser.cat.ordered + + # Cannot get extrema of unordered Categorical column + with pytest.raises(TypeError, match="Categorical is not ordered"): + getattr(ser, op)() + + # Max/min should work after converting to "ordered" + ser_pd = ser.to_pandas() + result = getattr(ser.cat.as_ordered(), op)() + result_pd = getattr(ser_pd.cat.as_ordered(), op)() + assert_eq(result, result_pd) diff --git a/python/cudf/cudf/tests/series/methods/test_unique.py b/python/cudf/cudf/tests/series/methods/test_unique.py index b87f1872157..bc9fe01cf2b 100644 --- a/python/cudf/cudf/tests/series/methods/test_unique.py +++ b/python/cudf/cudf/tests/series/methods/test_unique.py @@ -1,5 +1,5 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
- +import string import numpy as np import pandas as pd @@ -96,3 +96,24 @@ def test_string_unique(item): # cudf returns a cudf.Series gres = gs.unique() assert_eq(pres, gres) + + +def test_categorical_unique(): + num_elements = 20 + rng = np.random.default_rng(seed=12) + pd_cat = pd.Categorical( + pd.Series( + rng.choice( + list(string.ascii_letters + string.digits), num_elements + ), + dtype="category", + ) + ) + + gser = cudf.Series(pd_cat) + gdf_unique_sorted = np.sort(gser.unique().to_pandas()) + + pser = pd.Series(pd_cat) + pdf_unique_sorted = np.sort(pser.unique()) + + np.testing.assert_array_equal(pdf_unique_sorted, gdf_unique_sorted) diff --git a/python/cudf/cudf/tests/series/test_binops.py b/python/cudf/cudf/tests/series/test_binops.py index d24dab0a012..222b9373dc9 100644 --- a/python/cudf/cudf/tests/series/test_binops.py +++ b/python/cudf/cudf/tests/series/test_binops.py @@ -856,3 +856,29 @@ def test_categorical_binary_add(): lfunc_args_and_kwargs=([pdsr, pdsr],), rfunc_args_and_kwargs=([sr, sr],), ) + + +def test_cat_series_binop_error(): + data_a = pd.Categorical(list("aababcabbc"), categories=list("abc")) + data_b = np.arange(len(data_a)) + + pd_ser_a = pd.Series(data_a) + pd_ser_b = pd.Series(data_b) + gdf_ser_a = cudf.Series(data_a) + gdf_ser_b = cudf.Series(data_b) + + # lhs is categorical + assert_exceptions_equal( + lfunc=operator.add, + rfunc=operator.add, + lfunc_args_and_kwargs=([pd_ser_a, pd_ser_b],), + rfunc_args_and_kwargs=([gdf_ser_a, gdf_ser_b],), + ) + + # lhs is numerical + assert_exceptions_equal( + lfunc=operator.add, + rfunc=operator.add, + lfunc_args_and_kwargs=([pd_ser_b, pd_ser_a],), + rfunc_args_and_kwargs=([gdf_ser_b, gdf_ser_a],), + ) diff --git a/python/cudf/cudf/tests/series/test_constructors.py b/python/cudf/cudf/tests/series/test_constructors.py index 6ad6a1a41cb..1074006be2f 100644 --- a/python/cudf/cudf/tests/series/test_constructors.py +++ b/python/cudf/cudf/tests/series/test_constructors.py @@ -1423,3 +1423,81 @@ def test_from_pandas_obj_tz_aware_unsupported(klass): pandas_obj = getattr(pd, klass)(tz_aware_data) with pytest.raises(NotImplementedError): cudf.from_pandas(pandas_obj) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4], + ["a", "1", "2", "1", "a"], + pd.Series(["a", "1", "22", "1", "aa"]), + pd.Series(["a", "1", "22", "1", "aa"], dtype="category"), + pd.Series([1, 2, 3, -4], dtype="int64"), + pd.Series([1, 2, 3, 4], dtype="uint64"), + pd.Series([1, 2.3, 3, 4], dtype="float"), + np.asarray([0, 2, 1]), + [None, 1, None, 2, None], + [], + ], +) +@pytest.mark.parametrize( + "categories", + [ + ["aa", "bb", "cc"], + [2, 4, 10, 100], + ["a", "b", "c"], + ["22", "b", "c"], + [], + ], +) +def test_categorical_creation(data, categories): + dtype = pd.CategoricalDtype(categories) + expected = pd.Series(data, dtype=dtype) + got = cudf.Series(data, dtype=dtype) + assert_eq(expected, got) + + got = cudf.Series(data, dtype=cudf.from_pandas(dtype)) + assert_eq(expected, got) + + expected = pd.Series(data, dtype="category") + got = cudf.Series(data, dtype="category") + assert_eq(expected, got) + + +@pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) +def test_series_construction_with_nulls_as_category( + input_obj, all_supported_types_as_str +): + if all_supported_types_as_str == "category": + pytest.skip(f"No {all_supported_types_as_str} scalar.") + if all_supported_types_as_str.startswith( + "datetime" + ) or all_supported_types_as_str.startswith("timedelta"): + pytest.skip("Test intended for numeric and string scalars.") + dtype = 
cudf.dtype(all_supported_types_as_str) + input_obj = [ + dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj + ] + + expect = pd.Series(input_obj, dtype="category") + got = cudf.Series(input_obj, dtype="category") + + assert_eq(expect, got) + + +@pytest.mark.parametrize("scalar", [1, "a", None, 10.2]) +def test_cat_from_scalar(scalar): + ps = pd.Series(scalar, dtype="category") + gs = cudf.Series(scalar, dtype="category") + + assert_eq(ps, gs) + + +def test_categorical_interval_pandas_roundtrip(): + expected = cudf.Series(cudf.interval_range(0, 5)).astype("category") + result = cudf.Series.from_pandas(expected.to_pandas()) + assert_eq(result, expected) + + expected = pd.Series(pd.interval_range(0, 5)).astype("category") + result = cudf.Series.from_pandas(expected).to_pandas() + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_api.py b/python/cudf/cudf/tests/test_api.py index a7d4b6f7cd3..44a5cc90da2 100644 --- a/python/cudf/cudf/tests/test_api.py +++ b/python/cudf/cudf/tests/test_api.py @@ -7,3 +7,12 @@ def test_toplevel_imports_matches_all_modules(): all_objects = set(cudf.__all__) extras = dir_objects - all_objects assert not extras, f"{extras} not included in cudf.__all__" + + +def test_version_constants_are_populated(): + # __git_commit__ will only be non-empty in a built distribution + assert isinstance(cudf.__git_commit__, str) + + # __version__ should always be non-empty + assert isinstance(cudf.__version__, str) + assert len(cudf.__version__) > 0 diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py deleted file mode 100644 index bfa21b35bbd..00000000000 --- a/python/cudf/cudf/tests/test_categorical.py +++ /dev/null @@ -1,895 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
- -import operator -import string -import warnings -from contextlib import contextmanager -from textwrap import dedent - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.testing import assert_eq -from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal - - -@contextmanager -def _hide_cudf_safe_casting_warning(): - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Can't safely cast column", - category=UserWarning, - ) - yield - - -@pytest.fixture -def pd_str_cat(): - categories = list("abc") - codes = [0, 0, 1, 0, 1, 2, 0, 1, 1, 2] - return pd.Categorical.from_codes(codes, categories=categories) - - -def test_categorical_basic(): - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - cudf_cat = cudf.Index(cat) - assert_eq(cat.codes, cudf_cat.codes.to_numpy()) - - pdsr = pd.Series(cat, index=["p", "q", "r", "s", "t"]) - sr = cudf.Series(cat, index=["p", "q", "r", "s", "t"]) - assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False) - - # Test attributes - assert_eq(pdsr.cat.categories, sr.cat.categories) - assert pdsr.cat.ordered == sr.cat.ordered - - np.testing.assert_array_equal( - pdsr.cat.codes.values, sr.cat.codes.to_numpy() - ) - - assert str(sr) == str(pdsr) - - -def test_categorical_integer(): - cat = pd.Categorical(["a", "_", "_", "c", "a"], categories=["a", "b", "c"]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - np.testing.assert_array_equal( - cat.codes, sr.cat.codes.astype(cat.codes.dtype).fillna(-1).to_numpy() - ) - assert sr.null_count == 2 - - np.testing.assert_array_equal( - pdsr.cat.codes.values, - sr.cat.codes.astype(pdsr.cat.codes.dtype).fillna(-1).to_numpy(), - ) - - expect_str = dedent( - """\ - 0 a - 1 - 2 - 3 c - 4 a - dtype: category - Categories (3, object): ['a', 'b', 'c']""" - ) - assert str(sr) == expect_str - - -def test_categorical_compare_unordered(): - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - pdsr = pd.Series(cat) - - sr = cudf.Series(cat) - - # test equal - out = sr == sr - assert out.dtype == np.bool_ - assert type(out[0]) is np.bool_ - assert np.all(out.to_numpy()) - assert np.all(pdsr == pdsr) - - # test inequality - out = sr != sr - assert not np.any(out.to_numpy()) - assert not np.any(pdsr != pdsr) - - assert not pdsr.cat.ordered - assert not sr.cat.ordered - - # test using ordered operators - assert_exceptions_equal( - lfunc=operator.lt, - rfunc=operator.lt, - lfunc_args_and_kwargs=([pdsr, pdsr],), - rfunc_args_and_kwargs=([sr, sr],), - ) - - -def test_categorical_element_indexing(): - """ - Element indexing to a cat column must give the underlying object - not the numerical index. - """ - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - assert_eq(pdsr, sr) - assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False) - - -def test_categorical_masking(): - """ - Test common operation for getting a all rows that matches a certain - category. 
- """ - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - - # check scalar comparison - expect_matches = pdsr == "a" - got_matches = sr == "a" - - np.testing.assert_array_equal( - expect_matches.values, got_matches.to_numpy() - ) - - # mask series - expect_masked = pdsr[expect_matches] - got_masked = sr[got_matches] - - assert len(expect_masked) == len(got_masked) - assert got_masked.null_count == 0 - assert_eq(got_masked, expect_masked) - - -def test_df_cat_set_index(): - df = cudf.DataFrame() - df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) - df["b"] = np.arange(len(df)) - got = df.set_index("a") - - pddf = df.to_pandas() - expect = pddf.set_index("a") - - assert_eq(got, expect) - - -def test_df_cat_sort_index(): - df = cudf.DataFrame() - df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) - df["b"] = np.arange(len(df)) - - got = df.set_index("a").sort_index() - expect = df.to_pandas().set_index("a").sort_index() - - assert_eq(got, expect) - - -def test_cat_series_binop_error(): - df = cudf.DataFrame() - df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) - df["b"] = np.arange(len(df)) - - pdf = df.to_pandas() - - # lhs is categorical - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([pdf["a"], pdf["b"]],), - rfunc_args_and_kwargs=([df["a"], df["b"]],), - ) - - # lhs is numerical - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([pdf["b"], pdf["a"]],), - rfunc_args_and_kwargs=([df["b"], df["a"]],), - ) - - -def test_categorical_unique(): - num_elements = 20 - rng = np.random.default_rng(seed=12) - pd_cat = pd.Categorical( - pd.Series( - rng.choice( - list(string.ascii_letters + string.digits), num_elements - ), - dtype="category", - ) - ) - - # gdf - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series.from_pandas(pd_cat) - gdf_unique_sorted = np.sort(gdf["a"].unique().to_pandas()) - - # pandas - pdf = pd.DataFrame() - pdf["a"] = pd_cat - pdf_unique_sorted = np.sort(pdf["a"].unique()) - - # verify - np.testing.assert_array_equal(pdf_unique_sorted, gdf_unique_sorted) - - -def test_categorical_unique_count(): - nelem = 20 - rng = np.random.default_rng(seed=0) - pd_cat = pd.Categorical( - pd.Series( - rng.choice(list(string.ascii_letters + string.digits), nelem), - dtype="category", - ) - ) - - # gdf - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series.from_pandas(pd_cat) - gdf_unique_count = gdf["a"].nunique() - - # pandas - pdf = pd.DataFrame() - pdf["a"] = pd_cat - pdf_unique = pdf["a"].unique() - - # verify - assert gdf_unique_count == len(pdf_unique) - - -def test_categorical_empty(): - cat = pd.Categorical([]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - np.testing.assert_array_equal(cat.codes, sr.cat.codes.to_numpy()) - - # Test attributes - assert_eq(pdsr.cat.categories, sr.cat.categories) - assert pdsr.cat.ordered == sr.cat.ordered - - np.testing.assert_array_equal( - pdsr.cat.codes.values, sr.cat.codes.to_numpy() - ) - - -def test_categorical_set_categories(): - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - psr = pd.Series(cat) - sr = cudf.Series.from_pandas(cat) - - # adding category - expect = psr.cat.set_categories(["a", "b", "c", "d"]) - got = sr.cat.set_categories(["a", "b", "c", "d"]) - assert_eq(expect, got) - - # removing category - expect = psr.cat.set_categories(["a", "b"]) - got = sr.cat.set_categories(["a", "b"]) - 
assert_eq(expect, got) - - -def test_categorical_set_categories_preserves_order(): - series = pd.Series([1, 0, 0, 0, 2]).astype("category") - # reassigning categories should preserve element ordering - assert_eq( - series.cat.set_categories([1, 2]), - cudf.Series(series).cat.set_categories([1, 2]), - ) - - -def test_categorical_as_ordered(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.set_ordered(False)) - cd_sr = cudf.Series(pd_str_cat.set_ordered(False)) - - assert cd_sr.cat.ordered is False - assert cd_sr.cat.ordered == pd_sr.cat.ordered - - pd_sr_1 = pd_sr.cat.as_ordered() - cd_sr_1 = cd_sr.cat.as_ordered() - - assert cd_sr_1.cat.ordered is True - assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered - assert str(cd_sr_1) == str(pd_sr_1) - - -def test_categorical_as_unordered(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.set_ordered(True)) - cd_sr = cudf.Series(pd_str_cat.set_ordered(True)) - - assert cd_sr.cat.ordered is True - assert cd_sr.cat.ordered == pd_sr.cat.ordered - - pd_sr_1 = pd_sr.cat.as_unordered() - cd_sr_1 = cd_sr.cat.as_unordered() - - assert cd_sr_1.cat.ordered is False - assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered - assert str(cd_sr_1) == str(pd_sr_1) - - -@pytest.mark.parametrize("from_ordered", [True, False]) -@pytest.mark.parametrize("to_ordered", [True, False]) -def test_categorical_reorder_categories(pd_str_cat, from_ordered, to_ordered): - pd_sr = pd.Series(pd_str_cat.set_ordered(from_ordered)) - cd_sr = cudf.Series(pd_str_cat.set_ordered(from_ordered)) - - assert_eq(pd_sr, cd_sr) - - assert str(pd_sr) == str(cd_sr) - - pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), ordered=to_ordered) - cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), ordered=to_ordered) - - assert_eq(pd_sr_1, cd_sr_1) - - assert str(cd_sr_1) == str(pd_sr_1) - - -def test_categorical_add_categories(pd_str_cat): - pd_sr = pd.Series(pd_str_cat) - cd_sr = cudf.Series(pd_str_cat) - - assert_eq(pd_sr, cd_sr) - - assert str(pd_sr) == str(cd_sr) - - pd_sr_1 = pd_sr.cat.add_categories(["d"]) - cd_sr_1 = cd_sr.cat.add_categories(["d"]) - - assert "d" in pd_sr_1.cat.categories.to_list() - assert "d" in cd_sr_1.cat.categories.to_pandas().to_list() - - assert_eq(pd_sr_1, cd_sr_1) - - -def test_categorical_remove_categories(pd_str_cat): - pd_sr = pd.Series(pd_str_cat) - cd_sr = cudf.Series(pd_str_cat) - - assert_eq(pd_sr, cd_sr) - - assert str(pd_sr) == str(cd_sr) - - pd_sr_1 = pd_sr.cat.remove_categories(["a"]) - cd_sr_1 = cd_sr.cat.remove_categories(["a"]) - - assert "a" not in pd_sr_1.cat.categories.to_list() - assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list() - - assert_eq(pd_sr_1, cd_sr_1) - - # test using ordered operators - assert_exceptions_equal( - lfunc=cd_sr.to_pandas().cat.remove_categories, - rfunc=cd_sr.cat.remove_categories, - lfunc_args_and_kwargs=([["a", "d"]], {}), - rfunc_args_and_kwargs=([["a", "d"]], {}), - ) - - -def test_categorical_dataframe_slice_copy(): - pdf = pd.DataFrame({"g": pd.Series(["a", "b", "z"], dtype="category")}) - gdf = cudf.from_pandas(pdf) - - exp = pdf[1:].copy() - gdf = gdf[1:].copy() - - assert_eq(exp, gdf) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([1, 2, 3, 89]), - pd.Series([1, 2, 3, 89, 3, 1, 89], dtype="category"), - pd.Series(["1", "2", "3", "4", "5"], dtype="category"), - pd.Series(["1.0", "2.5", "3.001", "9"], dtype="category"), - pd.Series(["1", "2", "3", None, "4", "5"], dtype="category"), - pd.Series(["1.0", "2.5", "3.001", None, "9"], dtype="category"), - pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), - 
pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), - pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), - pd.Series([1, 2, 3, 89], dtype="float64"), - pd.Series([1, 2.5, 3.001, 89], dtype="float64"), - pd.Series([None, None, None]), - pd.Series([], dtype="float64"), - ], -) -@pytest.mark.parametrize( - "categories", - [ - ["aa", "bb", "cc"], - [2, 4, 10, 100], - ["aa", "bb", "c"], - ["a", "bb", "c"], - ["a", "b", "c"], - ["1", "2", "3", "4"], - ["1.0", "2.5", "3.001", "9"], - [], - ], -) -def test_categorical_typecast(data, categories): - pd_data = data - gd_data = cudf.from_pandas(data) - cat_type = pd.CategoricalDtype(categories) - - assert_eq(pd_data.astype(cat_type), gd_data.astype(cat_type)) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([1, 2, 3, 89]), - pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), - pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), - pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), - pd.Series([1, 2, 3, 89], dtype="float64"), - pd.Series([1, 2.5, 3.001, 89], dtype="float64"), - pd.Series([None, None, None]), - pd.Series([], dtype="float64"), - ], -) -@pytest.mark.parametrize( - "new_categories", - [ - ["aa", "bb", "cc"], - [2, 4, 10, 100], - ["aa", "bb", "c"], - ["a", "bb", "c"], - ["a", "b", "c"], - [], - pd.Series(["a", "b", "c"]), - pd.Series(["a", "b", "c"], dtype="category"), - pd.Series([-100, 10, 11, 0, 1, 2], dtype="category"), - ], -) -def test_categorical_set_categories_categoricals(data, new_categories): - pd_data = data.astype("category") - gd_data = cudf.from_pandas(pd_data) - - expected = pd_data.cat.set_categories(new_categories=new_categories) - with _hide_cudf_safe_casting_warning(): - actual = gd_data.cat.set_categories(new_categories=new_categories) - - assert_eq(expected, actual) - - expected = pd_data.cat.set_categories( - new_categories=pd.Series(new_categories, dtype="category") - ) - with _hide_cudf_safe_casting_warning(): - actual = gd_data.cat.set_categories( - new_categories=cudf.Series(new_categories, dtype="category") - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - ["a", "1", "2", "1", "a"], - pd.Series(["a", "1", "22", "1", "aa"]), - pd.Series(["a", "1", "22", "1", "aa"], dtype="category"), - pd.Series([1, 2, 3, -4], dtype="int64"), - pd.Series([1, 2, 3, 4], dtype="uint64"), - pd.Series([1, 2.3, 3, 4], dtype="float"), - np.asarray([0, 2, 1]), - [None, 1, None, 2, None], - [], - ], -) -@pytest.mark.parametrize( - "categories", - [ - ["aa", "bb", "cc"], - [2, 4, 10, 100], - ["aa", "bb", "c"], - ["a", "bb", "c"], - ["a", "b", "c"], - ["22", "b", "c"], - [], - ], -) -def test_categorical_creation(data, categories): - dtype = pd.CategoricalDtype(categories) - expected = pd.Series(data, dtype=dtype) - got = cudf.Series(data, dtype=dtype) - assert_eq(expected, got) - - got = cudf.Series(data, dtype=cudf.from_pandas(dtype)) - assert_eq(expected, got) - - expected = pd.Series(data, dtype="category") - got = cudf.Series(data, dtype="category") - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "categories", - [ - [], - [1, 2, 3], - pd.Series(["a", "c", "b"], dtype="category"), - pd.Series([1, 2, 3, 4, -100], dtype="category"), - ], -) -@pytest.mark.parametrize("ordered", [True, False]) -def test_categorical_dtype(categories, ordered): - expected = pd.CategoricalDtype(categories=categories, ordered=ordered) - got = cudf.CategoricalDtype(categories=categories, ordered=ordered) - assert_eq(expected, got) - - expected = 
pd.CategoricalDtype(categories=categories) - got = cudf.CategoricalDtype(categories=categories) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - ("values", "expected"), - [ - ([1], np.uint8), - ([1, None], np.uint8), - (np.arange(np.iinfo(np.int8).max), np.uint8), - (np.append(np.arange(np.iinfo(np.int8).max), [None]), np.uint8), - (np.arange(np.iinfo(np.int16).max), np.uint16), - (np.append(np.arange(np.iinfo(np.int16).max), [None]), np.uint16), - (np.arange(np.iinfo(np.uint8).max), np.uint8), - (np.append(np.arange(np.iinfo(np.uint8).max), [None]), np.uint8), - (np.arange(np.iinfo(np.uint16).max), np.uint16), - (np.append(np.arange(np.iinfo(np.uint16).max), [None]), np.uint16), - ], -) -def test_astype_dtype(values, expected): - data = cudf.Series(values) - got = data.astype("category").cat.codes.dtype - np.testing.assert_equal(got, expected) - - -@pytest.mark.parametrize( - "data,add", - [ - ([1, 2, 3], [100, 11, 12]), - ([1, 2, 3], [0.01, 9.7, 15.0]), - ([0.0, 6.7, 10.0], [100, 11, 12]), - ([0.0, 6.7, 10.0], [0.01, 9.7, 15.0]), - (["a", "bd", "ef"], ["asdfsdf", "bddf", "eff"]), - ([1, 2, 3], []), - ([0.0, 6.7, 10.0], []), - (["a", "bd", "ef"], []), - ], -) -def test_add_categories(data, add): - pds = pd.Series(data, dtype="category") - gds = cudf.Series(data, dtype="category") - - expected = pds.cat.add_categories(add) - with _hide_cudf_safe_casting_warning(): - actual = gds.cat.add_categories(add) - - assert_eq( - expected.cat.codes, actual.cat.codes.astype(expected.cat.codes.dtype) - ) - - # Need to type-cast pandas object to str due to mixed-type - # support in "object" - assert_eq( - expected.cat.categories.astype("str") - if (expected.cat.categories.dtype == "object") - else expected.cat.categories, - actual.cat.categories, - ) - - -@pytest.mark.parametrize( - "data,add", - [ - ([1, 2, 3], [1, 3, 11]), - ([0.0, 6.7, 10.0], [1, 2, 0.0]), - (["a", "bd", "ef"], ["a", "bd", "a"]), - ], -) -def test_add_categories_error(data, add): - pds = pd.Series(data, dtype="category") - gds = cudf.Series(data, dtype="category") - - assert_exceptions_equal( - pds.cat.add_categories, - gds.cat.add_categories, - ([add],), - ([add],), - ) - - -def test_add_categories_mixed_error(): - gds = cudf.Series(["a", "bd", "ef"], dtype="category") - - with pytest.raises(TypeError): - gds.cat.add_categories([1, 2, 3]) - - gds = cudf.Series([1, 2, 3], dtype="category") - - with pytest.raises(TypeError): - gds.cat.add_categories(["a", "bd", "ef"]) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - ["a", "1", "2", "1", "a"], - pd.Series(["a", "1", "22", "1", "aa"]), - pd.Series(["a", "1", "22", "1", "aa"], dtype="category"), - pd.Series([1, 2, 3, 4], dtype="int64"), - pd.Series([1, 2.3, 3, 4], dtype="float"), - [None, 1, None, 2, None], - ["a"], - ], -) -@pytest.mark.parametrize( - "categories", - [ - ["aa", "bb", "cc"], - [2, 4, 10, 100], - ["aa", "bb", "c"], - ["a", "bb", "c"], - ["a", "b", "c"], - ["22", "b", "c"], - ["a"], - ], -) -def test_categorical_assignment(data, categories): - cat_dtype = pd.CategoricalDtype(categories) - pd_df = pd.DataFrame() - pd_df["a"] = np.ones(len(data)) - cd_df = cudf.from_pandas(pd_df) - - pd_cat_series = pd.Series(data, dtype=cat_dtype) - # assign categorical series - pd_df.assign(cat_col=pd_cat_series) - cd_df.assign(cat_col=pd_cat_series) - assert_eq(pd_df, cd_df) - - # assign categorical array - # needed for dask_cudf support for including file name - # as a categorical column - # see issue: https://github.com/rapidsai/cudf/issues/2269 - pd_df = 
pd.DataFrame() - pd_df["a"] = np.ones(len(data)) - cd_df = cudf.from_pandas(pd_df) - - pd_categorical = pd.Categorical(data, dtype=cat_dtype) - pd_df.assign(cat_col=pd_categorical) - cd_df.assign(cat_col=pd_categorical) - assert_eq(pd_df, cd_df) - - -def test_categorical_allow_nan(): - gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False) - gs = gs.astype("category") - expected_codes = cudf.Series([0, 1, 3, 2, 3, None], dtype="uint8") - assert_eq(expected_codes, gs.cat.codes) - - expected_categories = cudf.Index([1.0, 2.0, 10.0, np.nan], dtype="float64") - assert_eq(expected_categories, gs.cat.categories) - - actual_ps = gs.to_pandas() - expected_ps = pd.Series( - [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category" - ) - assert_eq(actual_ps, expected_ps) - - -def test_categorical_setitem_with_nan(): - gs = cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") - gs[[1, 3]] = np.nan - - expected_series = cudf.Series( - [1, np.nan, np.nan, np.nan, np.nan, None], nan_as_null=False - ).astype(gs.dtype) - assert_eq(gs, expected_series) - - -@pytest.mark.parametrize("dtype", [*list(NUMERIC_TYPES), "object"]) -@pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) -def test_series_construction_with_nulls(input_obj, dtype): - dtype = cudf.dtype(dtype) - input_obj = [ - dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj - ] - - expect = pd.Series(input_obj, dtype="category") - got = cudf.Series(input_obj, dtype="category").to_pandas() - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - {"a": pd.Series(["a", "b", "c", "a", "c", "b"]).astype("category")}, - { - "a": pd.Series(["a", "a", "b", "b"]).astype("category"), - "b": pd.Series(["b", "b", "c", "c"]).astype("category"), - "c": pd.Series(["c", "c", "a", "a"]).astype("category"), - }, - { - "a": pd.Series(["a", None, "b", "b"]).astype("category"), - "b": pd.Series(["b", "b", None, "c"]).astype("category"), - "c": pd.Series(["c", "c", "a", None]).astype("category"), - }, - ], -) -def test_serialize_categorical_columns(data): - df = cudf.DataFrame(data) - recreated = df.__class__.deserialize(*df.serialize()) - assert_eq(recreated, df) - - -@pytest.mark.parametrize( - "data", [["$ 1", "$ 2", "hello"], ["($) 1", "( 2", "hello", "^1$"]] -) -@pytest.mark.parametrize("value", ["$ 1", "hello", "$", "^1$"]) -def test_categorical_string_index_contains(data, value): - idx = cudf.CategoricalIndex(data) - pidx = idx.to_pandas() - - assert_eq(value in idx, value in pidx) - - -def test_categorical_index_with_dtype(): - dtype = cudf.CategoricalDtype(categories=["a", "z", "c"]) - gi = cudf.Index(["z", "c", "a"], dtype=dtype) - pi = pd.Index(["z", "c", "a"], dtype=dtype.to_pandas()) - - assert_eq(gi, pi) - assert_eq(gi.dtype, pi.dtype) - assert_eq(gi.dtype.categories, pi.dtype.categories) - - -def test_cat_iterate_error(): - s = cudf.Series([1, 2, 3], dtype="category") - with pytest.raises(TypeError): - iter(s.cat) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_empty_series_category_cast(ordered): - dtype = cudf.CategoricalDtype(ordered=ordered) - ps = pd.Series([], dtype="str") - gs = cudf.from_pandas(ps) - - expected = ps.astype(dtype.to_pandas()) - actual = gs.astype(dtype) - - assert_eq(expected, actual) - assert_eq(expected.dtype.ordered, actual.dtype.ordered) - - -def test_categorical_dtype_ordered_not_settable(): - with pytest.raises(AttributeError): - cudf.CategoricalDtype().ordered = False - - -@pytest.mark.parametrize("scalar", [1, "a", 
None, 10.2]) -def test_cat_from_scalar(scalar): - ps = pd.Series(scalar, dtype="category") - gs = cudf.Series(scalar, dtype="category") - - assert_eq(ps, gs) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Does not warn on older versions of pandas", -) -def test_cat_groupby_fillna(): - ps = pd.Series(["a", "b", "c"], dtype="category") - gs = cudf.from_pandas(ps) - - with pytest.warns(FutureWarning): - pg = ps.groupby(ps) - gg = gs.groupby(gs) - - assert_exceptions_equal( - lfunc=pg.fillna, - rfunc=gg.fillna, - lfunc_args_and_kwargs=(("d",), {}), - rfunc_args_and_kwargs=(("d",), {}), - ) - - -@pytest.mark.parametrize("op", ["min", "max"]) -def test_categorical_maxima(op): - ser = cudf.Series( - ["a", "d", "c", "z", "g"], - dtype=cudf.CategoricalDtype(["z", "c", "g", "d", "a"], ordered=False), - ) - assert not ser.cat.ordered - - # Cannot get extrema of unordered Categorical column - with pytest.raises(TypeError, match="Categorical is not ordered"): - getattr(ser, op)() - - # Max/min should work after converting to "ordered" - ser_pd = ser.to_pandas() - result = getattr(ser.cat.as_ordered(), op)() - result_pd = getattr(ser_pd.cat.as_ordered(), op)() - assert_eq(result, result_pd) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_index_ordered(ordered): - pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) - cudf_ci = cudf.from_pandas(pd_ci) - assert pd_ci.ordered == cudf_ci.ordered - - -@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"]) -@pytest.mark.parametrize("ordered", [True, False]) -def test_index_as_ordered(method, ordered): - pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) - cudf_ci = cudf.from_pandas(pd_ci) - - expected = getattr(pd_ci, method)() - result = getattr(cudf_ci, method)() - assert_eq(result, expected) - - -def test_index_add_categories(): - pd_ci = pd.CategoricalIndex([1, 2, 3]) - cudf_ci = cudf.from_pandas(pd_ci) - - expected = pd_ci.add_categories([4]) - result = cudf_ci.add_categories([4]) - assert_eq(result, expected) - - -def test_index_remove_categories(): - pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4]) - cudf_ci = cudf.from_pandas(pd_ci) - - expected = pd_ci.remove_categories([4]) - result = cudf_ci.remove_categories([4]) - assert_eq(result, expected) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_index_reorder_categories(ordered): - pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 3, 2, 4]) - cudf_ci = cudf.from_pandas(pd_ci) - - expected = pd_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) - result = cudf_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) - assert_eq(result, expected) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_index_set_categories(ordered): - pd_ci = pd.CategoricalIndex([1, 2, 3]) - cudf_ci = cudf.from_pandas(pd_ci) - - expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered) - result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered) - assert_eq(result, expected) - - -def test_categorical_interval_pandas_roundtrip(): - expected = cudf.Series(cudf.interval_range(0, 5)).astype("category") - result = cudf.Series.from_pandas(expected.to_pandas()) - assert_eq(result, expected) - - expected = pd.Series(pd.interval_range(0, 5)).astype("category") - result = cudf.Series.from_pandas(expected).to_pandas() - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_version.py b/python/cudf/cudf/tests/test_version.py deleted file mode 100644 index 8c10cc20a9a..00000000000 --- 
a/python/cudf/cudf/tests/test_version.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -import cudf - - -def test_version_constants_are_populated(): - # __git_commit__ will only be non-empty in a built distribution - assert isinstance(cudf.__git_commit__, str) - - # __version__ should always be non-empty - assert isinstance(cudf.__version__, str) - assert len(cudf.__version__) > 0 From 54d0d15b6955c4c5f7b684dab055d92e45a14b5f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Sep 2025 19:45:15 -0700 Subject: [PATCH 254/366] Move (most of) test_index.py to new cudf classic directory structure (#19696) Towards https://github.com/rapidsai/cudf/issues/9999 Towards https://github.com/rapidsai/cudf/issues/15723 The remaining tests are pending other tests being moved first, so that the needed sub-directories/files are established Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/19696 --- python/cudf/cudf/tests/conftest.py | 20 + .../dataframe/methods/test_sort_index.py | 25 +- .../dataframe/methods/test_sort_values.py | 10 + .../cudf/tests/dataframe/methods/test_take.py | 47 + .../cudf/tests/groupby/test_reductions.py | 122 +- .../categoricalindex/methods/test_equals.py | 71 + .../categoricalindex/test_constructors.py | 55 + .../datetimeindex/indexing/__init__.py | 0 .../datetimeindex/indexing/test_getitem.py | 16 + .../methods/test_ceil_floor_round.py | 20 + .../datetimeindex/methods/test_repeat.py | 19 + .../tests/indexes/index/indexing/__init__.py | 0 .../indexes/index/indexing/test_getitem.py | 33 + .../indexes/index/indexing/test_setitem.py | 16 + .../indexes/index/methods/test_any_all.py | 12 + .../indexes/index/methods/test_append.py | 323 ++ .../indexes/index/methods/test_argsort.py | 22 + .../indexes/index/methods/test_astype.py | 18 + .../tests/indexes/index/methods/test_copy.py | 50 + .../indexes/index/methods/test_difference.py | 109 + .../index/methods/test_drop_duplicates.py | 23 + .../indexes/index/methods/test_dropna.py | 5 + .../indexes/index/methods/test_duplicated.py | 26 + .../indexes/index/methods/test_equals.py | 122 + .../indexes/index/methods/test_fillna.py | 38 + .../index/methods/test_find_label_range.py | 27 + .../indexes/index/methods/test_get_indexer.py | 158 + .../indexes/index/methods/test_get_loc.py | 76 + .../index/methods/test_intersection.py | 89 + .../tests/indexes/index/methods/test_isin.py | 85 + .../index/methods/test_isna_notnull.py | 15 + .../indexes/index/methods/test_reductions.py | 23 + .../indexes/index/methods/test_rename.py | 68 + .../indexes/index/methods/test_set_names.py | 48 + .../indexes/index/methods/test_sort_values.py | 47 + .../indexes/index/methods/test_to_arrow.py | 29 + .../indexes/index/methods/test_to_frame.py | 24 + .../indexes/index/methods/test_to_pandas.py | 77 + .../indexes/index/methods/test_to_series.py | 22 + .../indexes/index/methods/test_tolist.py | 22 + .../tests/indexes/index/methods/test_union.py | 72 + .../tests/indexes/index/methods/test_where.py | 190 + .../tests/indexes/index/test_attributes.py | 150 + .../tests/indexes/index/test_constructor.py | 274 ++ .../multiindex/indexing/test_getitem.py | 47 + .../indexes/multiindex/methods/test_append.py | 52 + .../multiindex/methods/test_get_indexer.py | 128 + .../multiindex/methods/test_get_loc.py | 141 + .../indexes/multiindex/methods/test_isin.py | 85 + .../multiindex/methods/test_to_arrow.py | 27 +
.../indexes/multiindex/test_constructors.py | 18 + .../indexes/rangeindex/indexing/__init__.py | 0 .../rangeindex/indexing/test_getitem.py | 85 + .../rangeindex/methods/test_any_all.py | 12 + .../indexes/rangeindex/methods/test_append.py | 15 + .../indexes/rangeindex/methods/test_dropna.py | 12 + .../rangeindex/methods/test_factorize.py | 15 + .../methods/test_find_label_range.py | 23 + .../rangeindex/methods/test_get_indexer.py | 39 + .../rangeindex/methods/test_get_loc.py | 32 + .../rangeindex/methods/test_intersection.py | 19 + .../indexes/rangeindex/methods/test_join.py | 19 + .../rangeindex/methods/test_nunique.py | 33 + .../indexes/rangeindex/methods/test_rename.py | 15 + .../indexes/rangeindex/methods/test_repeat.py | 16 + .../rangeindex/methods/test_searchsorted.py | 12 + .../indexes/rangeindex/methods/test_take.py | 16 + .../indexes/rangeindex/methods/test_union.py | 17 + .../indexes/rangeindex/methods/test_unique.py | 17 + .../indexes/rangeindex/methods/test_where.py | 18 + .../indexes/rangeindex/test_attributes.py | 32 + .../tests/indexes/rangeindex/test_binops.py | 33 + .../indexes/rangeindex/test_constructors.py | 40 +- .../cudf/tests/input_output/test_pickling.py | 30 + .../cudf/tests/series/methods/test_take.py | 37 + python/cudf/cudf/tests/test_index.py | 3110 +---------------- python/cudf/cudf/tests/test_indexing.py | 82 - python/cudf/cudf/tests/test_multiindex.py | 269 +- 78 files changed, 3798 insertions(+), 3446 deletions(-) create mode 100644 python/cudf/cudf/tests/dataframe/methods/test_take.py create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/methods/test_equals.py create mode 100644 python/cudf/cudf/tests/indexes/categoricalindex/test_constructors.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/indexing/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/indexing/test_getitem.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/methods/test_ceil_floor_round.py create mode 100644 python/cudf/cudf/tests/indexes/datetimeindex/methods/test_repeat.py create mode 100644 python/cudf/cudf/tests/indexes/index/indexing/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/index/indexing/test_getitem.py create mode 100644 python/cudf/cudf/tests/indexes/index/indexing/test_setitem.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_any_all.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_append.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_argsort.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_astype.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_copy.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_difference.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_drop_duplicates.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_duplicated.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_equals.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_fillna.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_find_label_range.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_get_indexer.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_get_loc.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_intersection.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_isin.py create mode 
100644 python/cudf/cudf/tests/indexes/index/methods/test_isna_notnull.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_reductions.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_rename.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_set_names.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_sort_values.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_to_arrow.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_to_frame.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_to_pandas.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_to_series.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_tolist.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_union.py create mode 100644 python/cudf/cudf/tests/indexes/index/methods/test_where.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/indexing/test_getitem.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_append.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_get_indexer.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_get_loc.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_isin.py create mode 100644 python/cudf/cudf/tests/indexes/multiindex/methods/test_to_arrow.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/indexing/__init__.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/indexing/test_getitem.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_any_all.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_append.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_dropna.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_factorize.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_find_label_range.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_indexer.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_loc.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_intersection.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_join.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_repeat.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_searchsorted.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_take.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_union.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_unique.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/methods/test_where.py create mode 100644 python/cudf/cudf/tests/indexes/rangeindex/test_binops.py create mode 100644 python/cudf/cudf/tests/series/methods/test_take.py diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 663e4dbd517..5d017f9e8ca 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -298,6 +298,14 @@ def signed_integer_types_as_str(request): return request.param +@pytest.fixture(params=unsigned_integer_types) +def unsigned_integer_types_as_str(request): + """ + - 
"uint8", "uint16", "uint32", "uint64" + """ + return request.param + + @pytest.fixture(params=signed_integer_types + unsigned_integer_types) def integer_types_as_str(request): """ @@ -486,6 +494,18 @@ def numpy_ufunc(request): return request.param +@pytest.fixture(params=[True, False]) +def copy(request): + """Param for `copy` argument""" + return request.param + + +@pytest.fixture(params=[True, False]) +def deep(request): + """Param for `deep` argument""" + return request.param + + @pytest.fixture(params=[True, False]) def dropna(request): """Param for `dropna` argument""" diff --git a/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py b/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py index 8e58755eff9..73e7bd184d8 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_sort_index.py @@ -1,5 +1,5 @@ # Copyright (c) 2025, NVIDIA CORPORATION. - +import itertools import numpy as np import pandas as pd @@ -147,6 +147,29 @@ def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names(): assert result._data.names == tuple(result._data.keys()) +@pytest.mark.parametrize( + "levels", + itertools.chain.from_iterable( + itertools.permutations(range(3), n) for n in range(1, 4) + ), + ids=str, +) +def test_multiindex_sort_index_partial(levels): + df = pd.DataFrame( + { + "a": [3, 3, 3, 1, 1, 1, 2, 2], + "b": [4, 2, 7, -1, 11, -2, 7, 7], + "c": [4, 4, 2, 3, 3, 3, 1, 1], + "val": [1, 2, 3, 4, 5, 6, 7, 8], + } + ).set_index(["a", "b", "c"]) + cdf = cudf.from_pandas(df) + + expect = df.sort_index(level=levels, sort_remaining=True) + got = cdf.sort_index(level=levels, sort_remaining=True) + assert_eq(expect, got) + + def test_df_cat_sort_index(): df = cudf.DataFrame( { diff --git a/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py b/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py index 2a7b53d94cb..dedc30feeb2 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_sort_values.py @@ -218,3 +218,13 @@ def test_sort_values_datetime(): s_gdf = gdf.sort_values(by="date") assert_eq(s_df, s_gdf) + + +def test_dataframe_loc_duplicate_index_scalar(): + pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2]) + gdf = cudf.DataFrame.from_pandas(pdf) + + pdf_sorted = pdf.sort_values(by=list(pdf.columns), axis=0) + gdf_sorted = gdf.sort_values(by=list(gdf.columns), axis=0) + + assert_eq(pdf_sorted, gdf_sorted) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_take.py b/python/cudf/cudf/tests/dataframe/methods/test_take.py new file mode 100644 index 00000000000..35d6f78487a --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_take.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("ntake", [0, 1, 123, 122, 200]) +def test_dataframe_take(ntake): + rng = np.random.default_rng(seed=0) + nelem = 123 + df = cudf.DataFrame( + { + "ii": rng.integers(0, 20, nelem), + "ff": rng.random(nelem), + } + ) + + take_indices = rng.integers(0, len(df), ntake) + + actual = df.take(take_indices) + expected = df.to_pandas().take(take_indices) + + assert actual.ii.null_count == 0 + assert actual.ff.null_count == 0 + assert_eq(actual, expected) + + +@pytest.mark.parametrize("ntake", [1, 2, 8, 9]) +def test_dataframe_take_with_multiindex(ntake): + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + {"ii": rng.integers(0, 20, 9), "ff": rng.random(9)}, + index=cudf.MultiIndex( + levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], + codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + ), + ) + + take_indices = rng.integers(0, len(df), ntake) + + actual = df.take(take_indices) + expected = df.to_pandas().take(take_indices) + + assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py b/python/cudf/cudf/tests/groupby/test_reductions.py index f4d248f79c4..19ad845a435 100644 --- a/python/cudf/cudf/tests/groupby/test_reductions.py +++ b/python/cudf/cudf/tests/groupby/test_reductions.py @@ -9,7 +9,7 @@ PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION, ) -from cudf.testing import assert_eq, assert_groupby_results_equal +from cudf.testing import assert_eq, assert_groupby_results_equal, assert_neq from cudf.testing._utils import assert_exceptions_equal @@ -987,6 +987,126 @@ def test_group_by_reduce_numeric_only(by, data, groupby_reduction_methods): assert_eq(expected, result) +def test_multiindex_multiple_groupby(): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + "a": [4, 17, 4, 9, 5], + "b": [1, 4, 4, 3, 2], + "x": rng.normal(size=5), + } + ) + gdf = cudf.DataFrame.from_pandas(pdf) + pdg = pdf.groupby(["a", "b"], sort=True).sum() + gdg = gdf.groupby(["a", "b"], sort=True).sum() + assert_eq(pdg, gdg) + pdg = pdf.groupby(["a", "b"], sort=True).x.sum() + gdg = gdf.groupby(["a", "b"], sort=True).x.sum() + assert_eq(pdg, gdg) + + +def test_multiindex_equality(): + # mi made from groupby + # mi made manually to be identical + # are they equal? + gdf = cudf.DataFrame( + {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} + ) + mi1 = gdf.groupby(["x", "y"], sort=True).mean().index + mi2 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + assert_eq(mi1, mi2) + + # mi made from two groupbys, are they equal? + mi2 = gdf.groupby(["x", "y"], sort=True).max().index + assert_eq(mi1, mi2) + + # mi made manually twice are they equal? + mi1 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + mi2 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + assert_eq(mi1, mi2) + + # mi made from different groupbys are they not equal? + mi1 = gdf.groupby(["x", "y"]).mean().index + mi2 = gdf.groupby(["x", "z"]).mean().index + assert_neq(mi1, mi2) + + # mi made from different manuals are they not equal? 
+ mi1 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + mi2 = cudf.MultiIndex( + levels=[[0, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + assert_neq(mi1, mi2) + + +def test_multiindex_equals(): + # mi made from groupby + # mi made manually to be identical + # are they equal? + gdf = cudf.DataFrame( + {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} + ) + mi1 = gdf.groupby(["x", "y"], sort=True).mean().index + mi2 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + assert_eq(mi1.equals(mi2), True) + + # mi made from two groupbys, are they equal? + mi2 = gdf.groupby(["x", "y"], sort=True).max().index + assert_eq(mi1.equals(mi2), True) + + # mi made manually twice are they equal? + mi1 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + mi2 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + assert_eq(mi1.equals(mi2), True) + + # mi made from different groupbys are they not equal? + mi1 = gdf.groupby(["x", "y"], sort=True).mean().index + mi2 = gdf.groupby(["x", "z"], sort=True).mean().index + assert_eq(mi1.equals(mi2), False) + + # mi made from different manuals are they not equal? + mi1 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + mi2 = cudf.MultiIndex( + levels=[[0, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + assert_eq(mi1.equals(mi2), False) + + @pytest.mark.parametrize( "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] ) diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_equals.py b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_equals.py new file mode 100644 index 00000000000..4077e8611fe --- /dev/null +++ b/python/cudf/cudf/tests/indexes/categoricalindex/methods/test_equals.py @@ -0,0 +1,71 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4, 5, 6], + [10, 20, 30, 40, 50, 60], + ["1", "2", "3", "4", "5", "6"], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + ["a"], + ["b", "c", "d"], + [1], + [2, 3, 4], + [], + [10.0], + [1100.112, 2323.2322, 2323.2322], + ["abcd", "defgh", "werty", "poiu"], + ], +) +@pytest.mark.parametrize( + "other", + [ + [1, 2, 3, 4, 5, 6], + [10, 20, 30, 40, 50, 60], + ["1", "2", "3", "4", "5", "6"], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + ["a"], + ["b", "c", "d"], + [1], + [2, 3, 4], + [], + [10.0], + [1100.112, 2323.2322, 2323.2322], + ["abcd", "defgh", "werty", "poiu"], + ], +) +def test_index_categories_equal(data, other): + pd_data = pd.Index(data).astype("category") + pd_other = pd.Index(other) + + gd_data = cudf.Index(data).astype("category") + gd_other = cudf.Index(other) + + expected = pd_data.equals(pd_other) + actual = gd_data.equals(gd_other) + assert_eq(expected, actual) + + expected = pd_other.equals(pd_data) + actual = gd_other.equals(gd_data) + assert_eq(expected, actual) + + +def test_index_equals_categories(): + lhs = cudf.CategoricalIndex( + ["a", "b", "c", "b", "a"], categories=["a", "b", "c"] + ) + rhs = cudf.CategoricalIndex( + ["a", "b", "c", "b", "a"], categories=["a", "b", "c", "_"] + ) + + got = lhs.equals(rhs) + expect = lhs.to_pandas().equals(rhs.to_pandas()) + + assert got == expect diff --git a/python/cudf/cudf/tests/indexes/categoricalindex/test_constructors.py b/python/cudf/cudf/tests/indexes/categoricalindex/test_constructors.py new file mode 100644 index 00000000000..e6124831550 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/categoricalindex/test_constructors.py @@ -0,0 +1,55 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) +@pytest.mark.parametrize("categories", [[1, 2], None]) +@pytest.mark.parametrize( + "dtype", + [ + pd.CategoricalDtype([1, 2, 3], ordered=True), + pd.CategoricalDtype([1, 2, 3], ordered=False), + None, + ], +) +@pytest.mark.parametrize("ordered", [True, False]) +@pytest.mark.parametrize("name", [1, "a", None]) +def test_categorical_index_basic(data, categories, dtype, ordered, name): + # can't have both dtype and categories/ordered + if dtype is not None: + categories = None + ordered = None + pindex = pd.CategoricalIndex( + data=data, + categories=categories, + dtype=dtype, + ordered=ordered, + name=name, + ) + gindex = cudf.CategoricalIndex( + data=data, + categories=categories, + dtype=dtype, + ordered=ordered, + name=name, + ) + + assert_eq(pindex, gindex) + + +@pytest.mark.parametrize("ordered", [True, False]) +@pytest.mark.parametrize("name", [None, "test"]) +def test_categoricalindex_from_codes(ordered, name): + codes = [0, 1, 2, 3, 4] + categories = ["a", "b", "c", "d", "e"] + result = cudf.CategoricalIndex.from_codes(codes, categories, ordered, name) + expected = pd.CategoricalIndex( + pd.Categorical.from_codes(codes, categories, ordered=ordered), + name=name, + ) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/indexing/__init__.py b/python/cudf/cudf/tests/indexes/datetimeindex/indexing/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/indexing/test_getitem.py b/python/cudf/cudf/tests/indexes/datetimeindex/indexing/test_getitem.py new file mode 100644 index 00000000000..dc40612f6e6 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/datetimeindex/indexing/test_getitem.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf +from cudf.testing import assert_eq + + +def test_index_getitem_time_duration(temporal_types_as_str): + gidx = cudf.Index([1, 2, 3, 4, None], dtype=temporal_types_as_str) + pidx = gidx.to_pandas() + with cudf.option_context("mode.pandas_compatible", True): + for i in range(len(gidx)): + if i == 4: + assert gidx[i] is pidx[i] + else: + assert_eq(gidx[i], pidx[i]) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_ceil_floor_round.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_ceil_floor_round.py new file mode 100644 index 00000000000..b21e315502c --- /dev/null +++ b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_ceil_floor_round.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] +) +@pytest.mark.parametrize("method", ["ceil", "floor", "round"]) +def test_index_datetime_ceil(resolution, method): + cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) + pidx = cuidx.to_pandas() + + expected = getattr(pidx, method)(resolution) + result = getattr(cuidx, method)(resolution) + + assert_eq(expected, result) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_repeat.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_repeat.py new file mode 100644 index 00000000000..a2b68a3584f --- /dev/null +++ b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_repeat.py @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ + +import cudf +from cudf.testing import assert_eq + + +def test_index_datetime_repeat(): + gidx = cudf.date_range("2021-01-01", periods=3, freq="D") + pidx = gidx.to_pandas() + + actual = gidx.repeat(5) + expected = pidx.repeat(5) + + assert_eq(actual, expected) + + actual = gidx.to_frame().repeat(5) + + assert_eq(actual.index, expected) diff --git a/python/cudf/cudf/tests/indexes/index/indexing/__init__.py b/python/cudf/cudf/tests/indexes/index/indexing/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/index/indexing/test_getitem.py b/python/cudf/cudf/tests/indexes/index/indexing/test_getitem.py new file mode 100644 index 00000000000..70fab79932a --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/indexing/test_getitem.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "index_values", + [range(1, 10, 2), [1, 2, 3], ["a", "b", "c"], [1.5, 2.5, 3.5]], +) +@pytest.mark.parametrize("i_type", [int, np.int8, np.int32, np.int64]) +def test_scalar_getitem(index_values, i_type): + i = i_type(1) + index = cudf.Index(index_values) + + assert not isinstance(index[i], cudf.Index) + assert index[i] == index_values[i] + assert_eq(index, index.to_pandas()) + + +@pytest.mark.parametrize("idx", [0, np.int64(0)]) +def test_index_getitem_from_int(idx): + result = cudf.Index([1, 2])[idx] + assert result == 1 + + +@pytest.mark.parametrize("idx", [1.5, True, "foo"]) +def test_index_getitem_from_nonint_raises(idx): + with pytest.raises(ValueError): + cudf.Index([1, 2])[idx] diff --git a/python/cudf/cudf/tests/indexes/index/indexing/test_setitem.py b/python/cudf/cudf/tests/indexes/index/indexing/test_setitem.py new file mode 100644 index 00000000000..91fa5f2a33c --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/indexing/test_setitem.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf + + +def test_index_immutable(): + start, stop = 10, 34 + rg = cudf.RangeIndex(start, stop) + with pytest.raises(TypeError): + rg[1] = 5 + gi = cudf.Index(np.arange(start, stop)) + with pytest.raises(TypeError): + gi[1] = 5 diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_any_all.py b/python/cudf/cudf/tests/indexes/index/methods/test_any_all.py new file mode 100644 index 00000000000..f20c292c845 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_any_all.py @@ -0,0 +1,12 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf +from cudf.testing import assert_eq + + +def test_index_any(): + gidx = cudf.Index([1, 2, 3]) + pidx = gidx.to_pandas() + + assert_eq(pidx.any(), gidx.any()) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_append.py b/python/cudf/cudf/tests/indexes/index/methods/test_append.py new file mode 100644 index 00000000000..dc5a2a26a80 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_append.py @@ -0,0 +1,323 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
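+# Covers Index.append vs pandas across matching, mixed, and empty dtypes, list-of-Index inputs, and expected TypeErrors.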
+ +import re + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.core.index import Index +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, + expect_warning_if, +) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4, 5, 6], + [10, 20, 30, 40, 50, 60], + ["1", "2", "3", "4", "5", "6"], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + ["a"], + ["b", "c", "d"], + [1], + [2, 3, 4], + [], + [10.0], + [1100.112, 2323.2322, 2323.2322], + ["abcd", "defgh", "werty", "poiu"], + ], +) +@pytest.mark.parametrize( + "other", + [ + [1, 2, 3, 4, 5, 6], + [10, 20, 30, 40, 50, 60], + ["1", "2", "3", "4", "5", "6"], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + ["a"], + ["b", "c", "d"], + [1], + [2, 3, 4], + [], + [10.0], + [1100.112, 2323.2322, 2323.2322], + ["abcd", "defgh", "werty", "poiu"], + ], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) +def test_index_append(data, other): + pd_data = pd.Index(data) + pd_other = pd.Index(other) + + gd_data = cudf.Index(data) + gd_other = cudf.Index(other) + + if cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other): + gd_data = gd_data.astype("str") + gd_other = gd_other.astype("str") + + with expect_warning_if( + (len(data) == 0 or len(other) == 0) and pd_data.dtype != pd_other.dtype + ): + expected = pd_data.append(pd_other) + with expect_warning_if( + (len(data) == 0 or len(other) == 0) and gd_data.dtype != gd_other.dtype + ): + actual = gd_data.append(gd_other) + if len(data) == 0 and len(other) == 0: + # Pandas default dtype to "object" for empty list + # cudf default dtype to "float" for empty list + assert_eq(expected, actual.astype("str")) + elif actual.dtype == "object": + assert_eq(expected.astype("str"), actual) + else: + assert_eq(expected, actual) + + +def test_index_empty_append_name_conflict(): + empty = cudf.Index([], name="foo") + non_empty = cudf.Index([1], name="bar") + expected = cudf.Index([1]) + + with pytest.warns(FutureWarning): + result = non_empty.append(empty) + assert_eq(result, expected) + + with pytest.warns(FutureWarning): + result = empty.append(non_empty) + assert_eq(result, expected) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4, 5, 6], + [10, 20, 30, 40, 50, 60], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + [1], + [2, 3, 4], + [10.0], + [1100.112, 2323.2322, 2323.2322], + ], +) +@pytest.mark.parametrize( + "other", + [ + ["1", "2", "3", "4", "5", "6"], + ["a"], + ["b", "c", "d"], + ["abcd", "defgh", "werty", "poiu"], + ], +) +def test_index_append_error(data, other): + gd_data = Index(data) + gd_other = Index(other) + + got_dtype = ( + gd_other.dtype + if gd_data.dtype == np.dtype("object") + else gd_data.dtype + ) + with pytest.raises( + TypeError, + match=re.escape( + f"cudf does not support appending an Index of " + f"dtype `{np.dtype('object')}` with an Index " + f"of dtype `{got_dtype}`, please type-cast " + f"either one of them to same dtypes." + ), + ): + gd_data.append(gd_other) + + with pytest.raises( + TypeError, + match=re.escape( + f"cudf does not support appending an Index of " + f"dtype `{np.dtype('object')}` with an Index " + f"of dtype `{got_dtype}`, please type-cast " + f"either one of them to same dtypes." 
+ ), + ): + gd_other.append(gd_data) + + sr = gd_other.to_series() + + assert_exceptions_equal( + lfunc=gd_data.to_pandas().append, + rfunc=gd_data.append, + lfunc_args_and_kwargs=([[sr.to_pandas()]],), + rfunc_args_and_kwargs=([[sr]],), + ) + + +@pytest.mark.parametrize( + "data,other", + [ + ( + pd.Index([1, 2, 3, 4, 5, 6]), + [ + pd.Index([1, 2, 3, 4, 5, 6]), + pd.Index([1, 2, 3, 4, 5, 6, 10]), + pd.Index([]), + ], + ), + ( + pd.Index([]), + [ + pd.Index([1, 2, 3, 4, 5, 6]), + pd.Index([1, 2, 3, 4, 5, 6, 10]), + pd.Index([1, 4, 5, 6]), + ], + ), + ( + pd.Index([10, 20, 30, 40, 50, 60]), + [ + pd.Index([10, 20, 30, 40, 50, 60]), + pd.Index([10, 20, 30]), + pd.Index([40, 50, 60]), + pd.Index([10, 60]), + pd.Index([60]), + ], + ), + ( + pd.Index([]), + [ + pd.Index([10, 20, 30, 40, 50, 60]), + pd.Index([10, 20, 30]), + pd.Index([40, 50, 60]), + pd.Index([10, 60]), + pd.Index([60]), + ], + ), + ( + pd.Index(["1", "2", "3", "4", "5", "6"]), + [ + pd.Index(["1", "2", "3", "4", "5", "6"]), + pd.Index(["1", "2", "3"]), + pd.Index(["6"]), + pd.Index(["1", "6"]), + ], + ), + ( + pd.Index([]), + [ + pd.Index(["1", "2", "3", "4", "5", "6"]), + pd.Index(["1", "2", "3"]), + pd.Index(["6"]), + pd.Index(["1", "6"]), + ], + ), + ( + pd.Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), + [ + pd.Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), + pd.Index([1.0, 6.0]), + pd.Index([]), + pd.Index([6.0]), + ], + ), + ( + pd.Index([]), + [ + pd.Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), + pd.Index([1.0, 6.0]), + pd.Index([1.0, 2.0, 6.0]), + pd.Index([6.0]), + ], + ), + ( + pd.Index(["a"]), + [ + pd.Index(["a"]), + pd.Index(["a", "b", "c"]), + pd.Index(["c"]), + pd.Index(["d"]), + pd.Index(["ae", "hello", "world"]), + ], + ), + ( + pd.Index([]), + [ + pd.Index(["a"]), + pd.Index(["a", "b", "c"]), + pd.Index(["c"]), + pd.Index(["d"]), + pd.Index(["ae", "hello", "world"]), + pd.Index([]), + ], + ), + ], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Does not warn on older versions of pandas", +) +def test_index_append_list(data, other): + pd_data = data + pd_other = other + + gd_data = cudf.from_pandas(data) + gd_other = [cudf.from_pandas(i) for i in other] + + with expect_warning_if( + (len(data) == 0 or any(len(d) == 0 for d in other)) + and (any(d.dtype != data.dtype for d in other)) + ): + expected = pd_data.append(pd_other) + with expect_warning_if( + (len(data) == 0 or any(len(d) == 0 for d in other)) + and (any(d.dtype != data.dtype for d in other)) + ): + actual = gd_data.append(gd_other) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "index", + [ + range(np.random.default_rng(seed=0).integers(0, 100)), + range(0, 10, -2), + range(0, -10, 2), + range(0, -10, -2), + range(0, 1), + [1, 2, 3, 1, None, None], + [None, None, 3.2, 1, None, None], + [None, "a", "3.2", "z", None, None], + pd.Series(["a", "b", None], dtype="category"), + np.array([1, 2, 3, None], dtype="datetime64[s]"), + ], +) +@pytest.mark.parametrize( + "func", + [ + "to_series", + "isna", + "notna", + "append", + ], +) +def test_index_methods(index, func): + gidx = cudf.Index(index) + pidx = gidx.to_pandas() + + if func == "append": + expected = pidx.append(other=pidx) + actual = gidx.append(other=gidx) + else: + expected = getattr(pidx, func)() + actual = getattr(gidx, func)() + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_argsort.py b/python/cudf/cudf/tests/indexes/index/methods/test_argsort.py new file mode 100644 index 00000000000..aca74c8885b --- /dev/null 
+++ b/python/cudf/cudf/tests/indexes/index/methods/test_argsort.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 10, 2, 100, -10],
+        ["z", "x", "a", "c", "b"],
+        [-10.2, 100.1, -100.2, 0.0, 0.23],
+    ],
+)
+def test_index_argsort(data):
+    pdi = pd.Index(data)
+    gdi = cudf.from_pandas(pdi)
+
+    assert_eq(pdi.argsort(), gdi.argsort())
diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_astype.py b/python/cudf/cudf/tests/indexes/index/methods/test_astype.py
new file mode 100644
index 00000000000..05ec3e792f9
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/index/methods/test_astype.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+
+import pandas as pd
+
+import cudf
+from cudf.testing import assert_eq
+
+
+def test_index_astype(all_supported_types_as_str, copy):
+    pdi = pd.Index([1, 2, 3])
+    gdi = cudf.from_pandas(pdi)
+
+    actual = gdi.astype(dtype=all_supported_types_as_str, copy=copy)
+    expected = pdi.astype(dtype=all_supported_types_as_str, copy=copy)
+
+    assert_eq(expected, actual)
+    assert_eq(pdi, gdi)
diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_copy.py b/python/cudf/cudf/tests/indexes/index/methods/test_copy.py
new file mode 100644
index 00000000000..3429adea5d1
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/index/methods/test_copy.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+from cudf.testing._utils import (
+    assert_column_memory_eq,
+    assert_column_memory_ne,
+)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        range(1, 5),
+        [1, 2, 3, 4],
+        pd.DatetimeIndex(["2001", "2002", "2003"]),
+        ["a", "b", "c"],
+        pd.CategoricalIndex(["a", "b", "c"]),
+    ],
+)
+@pytest.mark.parametrize("copy_on_write", [True, False])
+def test_index_copy(data, deep, copy_on_write):
+    name = "x"
+    cidx = cudf.Index(data)
+    pidx = cidx.to_pandas()
+
+    pidx_copy = pidx.copy(name=name, deep=deep)
+    cidx_copy = cidx.copy(name=name, deep=deep)
+
+    assert_eq(pidx_copy, cidx_copy)
+
+    with cudf.option_context("copy_on_write", copy_on_write):
+        if not isinstance(cidx, cudf.RangeIndex):
+            if (
+                isinstance(cidx._column, cudf.core.column.StringColumn)
+                or not deep
+                or (copy_on_write and not deep)
+            ):
+                # StringColumn is immutable; hence, deep copies of an
+                # Index with string dtype will share the same StringColumn.
+
+                # When `copy_on_write` is turned on, Index objects will
+                # have a unique column object, but they all point to the
+                # same data pointers.
+                assert_column_memory_eq(cidx._column, cidx_copy._column)
+            else:
+                assert_column_memory_ne(cidx._column, cidx_copy._column)
diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_difference.py b/python/cudf/cudf/tests/indexes/index/methods/test_difference.py
new file mode 100644
index 00000000000..18a08cb63b2
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/index/methods/test_difference.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
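+# Covers Index.difference vs pandas across dtypes, sort options, and index names, plus invalid-input and invalid-sort errors.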
+import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_GE_220, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import ( + assert_exceptions_equal, +) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 4, 5, 6], + [4, 5, 6, 10, 20, 30], + [10, 20, 30, 40, 50, 60], + ["1", "2", "3", "4", "5", "6"], + ["5", "6", "2", "a", "b", "c"], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + [1.0, 5.0, 6.0, 0.0, 1.3], + ["ab", "cd", "ef"], + pd.Series(["1", "2", "a", "3", None], dtype="category"), + range(0, 10), + [], + [1, 1, 2, 2], + ], +) +@pytest.mark.parametrize( + "other", + [ + [1, 2, 3, 4, 5, 6], + [4, 5, 6, 10, 20, 30], + [10, 20, 30, 40, 50, 60], + ["1", "2", "3", "4", "5", "6"], + ["5", "6", "2", "a", "b", "c"], + ["ab", "ef", None], + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + [1.0, 5.0, 6.0, 0.0, 1.3], + range(2, 4), + pd.Series(["1", "a", "3", None], dtype="category"), + [], + [2], + ], +) +@pytest.mark.parametrize("sort", [None, False, True]) +@pytest.mark.parametrize( + "name_data,name_other", + [("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")], +) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_index_difference(data, other, sort, name_data, name_other): + pd_data = pd.Index(data, name=name_data) + pd_other = pd.Index(other, name=name_other) + if ( + not PANDAS_GE_220 + and isinstance(pd_data.dtype, pd.CategoricalDtype) + and not isinstance(pd_other.dtype, pd.CategoricalDtype) + and pd_other.isnull().any() + ): + pytest.skip(reason="https://github.com/pandas-dev/pandas/issues/57318") + + if ( + not PANDAS_GE_220 + and len(pd_other) == 0 + and len(pd_data) != len(pd_data.unique()) + ): + pytest.skip(reason="Bug fixed in pandas-2.2+") + + gd_data = cudf.from_pandas(pd_data) + gd_other = cudf.from_pandas(pd_other) + + expected = pd_data.difference(pd_other, sort=sort) + actual = gd_data.difference(gd_other, sort=sort) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("other", ["a", 1, None]) +def test_index_difference_invalid_inputs(other): + pdi = pd.Index([1, 2, 3]) + gdi = cudf.Index([1, 2, 3]) + + assert_exceptions_equal( + pdi.difference, + gdi.difference, + ([other], {}), + ([other], {}), + ) + + +def test_index_difference_sort_error(): + pdi = pd.Index([1, 2, 3]) + gdi = cudf.Index([1, 2, 3]) + + assert_exceptions_equal( + pdi.difference, + gdi.difference, + ([pdi], {"sort": "A"}), + ([gdi], {"sort": "A"}), + ) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_drop_duplicates.py b/python/cudf/cudf/tests/indexes/index/methods/test_drop_duplicates.py new file mode 100644 index 00000000000..f569fcde38a --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_drop_duplicates.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], [], [1], [1, 2, 3]]) +def test_index_drop_duplicates(data, all_supported_types_as_str, request): + request.applymarker( + pytest.mark.xfail( + len(data) > 0 + and all_supported_types_as_str + in {"timedelta64[us]", "timedelta64[ms]", "timedelta64[s]"}, + reason=f"wrong result for {all_supported_types_as_str}", + ) + ) + pdi = pd.Index(data, dtype=all_supported_types_as_str) + gdi = cudf.Index(data, dtype=all_supported_types_as_str) + + assert_eq(pdi.drop_duplicates(), gdi.drop_duplicates()) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py b/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py index b572e9e156d..01cfb9e7aa2 100644 --- a/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py +++ b/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py @@ -7,6 +7,11 @@ from cudf.testing import assert_eq +def test_dropna_bad_how(): + with pytest.raises(ValueError): + cudf.Index([1]).dropna(how="foo") + + @pytest.mark.parametrize( "data, dtype", [ diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_duplicated.py b/python/cudf/cudf/tests/indexes/index/methods/test_duplicated.py new file mode 100644 index 00000000000..dce4c1ec5c0 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_duplicated.py @@ -0,0 +1,26 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 1, 1, 3, 2, 3], + [np.nan, 10, 15, 16, np.nan, 10, 16], + range(0, 10), + ["ab", "zx", None, "pq", "ab", None, "zx", None], + ], +) +@pytest.mark.parametrize("keep", ["first", "last", False]) +def test_index_duplicated(data, keep): + gs = cudf.Index(data) + ps = gs.to_pandas() + + expected = ps.duplicated(keep=keep) + actual = gs.duplicated(keep=keep) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_equals.py b/python/cudf/cudf/tests/indexes/index/methods/test_equals.py new file mode 100644 index 00000000000..91e943311e9 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_equals.py @@ -0,0 +1,122 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
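+# Covers Index.equals vs pandas, including RangeIndex comparisons and non-Index right-hand sides.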
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+def test_index_comparison():
+    start, stop = 10, 34
+    rg = cudf.RangeIndex(start, stop)
+    gi = cudf.Index(np.arange(start, stop))
+    assert rg.equals(gi)
+    assert gi.equals(rg)
+    assert not rg[:-1].equals(gi)
+    assert rg[:-1].equals(gi[:-1])
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 2, 3, 4, 5, 6],
+        [10, 20, 30, 40, 50, 60],
+        ["1", "2", "3", "4", "5", "6"],
+        [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+        ["a"],
+        ["b", "c", "d"],
+        [1],
+        [2, 3, 4],
+        [],
+        [10.0],
+        [1100.112, 2323.2322, 2323.2322],
+        ["abcd", "defgh", "werty", "poiu"],
+    ],
+)
+@pytest.mark.parametrize(
+    "other",
+    [
+        [1, 2, 3, 4, 5, 6],
+        [10, 20, 30, 40, 50, 60],
+        ["1", "2", "3", "4", "5", "6"],
+        [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+        ["a"],
+        [],
+        ["b", "c", "d"],
+        [1],
+        [2, 3, 4],
+        [10.0],
+        [1100.112, 2323.2322, 2323.2322],
+        ["abcd", "defgh", "werty", "poiu"],
+    ],
+)
+def test_index_equals(data, other):
+    pd_data = pd.Index(data)
+    pd_other = pd.Index(other)
+
+    gd_data = cudf.Index(data)
+    gd_other = cudf.Index(other)
+
+    expected = pd_data.equals(pd_other)
+    actual = gd_data.equals(gd_other)
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 2, 3, 4, 5, 6],
+        [10, 20, 30, 40, 50, 60],
+        ["1", "2", "3", "4", "5", "6"],
+        [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+        ["a"],
+        ["b", "c", "d"],
+        [1],
+        [2, 3, 4],
+        [],
+        [10.0],
+        [1100.112, 2323.2322, 2323.2322],
+        ["abcd", "defgh", "werty", "poiu"],
+    ],
+)
+@pytest.mark.parametrize(
+    "other",
+    [
+        [1, 2, 3, 4, 5, 6],
+        [10, 20, 30, 40, 50, 60],
+        ["1", "2", "3", "4", "5", "6"],
+        [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+        ["a"],
+        ["b", "c", "d"],
+        [1],
+        [2, 3, 4],
+        [],
+        [10.0],
+        [1100.112, 2323.2322, 2323.2322],
+        ["abcd", "defgh", "werty", "poiu"],
+    ],
+)
+def test_index_equal_misc(data, other):
+    pd_data = pd.Index(data)
+    pd_other = other
+
+    gd_data = cudf.Index(data)
+    gd_other = other
+
+    expected = pd_data.equals(pd_other)
+    actual = gd_data.equals(gd_other)
+    assert_eq(expected, actual)
+
+    expected = pd_data.equals(np.array(pd_other))
+    actual = gd_data.equals(np.array(gd_other))
+    assert_eq(expected, actual)
+
+    expected = pd_data.equals(pd.Series(pd_other))
+    actual = gd_data.equals(cudf.Series(gd_other))
+    assert_eq(expected, actual)
+
+    expected = pd_data.astype("category").equals(pd_other)
+    actual = gd_data.astype("category").equals(gd_other)
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_fillna.py b/python/cudf/cudf/tests/indexes/index/methods/test_fillna.py
new file mode 100644
index 00000000000..ab061abc73f
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/index/methods/test_fillna.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data,fill_value", + [ + ([1, 2, 3, 1, None, None], 1), + ([None, None, 3.2, 1, None, None], 10.0), + ([None, "a", "3.2", "z", None, None], "helloworld"), + (pd.Series(["a", "b", None], dtype="category"), "b"), + (pd.Series([None, None, 1.0], dtype="category"), 1.0), + ( + np.array([1, 2, 3, None], dtype="datetime64[s]"), + np.datetime64("2005-02-25"), + ), + ( + np.array( + [None, None, 122, 3242234, None, 6237846], + dtype="datetime64[ms]", + ), + np.datetime64("2005-02-25"), + ), + ], +) +def test_index_fillna(data, fill_value): + pdi = pd.Index(data) + gdi = cudf.Index(data) + + assert_eq( + pdi.fillna(fill_value), gdi.fillna(fill_value), exact=False + ) # Int64 v/s Float64 diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_find_label_range.py b/python/cudf/cudf/tests/indexes/index/methods/test_find_label_range.py new file mode 100644 index 00000000000..21bc684026e --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_find_label_range.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pytest + +import cudf + + +def test_index_find_label_range_index(): + # Monotonic Index + idx = cudf.Index(np.asarray([4, 5, 6, 10])) + assert idx.find_label_range(slice(4, 6)) == slice(0, 3, 1) + assert idx.find_label_range(slice(5, 10)) == slice(1, 4, 1) + assert idx.find_label_range(slice(0, 6)) == slice(0, 3, 1) + assert idx.find_label_range(slice(4, 11)) == slice(0, 4, 1) + + # Non-monotonic Index + idx_nm = cudf.Index(np.asarray([5, 4, 6, 10])) + assert idx_nm.find_label_range(slice(4, 6)) == slice(1, 3, 1) + assert idx_nm.find_label_range(slice(5, 10)) == slice(0, 4, 1) + # Last value not found + with pytest.raises(KeyError, match="not in index"): + idx_nm.find_label_range(slice(0, 6)) + # Last value not found + with pytest.raises(KeyError, match="not in index"): + idx_nm.find_label_range(slice(4, 11)) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_get_indexer.py b/python/cudf/cudf/tests/indexes/index/methods/test_get_indexer.py new file mode 100644 index 00000000000..7b1ac5dcf43 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_get_indexer.py @@ -0,0 +1,158 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
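+# Covers Index.get_indexer vs pandas for unique and duplicate numeric and string indexes under each fill method.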
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
+
+
+@pytest.mark.parametrize(
+    "data", [[1, 3, 6], [6, 1, 3]], ids=["monotonic", "non-monotonic"]
+)
+@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"])
+def test_get_indexer_single_unique_numeric(data, method):
+    key = list(range(0, 8))
+    pi = pd.Index(data)
+    gi = cudf.from_pandas(pi)
+
+    if (
+        # `method` only applicable to monotonic index
+        not pi.is_monotonic_increasing and method is not None
+    ):
+        assert_exceptions_equal(
+            lfunc=pi.get_indexer,
+            rfunc=gi.get_indexer,
+            lfunc_args_and_kwargs=([], {"key": key, "method": method}),
+            rfunc_args_and_kwargs=([], {"key": key, "method": method}),
+        )
+    else:
+        expected = pi.get_indexer(key, method=method)
+        got = gi.get_indexer(key, method=method)
+
+        assert_eq(expected, got)
+
+        with cudf.option_context("mode.pandas_compatible", True):
+            got = gi.get_indexer(key, method=method)
+            assert_eq(expected, got, check_dtype=True)
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        [-1, 2, 3, 6],
+        [6, 1, 3, 4],
+    ],
+    ids=["monotonic", "non-monotonic"],
+)
+@pytest.mark.parametrize("key", [[0, 3, 1], [6, 7]])
+@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"])
+@pytest.mark.parametrize("tolerance", [None, 1, 2])
+def test_get_indexer_single_duplicate_numeric(idx, key, method, tolerance):
+    pi = pd.Index(idx)
+    gi = cudf.from_pandas(pi)
+
+    if not pi.is_monotonic_increasing and method is not None:
+        assert_exceptions_equal(
+            lfunc=pi.get_indexer,
+            rfunc=gi.get_indexer,
+            lfunc_args_and_kwargs=([], {"key": key, "method": method}),
+            rfunc_args_and_kwargs=([], {"key": key, "method": method}),
+        )
+    else:
+        expected = pi.get_indexer(
+            key, method=method, tolerance=None if method is None else tolerance
+        )
+        got = gi.get_indexer(
+            key, method=method, tolerance=None if method is None else tolerance
+        )
+
+        assert_eq(expected, got)
+
+
+@pytest.mark.parametrize("idx", [["b", "f", "m", "q"], ["m", "f", "b", "q"]])
+@pytest.mark.parametrize("key", [["a", "f", "n", "z"], ["p", "p", "b"]])
+@pytest.mark.parametrize("method", [None, "ffill", "bfill"])
+def test_get_indexer_single_unique_string(idx, key, method):
+    pi = pd.Index(idx)
+    gi = cudf.from_pandas(pi)
+
+    if not pi.is_monotonic_increasing and method is not None:
+        assert_exceptions_equal(
+            lfunc=pi.get_indexer,
+            rfunc=gi.get_indexer,
+            lfunc_args_and_kwargs=([], {"key": key, "method": method}),
+            rfunc_args_and_kwargs=([], {"key": key, "method": method}),
+        )
+    else:
+        expected = pi.get_indexer(key, method=method)
+        got = gi.get_indexer(key, method=method)
+
+        assert_eq(expected, got)
+
+
+@pytest.mark.parametrize("idx", [["b", "m", "m", "q"], ["a", "f", "m", "q"]])
+@pytest.mark.parametrize("key", [["a"], ["f", "n", "z"]])
+@pytest.mark.parametrize("method", [None, "ffill", "bfill"])
+def test_get_indexer_single_duplicate_string(idx, key, method):
+    pi = pd.Index(idx)
+    gi = cudf.from_pandas(pi)
+
+    if (
+        # `method` only applicable to monotonic index
+        (not pi.is_monotonic_increasing and method is not None)
+        or not pi.is_unique
+    ):
+        assert_exceptions_equal(
+            lfunc=pi.get_indexer,
+            rfunc=gi.get_indexer,
+            lfunc_args_and_kwargs=([], {"key": key, "method": method}),
+            rfunc_args_and_kwargs=([], {"key": key, "method": method}),
+        )
+    else:
+        expected = pi.get_indexer(key, method=method)
+        got = gi.get_indexer(key, method=method)
+
+        assert_eq(expected, got)
+
+        with cudf.option_context("mode.pandas_compatible",
True): + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got, check_dtype=True) + + +@pytest.mark.parametrize( + "idx1", + [ + lambda: cudf.Index(["a", "b", "c"]), + lambda: cudf.RangeIndex(0, 10), + lambda: cudf.Index([1, 2, 3], dtype="category"), + lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), + lambda: cudf.MultiIndex.from_tuples( + [ + ("a", "a", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("b", "c", "a"), + ] + ), + ], +) +@pytest.mark.parametrize( + "idx2", + [ + lambda: cudf.Index(["a", "b", "c"]), + lambda: cudf.RangeIndex(0, 10), + lambda: cudf.Index([1, 2, 3], dtype="category"), + lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), + ], +) +def test_get_indexer_invalid(idx1, idx2): + idx1 = idx1() + idx2 = idx2() + assert_eq( + idx1.get_indexer(idx2), idx1.to_pandas().get_indexer(idx2.to_pandas()) + ) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_get_loc.py b/python/cudf/cudf/tests/indexes/index/methods/test_get_loc.py new file mode 100644 index 00000000000..96277cec167 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_get_loc.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize( + "idx", + [ + [1, 3, 3, 6], + [6, 1, 3, 3], + [4, 3, 2, 1, 0], + ], + ids=["monotonic increasing", "non-monotonic", "monotonic decreasing"], +) +@pytest.mark.parametrize("key", [0, 3, 6, 7, 4]) +def test_get_loc_duplicate_numeric(idx, key): + pi = pd.Index(idx) + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize("idx", [["b", "f", "m", "q"], ["m", "f", "b", "q"]]) +@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) +def test_get_loc_single_unique_string(idx, key): + pi = pd.Index(idx) + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize("idx", [["b", "m", "m", "q"], ["m", "f", "m", "q"]]) +@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) +def test_get_loc_single_duplicate_string(idx, key): + pi = pd.Index(idx) + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_intersection.py b/python/cudf/cudf/tests/indexes/index/methods/test_intersection.py new file mode 100644 index 00000000000..c142044a348 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_intersection.py @@ -0,0 +1,89 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
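+# Covers Index.intersection vs pandas, including pandas-compatible mode and the object-dtype cast error.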
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "idx1, idx2",
+    [
+        (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)),
+        (pd.RangeIndex(0, 10), pd.RangeIndex(-10, 20)),
+        (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")),
+        (pd.Index([0, 1, 2, 30], name=pd.NA), pd.Index([30, 0, 90, 100])),
+        (pd.Index([0, 1, 2, 30], name="a"), [90, 100]),
+        (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])),
+        (
+            pd.Index(["a", "b", "c", "d", "c"]),
+            pd.Index(["a", "c", "z"], name="abc"),
+        ),
+        (
+            pd.Index(["a", "b", "c", "d", "c"]),
+            pd.Index(["a", "b", "c", "d", "c"]),
+        ),
+        (pd.Index([True, False, True, True]), pd.Index([10, 11, 12, 0, 1, 2])),
+        (pd.Index([True, False, True, True]), pd.Index([True, True])),
+        (pd.RangeIndex(0, 10, name="a"), pd.Index([5, 6, 7], name="b")),
+        (pd.Index(["a", "b", "c"], dtype="category"), pd.Index(["a", "b"])),
+        (pd.Index([0, 1, 2], dtype="category"), pd.RangeIndex(0, 10)),
+        (pd.Index(["a", "b", "c"], name="abc"), []),
+        (pd.Index([], name="abc"), pd.RangeIndex(0, 4)),
+        (pd.Index([1, 2, 3]), pd.Index([1, 2], dtype="category")),
+        (pd.Index([]), pd.Index([1, 2], dtype="category")),
+    ],
+)
+@pytest.mark.parametrize("sort", [None, False, True])
+@pytest.mark.parametrize("pandas_compatible", [True, False])
+def test_intersection_index(idx1, idx2, sort, pandas_compatible):
+    expected = idx1.intersection(idx2, sort=sort)
+
+    with cudf.option_context("mode.pandas_compatible", pandas_compatible):
+        idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1
+        idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2
+
+        actual = idx1.intersection(idx2, sort=sort)
+
+    # TODO: Resolve the bool vs ints mixed issue
+    # once pandas has a direction on this issue
+    # https://github.com/pandas-dev/pandas/issues/44000
+    # Inexact comparison is only needed when exactly one side is boolean.
+    assert_eq(
+        expected,
+        actual,
+        exact=False
+        if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b")
+        or (idx1.dtype.kind != "b" and idx2.dtype.kind == "b")
+        else True,
+    )
+
+
+@pytest.mark.parametrize(
+    "idx1, idx2",
+    [
+        (pd.Index(["a", "b", "c"], dtype="category"), pd.Index([1, 2, 3])),
+    ],
+)
+@pytest.mark.parametrize("sort", [None, False, True])
+@pytest.mark.parametrize("pandas_compatible", [True, False])
+def test_intersection_index_error(idx1, idx2, sort, pandas_compatible):
+    expected = idx1.intersection(idx2, sort=sort)
+
+    with cudf.option_context("mode.pandas_compatible", pandas_compatible):
+        idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1
+        idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2
+
+        if pandas_compatible:
+            with pytest.raises(
+                ValueError,
+                match="Cannot convert numerical column to string column when dtype is an object dtype in pandas compatibility mode.",
+            ):
+                idx1.intersection(idx2, sort=sort)
+        else:
+            actual = idx1.intersection(idx2, sort=sort)
+
+            assert_eq(
+                expected,
+                actual,
+            )
diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_isin.py b/python/cudf/cudf/tests/indexes/index/methods/test_isin.py
new file mode 100644
index 00000000000..37377a50b91
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/index/methods/test_isin.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+ +import re + +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import PANDAS_GE_220 +from cudf.testing import assert_eq +from cudf.testing._utils import expect_warning_if + + +@pytest.mark.parametrize( + "index", + [ + pd.Index([]), + pd.Index(["a", "b", "c", "d", "e"]), + pd.Index([0, None, 9]), + pd.date_range("2019-01-01", periods=3), + ], +) +@pytest.mark.parametrize( + "values", + [ + [], + ["this", "is"], + [0, 19, 13], + ["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02 10:00:00"], + ], +) +def test_isin_index(index, values): + pidx = index + gidx = cudf.Index.from_pandas(pidx) + + is_dt_str = ( + next(iter(values), None) == "2019-01-01 04:00:00" + and len(pidx) + and pidx.dtype.kind == "M" + ) + with expect_warning_if(is_dt_str): + got = gidx.isin(values) + with expect_warning_if(PANDAS_GE_220 and is_dt_str): + expected = pidx.isin(values) + + assert_eq(got, expected) + + +@pytest.mark.parametrize( + "idx, values", + [ + (range(100, 1000, 10), [200, 600, 800]), + ([None, "a", "3.2", "z", None, None], ["a", "z"]), + (pd.Series(["a", "b", None], dtype="category"), [10, None]), + ], +) +def test_index_isin_values(idx, values): + gidx = cudf.Index(idx) + pidx = gidx.to_pandas() + + actual = gidx.isin(values) + expected = pidx.isin(values) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx, scalar", + [ + (range(0, -10, -2), -4), + ([None, "a", "3.2", "z", None, None], "x"), + (pd.Series(["a", "b", None], dtype="category"), 10), + ], +) +def test_index_isin_scalar_values(idx, scalar): + gidx = cudf.Index(idx) + + with pytest.raises( + TypeError, + match=re.escape( + f"only list-like objects are allowed to be passed " + f"to isin(), you passed a {type(scalar).__name__}" + ), + ): + gidx.isin(scalar) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_isna_notnull.py b/python/cudf/cudf/tests/indexes/index/methods/test_isna_notnull.py new file mode 100644 index 00000000000..33bfb957390 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_isna_notnull.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_index_isna_notna(): + idx = [1, None, 3, None, 5] + pidx = pd.Index(idx, name="idx") + gidx = cudf.Index(idx, name="idx") + assert_eq(gidx.isna(), pidx.isna()) + assert_eq(gidx.notna(), pidx.notna()) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_reductions.py b/python/cudf/cudf/tests/indexes/index/methods/test_reductions.py new file mode 100644 index 00000000000..eaf5e48cc01 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_reductions.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import numpy as np +import pytest + +import cudf + + +@pytest.mark.parametrize( + "func", + [ + lambda x: x.min(), + lambda x: x.max(), + lambda x: x.any(), + lambda x: x.all(), + ], +) +def test_reductions(func): + x = np.asarray([4, 5, 6, 10]) + idx = cudf.Index(np.asarray([4, 5, 6, 10])) + + assert func(x) == func(idx) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_rename.py b/python/cudf/cudf/tests/indexes/index/methods/test_rename.py new file mode 100644 index 00000000000..b0db84e20cd --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_rename.py @@ -0,0 +1,68 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+from cudf.testing._utils import (
+    SERIES_OR_INDEX_NAMES,
+)
+
+
+@pytest.mark.parametrize("initial_name", SERIES_OR_INDEX_NAMES)
+@pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES)
+def test_index_rename(initial_name, name):
+    pds = pd.Index([1, 2, 3], name=initial_name)
+    gds = cudf.Index(pds)
+
+    assert_eq(pds, gds)
+
+    expect = pds.rename(name)
+    got = gds.rename(name)
+
+    assert_eq(expect, got)
+
+    # From here on, test recursive creation and whether the name
+    # is handled correctly during recursive creation.
+    pds = pd.Index(expect)
+    gds = cudf.Index(got)
+
+    assert_eq(pds, gds)
+
+    pds = pd.Index(pds, name="abc")
+    gds = cudf.Index(gds, name="abc")
+    assert_eq(pds, gds)
+
+
+def test_index_rename_inplace():
+    pds = pd.Index([1, 2, 3], name="asdf")
+    gds = cudf.Index(pds)
+
+    # inplace=False should yield a shallow copy
+    gds_renamed_deep = gds.rename("new_name", inplace=False)
+
+    assert gds_renamed_deep._column.data_ptr == gds._column.data_ptr
+
+    # inplace=True returns None
+    expected_ptr = gds._column.data_ptr
+    gds.rename("new_name", inplace=True)
+
+    assert expected_ptr == gds._column.data_ptr
+
+
+def test_index_rename_preserves_arg():
+    idx1 = cudf.Index([1, 2, 3], name="orig_name")
+
+    # this should be an entirely new object
+    idx2 = idx1.rename("new_name", inplace=False)
+
+    assert idx2.name == "new_name"
+    assert idx1.name == "orig_name"
+
+    # a new object but referencing the same data
+    idx3 = cudf.Index(idx1, name="last_name")
+
+    assert idx3.name == "last_name"
+    assert idx1.name == "orig_name"
diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_set_names.py b/python/cudf/cudf/tests/indexes/index/methods/test_set_names.py
new file mode 100644
index 00000000000..11b0c371ed9
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/index/methods/test_set_names.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        pd.Index([1, 2, 3]),
+        pd.Index(["abc", "def", "ghi"]),
+        pd.RangeIndex(0, 10, 1),
+        pd.Index([0.324, 0.234, 1.3], name="abc"),
+    ],
+)
+@pytest.mark.parametrize("names", [None, "a", "new name", ["another name"]])
+def test_index_set_names(idx, names, inplace):
+    if inplace:
+        pi = idx.copy()
+    else:
+        pi = idx
+    gi = cudf.from_pandas(idx)
+
+    expected = pi.set_names(names=names, inplace=inplace)
+    actual = gi.set_names(names=names, inplace=inplace)
+
+    if inplace:
+        expected, actual = pi, gi
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("level", [1, [0], "abc"])
+@pytest.mark.parametrize("names", [None, "a"])
+def test_index_set_names_error(level, names):
+    pi = pd.Index([1, 2, 3], name="abc")
+    gi = cudf.from_pandas(pi)
+
+    assert_exceptions_equal(
+        lfunc=pi.set_names,
+        rfunc=gi.set_names,
+        lfunc_args_and_kwargs=([], {"names": names, "level": level}),
+        rfunc_args_and_kwargs=([], {"names": names, "level": level}),
+    )
diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_sort_values.py b/python/cudf/cudf/tests/indexes/index/methods/test_sort_values.py
new file mode 100644
index 00000000000..edc03010540
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/index/methods/test_sort_values.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + pd.Index([1, 10, 2, 100, -10], name="abc"), + pd.Index(["z", "x", "a", "c", "b"]), + pd.Index(["z", "x", "a", "c", "b"], dtype="category"), + pd.Index( + [-10.2, 100.1, -100.2, 0.0, 0.23], name="this is a float index" + ), + pd.Index([102, 1001, 1002, 0.0, 23], dtype="datetime64[ns]"), + pd.Index([13240.2, 1001, 100.2, 0.0, 23], dtype="datetime64[ns]"), + pd.RangeIndex(0, 10, 1), + pd.RangeIndex(0, -100, -2), + pd.Index([-10.2, 100.1, -100.2, 0.0, 23], dtype="timedelta64[ns]"), + ], +) +@pytest.mark.parametrize("return_indexer", [True, False]) +def test_index_sort_values(data, ascending, return_indexer): + pdi = data + gdi = cudf.from_pandas(pdi) + + expected = pdi.sort_values( + ascending=ascending, return_indexer=return_indexer + ) + actual = gdi.sort_values( + ascending=ascending, return_indexer=return_indexer + ) + + if return_indexer: + expected_indexer = expected[1] + actual_indexer = actual[1] + + assert_eq(expected_indexer, actual_indexer) + + expected = expected[0] + actual = actual[0] + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_to_arrow.py b/python/cudf/cudf/tests/indexes/index/methods/test_to_arrow.py new file mode 100644 index 00000000000..ee6b97cae6b --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_to_arrow.py @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3, 1, None, None], + [None, None, 3.2, 1, None, None], + [None, "a", "3.2", "z", None, None], + pd.Series(["a", "b", None], dtype="category"), + np.array([1, 2, 3, None], dtype="datetime64[s]"), + ], +) +def test_index_to_arrow(data): + pdi = pd.Index(data) + gdi = cudf.Index(data) + + expected_arrow_array = pa.Array.from_pandas(pdi) + got_arrow_array = gdi.to_arrow() + + assert_eq(expected_arrow_array, got_arrow_array) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_to_frame.py b/python/cudf/cudf/tests/indexes/index/methods/test_to_frame.py new file mode 100644 index 00000000000..223786b0f86 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_to_frame.py @@ -0,0 +1,24 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.api.extensions import no_default +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] +) +@pytest.mark.parametrize("data_name", [None, 1, "abc"]) +@pytest.mark.parametrize("index", [True, False]) +@pytest.mark.parametrize("name", [None, no_default, 1, "abc"]) +def test_index_to_frame(data, data_name, index, name): + pidx = pd.Index(data, name=data_name) + gidx = cudf.from_pandas(pidx) + + expected = pidx.to_frame(index=index, name=name) + actual = gidx.to_frame(index=index, name=name) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_to_pandas.py b/python/cudf/cudf/tests/indexes/index/methods/test_to_pandas.py new file mode 100644 index 00000000000..c505222f67a --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_to_pandas.py @@ -0,0 +1,77 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import datetime + +import numpy as np +import pandas as pd +import pyarrow as pa +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data,expected_dtype", + [ + ([10, 11, 12], pd.Int64Dtype()), + ([0.1, 10.2, 12.3], pd.Float64Dtype()), + (["abc", None, "def"], pd.StringDtype()), + ], +) +def test_index_to_pandas_nullable(data, expected_dtype): + gi = cudf.Index(data) + pi = gi.to_pandas(nullable=True) + expected = pd.Index(data, dtype=expected_dtype) + + assert_eq(pi, expected) + + +@pytest.mark.parametrize( + "data", + [ + range(1), + np.array([1, 2], dtype="datetime64[ns]"), + np.array([1, 2], dtype="timedelta64[ns]"), + ], +) +def test_index_to_pandas_nullable_notimplemented(data): + idx = cudf.Index(data) + with pytest.raises(NotImplementedError): + idx.to_pandas(nullable=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + pd.Interval(1, 2), + ], +) +def test_index_to_pandas_arrow_type_nullable_raises(scalar): + data = [scalar, None] + idx = cudf.Index(data) + with pytest.raises(ValueError): + idx.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + ], +) +def test_index_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + idx = cudf.Index(pa_array) + result = idx.to_pandas(arrow_type=True) + expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) + pd.testing.assert_index_equal(result, expected) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_to_series.py b/python/cudf/cudf/tests/indexes/index/methods/test_to_series.py new file mode 100644 index 00000000000..9c63a5d5fcd --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_to_series.py @@ -0,0 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [1, 10, 2, 100, -10], + ["z", "x", "a", "c", "b"], + [-10.2, 100.1, -100.2, 0.0, 0.23], + ], +) +def test_index_to_series(data): + pdi = pd.Index(data) + gdi = cudf.from_pandas(pdi) + + assert_eq(pdi.to_series(), gdi.to_series()) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_tolist.py b/python/cudf/cudf/tests/indexes/index/methods/test_tolist.py new file mode 100644 index 00000000000..6b0454c647b --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_tolist.py @@ -0,0 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import re + +import pytest + +import cudf + + +@pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], []]) +def test_index_tolist(data, all_supported_types_as_str): + gdi = cudf.Index(data, dtype=all_supported_types_as_str) + + with pytest.raises( + TypeError, + match=re.escape( + r"cuDF does not support conversion to host memory " + r"via the `tolist()` method. Consider using " + r"`.to_arrow().to_pylist()` to construct a Python list." + ), + ): + gdi.tolist() diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_union.py b/python/cudf/cudf/tests/indexes/index/methods/test_union.py new file mode 100644 index 00000000000..9bbdf5e4bc6 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_union.py @@ -0,0 +1,72 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "idx1, idx2", + [ + (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), + (pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)), + (pd.RangeIndex(0, 10, 2), pd.RangeIndex(1, 5, 3)), + (pd.RangeIndex(1, 5, 3), pd.RangeIndex(0, 10, 2)), + (pd.RangeIndex(1, 10, 3), pd.RangeIndex(1, 5, 2)), + (pd.RangeIndex(1, 5, 2), pd.RangeIndex(1, 10, 3)), + (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 3)), + (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 6)), + (pd.RangeIndex(1, 100, 6), pd.RangeIndex(1, 50, 3)), + (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), + (pd.Index([0, 1, 2, 30], name="a"), pd.Index([90, 100])), + (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), + (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), + (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), + ( + pd.IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4)]), + pd.IntervalIndex.from_tuples([(0, 2), (2, 4)]), + ), + (pd.RangeIndex(0, 10), pd.Index([8, 1, 2, 4])), + (pd.Index([8, 1, 2, 4], name="a"), pd.Index([8, 1, 2, 4], name="b")), + ( + pd.Index([8, 1, 2, 4], name="a"), + pd.Index([], name="b", dtype="int64"), + ), + (pd.Index([], dtype="int64", name="a"), pd.Index([10, 12], name="b")), + (pd.Index([True, True, True], name="a"), pd.Index([], dtype="bool")), + ( + pd.Index([True, True, True]), + pd.Index([False, True], dtype="bool", name="b"), + ), + ], +) +@pytest.mark.parametrize("sort", [None, False, True]) +def test_union_index(idx1, idx2, sort): + expected = idx1.union(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 + idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 + + actual = idx1.union(idx2, sort=sort) + + assert_eq(expected, actual) + + +def test_union_bool_with_other(): + idx1 = cudf.Index([True, True, True]) + idx2 = cudf.Index([0, 1], name="b") + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(cudf.errors.MixedTypeError): + idx1.union(idx2) + + +def test_union_unsigned_vs_signed( + signed_integer_types_as_str, unsigned_integer_types_as_str +): + idx1 = cudf.Index([10, 20, 30], dtype=signed_integer_types_as_str) + idx2 = cudf.Index([0, 1], dtype=unsigned_integer_types_as_str) + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(cudf.errors.MixedTypeError): + idx1.union(idx2) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_where.py b/python/cudf/cudf/tests/indexes/index/methods/test_where.py new file mode 100644 index 00000000000..46fa521e6e2 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_where.py @@ -0,0 +1,190 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
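+# Covers Index.where vs pandas for numeric, string, categorical, and MultiIndex inputs, including expected errors.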
+
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+from cudf.testing._utils import (
+    assert_exceptions_equal,
+)
+
+
+@pytest.mark.parametrize(
+    "data,condition,other,error",
+    [
+        (pd.Index(range(5)), pd.Index(range(5)) > 0, None, None),
+        (pd.Index([1, 2, 3]), pd.Index([1, 2, 3]) != 2, None, None),
+        (pd.Index(list("abc")), pd.Index(list("abc")) == "c", None, None),
+        (
+            pd.Index(list("abc")),
+            pd.Index(list("abc")) == "c",
+            pd.Index(list("xyz")),
+            None,
+        ),
+        (pd.Index(range(5)), pd.Index(range(4)) > 0, None, ValueError),
+        (
+            pd.Index(range(5)),
+            pd.Index(range(5)) > 1,
+            10,
+            None,
+        ),
+        (
+            pd.Index(np.arange(10)),
+            (pd.Index(np.arange(10)) % 3) == 0,
+            -pd.Index(np.arange(10)),
+            None,
+        ),
+        (
+            pd.Index([1, 2, np.nan]),
+            pd.Index([1, 2, np.nan]) == 4,
+            None,
+            None,
+        ),
+        (
+            pd.Index([1, 2, np.nan]),
+            pd.Index([1, 2, np.nan]) != 4,
+            None,
+            None,
+        ),
+        (
+            pd.Index([-2, 3, -4, -79]),
+            [True, True, True],
+            None,
+            ValueError,
+        ),
+        (
+            pd.Index([-2, 3, -4, -79]),
+            [True, True, True, False],
+            None,
+            None,
+        ),
+        (
+            pd.Index([-2, 3, -4, -79]),
+            [True, True, True, False],
+            17,
+            None,
+        ),
+        (pd.Index(list("abcdgh")), pd.Index(list("abcdgh")) != "g", "3", None),
+        (
+            pd.Index(list("abcdgh")),
+            pd.Index(list("abcdg")) != "g",
+            "3",
+            ValueError,
+        ),
+        (
+            pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]),
+            pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) != "a",
+            "a",
+            None,
+        ),
+        (
+            pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]),
+            pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) != "a",
+            "b",
+            None,
+        ),
+        (
+            pd.MultiIndex.from_tuples(
+                list(
+                    zip(
+                        *[
+                            [
+                                "bar",
+                                "bar",
+                                "baz",
+                                "baz",
+                                "foo",
+                                "foo",
+                                "qux",
+                                "qux",
+                            ],
+                            [
+                                "one",
+                                "two",
+                                "one",
+                                "two",
+                                "one",
+                                "two",
+                                "one",
+                                "two",
+                            ],
+                        ],
+                        strict=True,
+                    )
+                )
+            ),
+            pd.MultiIndex.from_tuples(
+                list(
+                    zip(
+                        *[
+                            [
+                                "bar",
+                                "bar",
+                                "baz",
+                                "baz",
+                                "foo",
+                                "foo",
+                                "qux",
+                                "qux",
+                            ],
+                            [
+                                "one",
+                                "two",
+                                "one",
+                                "two",
+                                "one",
+                                "two",
+                                "one",
+                                "two",
+                            ],
+                        ],
+                        strict=True,
+                    )
+                )
+            )
+            != "a",
+            None,
+            NotImplementedError,
+        ),
+    ],
+)
+def test_index_where(data, condition, other, error):
+    ps = data
+    gs = cudf.from_pandas(data)
+
+    ps_condition = condition
+    if isinstance(condition, pd.Index):
+        gs_condition = cudf.from_pandas(condition)
+    else:
+        gs_condition = condition
+
+    ps_other = other
+    if isinstance(other, pd.Index):
+        gs_other = cudf.from_pandas(other)
+    else:
+        gs_other = other
+
+    if error is None:
+        if hasattr(ps, "dtype") and isinstance(ps.dtype, pd.CategoricalDtype):
+            expect = ps.where(ps_condition, other=ps_other)
+            got = gs.where(gs_condition, other=gs_other)
+            np.testing.assert_array_equal(
+                expect.codes,
+                got.codes.astype(expect.codes.dtype).fillna(-1).to_numpy(),
+            )
+            assert_eq(expect.categories, got.categories)
+        else:
+            assert_eq(
+                ps.where(ps_condition, other=ps_other),
+                gs.where(gs_condition, other=gs_other).to_pandas(),
+            )
+    else:
+        assert_exceptions_equal(
+            lfunc=ps.where,
+            rfunc=gs.where,
+            lfunc_args_and_kwargs=([ps_condition], {"other": ps_other}),
+            rfunc_args_and_kwargs=([gs_condition], {"other": gs_other}),
+        )
diff --git a/python/cudf/cudf/tests/indexes/index/test_attributes.py b/python/cudf/cudf/tests/indexes/index/test_attributes.py
index ee4a1654a10..3ccf155875c 100644
--- a/python/cudf/cudf/tests/indexes/index/test_attributes.py
+++ b/python/cudf/cudf/tests/indexes/index/test_attributes.py
@@ -1,11 +1,13 @@
 # 
Copyright (c) 2025, NVIDIA CORPORATION. import datetime +import re import numpy as np import pandas as pd import pytest import cudf +from cudf.testing import assert_eq @pytest.mark.parametrize( @@ -67,3 +69,151 @@ def test_index_is_unique_monotonic(testlist): assert index.is_unique == index_pd.is_unique assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing + + +def test_name(): + idx = cudf.Index(np.asarray([4, 5, 6, 10]), name="foo") + assert idx.name == "foo" + + +def test_index_names(): + idx = cudf.Index([1, 2, 3], name="idx") + assert idx.names == ("idx",) + + +@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) +def test_index_empty(data, all_supported_types_as_str): + pdi = pd.Index(data, dtype=all_supported_types_as_str) + gdi = cudf.Index(data, dtype=all_supported_types_as_str) + + assert pdi.empty == gdi.empty + + +@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) +def test_index_size(data, all_supported_types_as_str): + pdi = pd.Index(data, dtype=all_supported_types_as_str) + gdi = cudf.Index(data, dtype=all_supported_types_as_str) + + assert pdi.size == gdi.size + + +@pytest.mark.parametrize("data", [[], [1]]) +def test_index_iter_error(data, all_supported_types_as_str): + gdi = cudf.Index(data, dtype=all_supported_types_as_str) + + with pytest.raises( + TypeError, + match=re.escape( + f"{gdi.__class__.__name__} object is not iterable. " + f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " + f"if you wish to iterate over the values." + ), + ): + iter(gdi) + + +@pytest.mark.parametrize("data", [[], [1]]) +def test_index_values_host(data, all_supported_types_as_str, request): + request.applymarker( + pytest.mark.xfail( + len(data) > 0 + and all_supported_types_as_str + in {"timedelta64[us]", "timedelta64[ms]", "timedelta64[s]"}, + reason=f"wrong result for {all_supported_types_as_str}", + ) + ) + gdi = cudf.Index(data, dtype=all_supported_types_as_str) + pdi = pd.Index(data, dtype=all_supported_types_as_str) + + np.testing.assert_array_equal(gdi.values_host, pdi.values) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + ["a", "v", "d"], + [234.243, 2432.3, None], + [True, False, True], + pd.Series(["a", " ", "v"], dtype="category"), + pd.IntervalIndex.from_breaks([0, 1, 2, 3]), + ], +) +@pytest.mark.parametrize( + "func", + [ + "is_numeric", + "is_boolean", + "is_integer", + "is_floating", + "is_object", + "is_categorical", + "is_interval", + ], +) +def test_index_type_methods(data, func): + pidx = pd.Index(data) + gidx = cudf.from_pandas(pidx) + + with pytest.warns(FutureWarning): + expected = getattr(pidx, func)() + with pytest.warns(FutureWarning): + actual = getattr(gidx, func)() + + if gidx.dtype == np.dtype("bool") and func == "is_object": + assert_eq(False, actual) + else: + assert_eq(expected, actual) + + +def test_index_values(): + gidx = cudf.Index([1, 2, 3]) + pidx = gidx.to_pandas() + + assert_eq(pidx.values, gidx.values) + + +def test_index_null_values(): + gidx = cudf.Index([1.0, None, 3, 0, None]) + with pytest.raises(ValueError): + gidx.values + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + pytest.param( + [np.nan, 10, 15, 16], + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/49818" + ), + ), + range(0, 10), + [np.nan, None, 10, 20], + ["ab", "zx", "pq"], + ["ab", "zx", None, "pq"], + ], +) +def test_index_hasnans(data): + gs = cudf.Index(data, nan_as_null=False) + if isinstance(gs, cudf.RangeIndex): + 
with pytest.raises(NotImplementedError): + gs.to_pandas(nullable=True) + else: + ps = gs.to_pandas(nullable=True) + # Check type to avoid mixing Python bool and NumPy bool + assert isinstance(gs.hasnans, bool) + assert gs.hasnans == ps.hasnans + + +@pytest.mark.parametrize("data", [[0, 1, 2], [1.1, 2.3, 4.5]]) +@pytest.mark.parametrize("needle", [0, 1, 2.3]) +def test_index_contains_float_int(data, numeric_types_as_str, needle): + gidx = cudf.Index(data=data, dtype=numeric_types_as_str) + pidx = gidx.to_pandas() + + actual = needle in gidx + expected = needle in pidx + + assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/indexes/index/test_constructor.py b/python/cudf/cudf/tests/indexes/index/test_constructor.py index a46c4fec49e..e23969c7cb7 100644 --- a/python/cudf/cudf/tests/indexes/index/test_constructor.py +++ b/python/cudf/cudf/tests/indexes/index/test_constructor.py @@ -1,8 +1,11 @@ # Copyright (c) 2025, NVIDIA CORPORATION. +import re + import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -49,6 +52,277 @@ def test_infer_timedelta_index(data, timedelta_types_as_str): assert_eq(pdi, gdi) +def test_pandas_as_index(): + # Define Pandas Indexes + pdf_int_index = pd.Index([1, 2, 3, 4, 5]) + pdf_uint_index = pd.Index([1, 2, 3, 4, 5]) + pdf_float_index = pd.Index([1.0, 2.0, 3.0, 4.0, 5.0]) + pdf_datetime_index = pd.DatetimeIndex( + [1000000, 2000000, 3000000, 4000000, 5000000] + ) + pdf_category_index = pd.CategoricalIndex(["a", "b", "c", "b", "a"]) + + # Define cudf Indexes + gdf_int_index = cudf.Index(pdf_int_index) + gdf_uint_index = cudf.Index(pdf_uint_index) + gdf_float_index = cudf.Index(pdf_float_index) + gdf_datetime_index = cudf.Index(pdf_datetime_index) + gdf_category_index = cudf.Index(pdf_category_index) + + # Check instance types + assert isinstance(gdf_int_index, cudf.Index) + assert isinstance(gdf_uint_index, cudf.Index) + assert isinstance(gdf_float_index, cudf.Index) + assert isinstance(gdf_datetime_index, cudf.DatetimeIndex) + assert isinstance(gdf_category_index, cudf.CategoricalIndex) + + # Check equality + assert_eq(pdf_int_index, gdf_int_index) + assert_eq(pdf_uint_index, gdf_uint_index) + assert_eq(pdf_float_index, gdf_float_index) + assert_eq(pdf_datetime_index, gdf_datetime_index) + assert_eq(pdf_category_index, gdf_category_index) + + assert_eq( + pdf_category_index.codes, + gdf_category_index.codes.astype( + pdf_category_index.codes.dtype + ).to_numpy(), + ) + + +def test_from_pandas_str(): + idx = ["a", "b", "c"] + pidx = pd.Index(idx, name="idx") + gidx_1 = cudf.Index(idx, name="idx") + gidx_2 = cudf.from_pandas(pidx) + + assert_eq(gidx_1, gidx_2) + + +def test_from_pandas_gen(): + idx = [2, 4, 6] + pidx = pd.Index(idx, name="idx") + gidx_1 = cudf.Index(idx, name="idx") + gidx_2 = cudf.from_pandas(pidx) + + assert_eq(gidx_1, gidx_2) + + +@pytest.mark.parametrize( + "data", + [ + range(0), + range(1), + range(0, 1), + range(0, 5), + range(1, 10), + range(1, 10, 1), + range(1, 10, 3), + range(10, 1, -3), + range(-5, 10), + ], +) +def test_range_index_from_range(data): + assert_eq(pd.Index(data), cudf.Index(data)) + + +@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) +@pytest.mark.parametrize("name", [1, "a", None]) +def test_index_basic(data, all_supported_types_as_str, name, request): + request.applymarker( + pytest.mark.xfail( + len(data) > 0 + and all_supported_types_as_str + in {"timedelta64[us]", "timedelta64[ms]", "timedelta64[s]"}, + reason=f"wrong result for 
{all_supported_types_as_str}", + ) + ) + pdi = pd.Index(data, dtype=all_supported_types_as_str, name=name) + gdi = cudf.Index(data, dtype=all_supported_types_as_str, name=name) + + assert_eq(pdi, gdi) + + +@pytest.mark.parametrize( + "data,nan_idx,NA_idx", + [([1, 2, 3, None], None, 3), ([2, 3, np.nan, None], 2, 3)], +) +def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): + idx = cudf.Index(data, nan_as_null=nan_as_null) + + if nan_as_null is not False: + if nan_idx is not None: + assert idx[nan_idx] is cudf.NA + else: + if nan_idx is not None: + assert np.isnan(idx[nan_idx]) + + if NA_idx is not None: + assert idx[NA_idx] is cudf.NA + + +def test_index_constructor_integer(default_integer_bitwidth): + got = cudf.Index([1, 2, 3]) + expect = cudf.Index([1, 2, 3], dtype=f"int{default_integer_bitwidth}") + + assert_eq(expect, got) + + +def test_index_constructor_float(default_float_bitwidth): + got = cudf.Index([1.0, 2.0, 3.0]) + expect = cudf.Index( + [1.0, 2.0, 3.0], dtype=f"float{default_float_bitwidth}" + ) + + assert_eq(expect, got) + + +def test_index_error_list_index(): + s = cudf.Series([[1, 2], [2], [4]]) + with pytest.raises( + NotImplementedError, + match=re.escape( + "Unsupported column type passed to create an " + "Index: " + ), + ): + cudf.Index(s) + + +@pytest.mark.parametrize( + "data", + [ + [ + pd.Timestamp("1970-01-01 00:00:00.000000001"), + pd.Timestamp("1970-01-01 00:00:00.000000002"), + 12, + 20, + ], + [ + pd.Timedelta(10), + pd.Timedelta(20), + 12, + 20, + ], + [1, 2, 3, 4], + ], +) +def test_index_mixed_dtype_error(data): + pi = pd.Index(data, dtype="object") + with pytest.raises(TypeError): + cudf.Index(pi) + + +@pytest.mark.parametrize("cls", [pd.DatetimeIndex, pd.TimedeltaIndex]) +def test_index_date_duration_freq_error(cls): + s = cls([1, 2, 3], freq="infer") + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises(NotImplementedError): + cudf.Index(s) + + +def test_index_empty_from_pandas(all_supported_types_as_str): + pidx = pd.Index([], dtype=all_supported_types_as_str) + gidx = cudf.from_pandas(pidx) + + assert_eq(pidx, gidx) + + +def test_empty_index_init(): + pidx = pd.Index([]) + gidx = cudf.Index([]) + + assert_eq(pidx, gidx) + + +@pytest.mark.parametrize("data", [[1, 2, 3], range(0, 10)]) +def test_index_with_index_dtype(request, data, all_supported_types_as_str): + request.applymarker( + pytest.mark.xfail( + isinstance(data, list) + and all_supported_types_as_str + in {"timedelta64[us]", "timedelta64[ms]", "timedelta64[s]"}, + reason=f"wrong result for {all_supported_types_as_str}", + ) + ) + request.applymarker( + pytest.mark.xfail( + all_supported_types_as_str == "category", + raises=AttributeError, + reason=f"cuDF bug in Column.astype with {all_supported_types_as_str}", + ) + ) + pidx = pd.Index(data) + gidx = cudf.Index(data) + + expected = pd.Index(pidx, dtype=all_supported_types_as_str) + actual = cudf.Index(gidx, dtype=all_supported_types_as_str) + + assert_eq(expected, actual) + + +def test_period_index_error(): + pidx = pd.PeriodIndex(data=[pd.Period("2020-01")]) + with pytest.raises(NotImplementedError): + cudf.from_pandas(pidx) + with pytest.raises(NotImplementedError): + cudf.Index(pidx) + with pytest.raises(NotImplementedError): + cudf.Series(pidx) + with pytest.raises(NotImplementedError): + cudf.Series(pd.Series(pidx)) + with pytest.raises(NotImplementedError): + cudf.Series(pd.array(pidx)) + + +@pytest.mark.parametrize("value", [cudf.DataFrame(range(1)), 11]) +def 
test_index_from_dataframe_scalar_raises(value): + with pytest.raises(TypeError): + cudf.Index(value) + + +@pytest.mark.parametrize( + "data", + [ + cp.ones(5, dtype=cp.float16), + np.ones(5, dtype="float16"), + pd.Series([0.1, 1.2, 3.3], dtype="float16"), + pytest.param( + pa.array(np.ones(5, dtype="float16")), + marks=pytest.mark.xfail( + reason="https://issues.apache.org/jira/browse/ARROW-13762" + ), + ), + ], +) +def test_index_raises_float16(data): + with pytest.raises(TypeError): + cudf.Index(data) + + +def test_from_pandas_rangeindex_return_rangeindex(): + pidx = pd.RangeIndex(start=3, stop=9, step=3, name="a") + result = cudf.Index.from_pandas(pidx) + expected = cudf.RangeIndex(start=3, stop=9, step=3, name="a") + assert_eq(result, expected, exact=True) + + +def test_Index_init_with_nans(): + with cudf.option_context("mode.pandas_compatible", True): + gi = cudf.Index([1, 2, 3, np.nan]) + assert gi.dtype == np.dtype("float64") + pi = pd.Index([1, 2, 3, np.nan]) + assert_eq(pi, gi) + + +def test_roundtrip_index_plc_column(): + index = cudf.Index([1]) + expect = cudf.Index(index) + actual = cudf.Index.from_pylibcudf(*expect.to_pylibcudf()) + assert_eq(expect, actual) + + def test_categorical_index_with_dtype(): dtype = cudf.CategoricalDtype(categories=["a", "z", "c"]) gi = cudf.Index(["z", "c", "a"], dtype=dtype) diff --git a/python/cudf/cudf/tests/indexes/multiindex/indexing/test_getitem.py b/python/cudf/cudf/tests/indexes/multiindex/indexing/test_getitem.py new file mode 100644 index 00000000000..6605aa0ab94 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/indexing/test_getitem.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_multiindex_getitem(): + pidx = pd.MultiIndex( + [ + ["a", "b", "c"], + ["house", "store", "forest"], + ["clouds", "clear", "storm"], + ["fire", "smoke", "clear"], + [ + np.datetime64("2001-01-01", "ns"), + np.datetime64("2002-01-01", "ns"), + np.datetime64("2003-01-01", "ns"), + ], + ], + [ + [0, 0, 0, 0, 1, 1, 2], + [1, 1, 1, 1, 0, 0, 2], + [0, 0, 2, 2, 2, 0, 1], + [0, 0, 0, 1, 2, 0, 1], + [1, 0, 1, 2, 0, 0, 1], + ], + ) + pidx.names = ["alpha", "location", "weather", "sign", "timestamp"] + gidx = cudf.from_pandas(pidx) + assert_eq(pidx[0], gidx[0]) + + +@pytest.mark.parametrize( + "key", + [0, 1, [], [0, 1], slice(None), slice(0, 0), slice(0, 1), slice(0, 2)], +) +def test_multiindex_indexing(key): + gi = cudf.MultiIndex.from_frame( + cudf.DataFrame({"a": [1, 2, 3], "b": [True, False, False]}) + ) + pi = gi.to_pandas() + + assert_eq(gi[key], pi[key], exact=False) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_append.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_append.py new file mode 100644 index 00000000000..ecf2c9fb97e --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_append.py @@ -0,0 +1,52 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + names=("number", "color"), + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], + names=("number1", "color2"), + ), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + ), + ], +) +@pytest.mark.parametrize( + "other", + [ + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + names=("number", "color"), + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], + names=("number1", "color2"), + ), + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], + ), + ], +) +def test_multiindex_append(data, other): + pdi = data + other_pd = other + + gdi = cudf.from_pandas(data) + other_gd = cudf.from_pandas(other) + + expected = pdi.append(other_pd) + actual = gdi.append(other_gd) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_get_indexer.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_get_indexer.py new file mode 100644 index 00000000000..9ea8b7555e0 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_get_indexer.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.core._compat import ( + PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION, +) +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize( + "data", + [ + [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)], + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)], + [(1, 1, 1), (1, 1, 2), (1, 1, 24), (1, 2, 3), (2, 1, 1), (2, 2, 1)], + ], +) +@pytest.mark.parametrize("key", [[(1, 2, 3)], [(9, 9, 9)]]) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +def test_get_indexer_multi_numeric(data, key, method): + idx = pd.MultiIndex.from_tuples(data) + pi = idx.sort_values() + gi = cudf.from_pandas(pi) + + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got) + + with cudf.option_context("mode.pandas_compatible", True): + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got, check_dtype=True) + + +@pytest.mark.parametrize( + "key", + [ + ((1, 2, 3),), + ((2, 1, 1),), + ((9, 9, 9),), + ], +) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +def test_get_indexer_multi_numeric_deviate(key, method): + pi = pd.MultiIndex.from_tuples( + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] + ).sort_values() + gi = cudf.from_pandas(pi) + + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got) + + +@pytest.mark.parametrize("method", ["ffill", "bfill"]) +@pytest.mark.skipif( + PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + reason="Fails in older versions of pandas", +) +def test_get_indexer_multi_error(method): + pi = pd.MultiIndex.from_tuples( + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] + ) + gi = cudf.from_pandas(pi) + + assert_exceptions_equal( + pi.get_indexer, + gi.get_indexer, + lfunc_args_and_kwargs=( + [], + {"target": ((1, 2, 3),), "method": method}, + ), + rfunc_args_and_kwargs=( + [], + {"target": ((1, 2, 3),), "method": 
method}, + ), + ) + + +@pytest.mark.parametrize( + "data", + [ + [ + ("a", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "b"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "a"), + ("a", "b", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("b", "c", "a"), + ], + ], +) +@pytest.mark.parametrize( + "key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]] +) +@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) +def test_get_indexer_multi_string(data, key, method): + idx = pd.MultiIndex.from_tuples(data) + pi = idx.sort_values() + gi = cudf.from_pandas(pi) + + expected = pi.get_indexer(key, method=method) + got = gi.get_indexer(key, method=method) + + assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_get_loc.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_get_loc.py new file mode 100644 index 00000000000..4048a0f9c60 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_get_loc.py @@ -0,0 +1,141 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal, expect_warning_if + + +@pytest.mark.parametrize( + "data", + [ + [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)], + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)], + [(1, 1, 1), (1, 1, 2), (1, 1, 2), (1, 2, 3), (2, 1, 1), (2, 2, 1)], + ], +) +@pytest.mark.parametrize("key", [1, (1, 2), (1, 2, 3), (2, 1, 1), (9, 9, 9)]) +def test_get_loc_multi_numeric(data, key): + idx = pd.MultiIndex.from_tuples(data) + pi = idx.sort_values() + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "key, result", + [ + (1, slice(1, 5, 1)), # deviates + ((1, 2), slice(1, 3, 1)), + ((1, 2, 3), slice(1, 2, None)), + ((2, 1, 1), slice(0, 1, None)), + ((9, 9, 9), None), + ], +) +def test_get_loc_multi_numeric_deviate(key, result): + pi = pd.MultiIndex.from_tuples( + [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 1), (1, 1, 1), (2, 2, 1)] + ) + gi = cudf.from_pandas(pi) + + with expect_warning_if( + isinstance(key, tuple), pd.errors.PerformanceWarning + ): + key_flag = key not in pi + + if key_flag: + with expect_warning_if( + isinstance(key, tuple), pd.errors.PerformanceWarning + ): + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = result + got = gi.get_loc(key) + + assert_eq(expected, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + ("a", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "b"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "a"), + ("a", "b", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "a"), + ("a", "b", "c"), + ("b", "a", "a"), + ("a", "a", "b"), + ("a", "b", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "a"), + ("a", "a", "b"), + ("a", "a", "b"), + ("a", "b", "c"), + ("b", "a", "a"), + ("b", "c", "a"), + ], + [ + ("a", "a", "b"), + ("b", 
"a", "a"), + ("b", "a", "a"), + ("a", "a", "a"), + ("a", "b", "a"), + ("b", "c", "a"), + ], + ], +) +@pytest.mark.parametrize( + "key", ["a", ("a", "a"), ("a", "b", "c"), ("b", "c", "a"), ("z", "z", "z")] +) +def test_get_loc_multi_string(data, key): + idx = pd.MultiIndex.from_tuples(data) + pi = idx.sort_values() + gi = cudf.from_pandas(pi) + + if key not in pi: + assert_exceptions_equal( + lfunc=pi.get_loc, + rfunc=gi.get_loc, + lfunc_args_and_kwargs=([], {"key": key}), + rfunc_args_and_kwargs=([], {"key": key}), + ) + else: + expected = pi.get_loc(key) + got = gi.get_loc(key) + + assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_isin.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_isin.py new file mode 100644 index 00000000000..0cce814a2a0 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_isin.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal + + +@pytest.mark.parametrize( + "data", + [ + pd.MultiIndex.from_arrays( + [[1, 2, 3], ["red", "blue", "green"]], names=("number", "color") + ), + pd.MultiIndex.from_arrays([[], []], names=("number", "color")), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 10, 100], ["red", "blue", "green", "pink", "white"]], + names=("number", "color"), + ), + pd.MultiIndex.from_product( + [[0, 1], ["red", "blue", "green"]], names=("number", "color") + ), + ], +) +@pytest.mark.parametrize( + "values,level,err", + [ + ([(1, "red"), (2, "blue"), (0, "green")], None, None), + (["red", "orange", "yellow"], "color", None), + (["red", "white", "yellow"], "color", None), + ([0, 1, 2, 10, 11, 15], "number", None), + ([0, 1, 2, 10, 11, 15], None, TypeError), + (pd.Series([0, 1, 2, 10, 11, 15]), None, TypeError), + (pd.Index([0, 1, 2, 10, 11, 15]), None, TypeError), + (pd.Index([0, 1, 2, 8, 11, 15]), "number", None), + (pd.Index(["red", "white", "yellow"]), "color", None), + ([(1, "red"), (3, "red")], None, None), + (((1, "red"), (3, "red")), None, None), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3], ["red", "blue", "green"]], + names=("number", "color"), + ), + None, + None, + ), + ( + pd.MultiIndex.from_arrays([[], []], names=("number", "color")), + None, + None, + ), + ( + pd.MultiIndex.from_arrays( + [ + [1, 2, 3, 10, 100], + ["red", "blue", "green", "pink", "white"], + ], + names=("number", "color"), + ), + None, + None, + ), + ], +) +def test_isin_multiindex(data, values, level, err): + pmdx = data + gmdx = cudf.from_pandas(data) + + if err is None: + expected = pmdx.isin(values, level=level) + if isinstance(values, pd.MultiIndex): + values = cudf.from_pandas(values) + got = gmdx.isin(values, level=level) + + assert_eq(got, expected) + else: + assert_exceptions_equal( + lfunc=pmdx.isin, + rfunc=gmdx.isin, + lfunc_args_and_kwargs=([values], {"level": level}), + rfunc_args_and_kwargs=([values], {"level": level}), + check_exception_type=False, + ) diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_arrow.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_arrow.py new file mode 100644 index 00000000000..e4151a6083f --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_to_arrow.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pyarrow as pa + +import cudf +from cudf.testing import assert_eq + + +def test_multiindex_to_arrow(): + pdf = pd.DataFrame( + { + "a": [1, 2, 1, 2, 3], + "b": [1.0, 2.0, 3.0, 4.0, 5.0], + "c": np.array([1, 2, 3, None, 5], dtype="datetime64[s]"), + "d": ["a", "b", "c", "d", "e"], + } + ) + pdf["a"] = pdf["a"].astype("category") + df = cudf.from_pandas(pdf) + gdi = cudf.MultiIndex.from_frame(df) + + expected = pa.Table.from_pandas(pdf) + got = gdi.to_arrow() + + assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py b/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py index 2afb5c4f179..131024c521f 100644 --- a/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py +++ b/python/cudf/cudf/tests/indexes/multiindex/test_constructors.py @@ -4,6 +4,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -181,3 +182,20 @@ def test_multiindex_duplicate_names(): ) assert_eq(gi, pi) + + +def test_multiindex_from_arrow(): + pdf = pd.DataFrame( + { + "a": [1, 2, 1, 2, 3], + "b": [1.0, 2.0, 3.0, 4.0, 5.0], + "c": np.array([1, 2, 3, None, 5], dtype="datetime64[s]"), + "d": ["a", "b", "c", "d", "e"], + } + ) + pdf["a"] = pdf["a"].astype("category") + ptb = pa.Table.from_pandas(pdf) + gdi = cudf.MultiIndex.from_arrow(ptb) + pdi = pd.MultiIndex.from_frame(pdf) + + assert_eq(pdi, gdi) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/indexing/__init__.py b/python/cudf/cudf/tests/indexes/rangeindex/indexing/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/indexes/rangeindex/indexing/test_getitem.py b/python/cudf/cudf/tests/indexes/rangeindex/indexing/test_getitem.py new file mode 100644 index 00000000000..d33a0c57f43 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/indexing/test_getitem.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_rangeindex_slice_attr_name(): + start, stop = 0, 10 + rg = cudf.RangeIndex(start, stop, name="myindex") + sliced_rg = rg[0:9] + assert rg.name == sliced_rg.name + + +@pytest.mark.parametrize( + "start,stop,step", + [(1, 10, 1), (1, 10, 3), (10, -17, -1), (10, -17, -3)], +) +def test_index_rangeindex_get_item_basic(start, stop, step): + pridx = pd.RangeIndex(start, stop, step) + gridx = cudf.RangeIndex(start, stop, step) + + for i in range(-len(pridx), len(pridx)): + assert pridx[i] == gridx[i] + + +@pytest.mark.parametrize("start,stop,step", [(1, 10, 3), (10, 1, -3)]) +def test_index_rangeindex_get_item_out_of_bounds(start, stop, step): + gridx = cudf.RangeIndex(start, stop, step) + with pytest.raises(IndexError): + gridx[4] + + +@pytest.mark.parametrize("start,stop,step", [(10, 1, 1), (-17, 10, -3)]) +def test_index_rangeindex_get_item_null_range(start, stop, step): + gridx = cudf.RangeIndex(start, stop, step) + + with pytest.raises(IndexError): + gridx[0] + + +@pytest.mark.parametrize( + "start,stop,step", + [(-17, 21, 2), (21, -17, -3), (0, 0, 1), (0, 1, -3), (10, 0, 5)], +) +@pytest.mark.parametrize( + "sl", + [ + slice(1, 7, 1), + slice(1, 7, 2), + slice(-1, 7, 1), + slice(-1, 7, 2), + slice(-3, 7, 2), + slice(7, 1, -2), + slice(7, -3, -2), + slice(None, None, 1), + slice(0, None, 2), + slice(0, None, 3), + slice(0, 0, 3), + ], +) +def test_index_rangeindex_get_item_slices(start, stop, step, sl): + pridx = pd.RangeIndex(start, stop, step) + gridx = cudf.RangeIndex(start, stop, step) + + assert_eq(pridx[sl], gridx[sl]) + + +def test_rangeindex_apply_boolean_mask_user_option(default_integer_bitwidth): + # Test that RangeIndex is materialized into 32 bit index under user + # configuration for apply boolean mask operation. + idx = cudf.RangeIndex(0, 8) + mask = [True, True, True, False, False, False, True, False] + actual = idx[mask] + expected = cudf.Index([0, 1, 2, 6], dtype=f"int{default_integer_bitwidth}") + assert_eq(expected, actual) + + +def test_df_slice_empty_index(): + idx = cudf.RangeIndex(0) + assert isinstance(idx[:1], cudf.RangeIndex) + with pytest.raises(IndexError): + idx[1] diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_any_all.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_any_all.py new file mode 100644 index 00000000000..4d5dbf72543 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_any_all.py @@ -0,0 +1,12 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf + + +@pytest.mark.parametrize("data", [range(-3, 3), range(1, 3), range(0)]) +def test_rangeindex_all(data): + result = cudf.RangeIndex(data).all() + expected = cudf.Index(list(data)).all() + assert result == expected diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_append.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_append.py new file mode 100644 index 00000000000..d793151d87f --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_append.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
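+
+# Appending labels that extend the existing range pattern should keep the
+# result a RangeIndex (e.g. RangeIndex(0, 10) plus [10] becomes
+# RangeIndex(0, 11)) rather than materializing an integer column; appending
+# an empty sequence is a no-op.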
+ + +import cudf +from cudf.testing import assert_eq + + +def test_rangeindex_append_return_rangeindex(): + idx = cudf.RangeIndex(0, 10) + result = idx.append([]) + assert_eq(idx, result) + + result = idx.append(cudf.Index([10])) + expected = cudf.RangeIndex(0, 11) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_dropna.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_dropna.py new file mode 100644 index 00000000000..6f6f61a771d --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_dropna.py @@ -0,0 +1,12 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf +from cudf.testing import assert_eq + + +def test_rangeindex_dropna(): + ri = cudf.RangeIndex(range(2)) + result = ri.dropna() + expected = ri.copy() + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_factorize.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_factorize.py new file mode 100644 index 00000000000..d22ddf3c58a --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_factorize.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("data", [range(2), range(2, -1, -1)]) +def test_rangeindex_factorize(sort, data): + res_codes, res_uniques = cudf.RangeIndex(data).factorize(sort=sort) + exp_codes, exp_uniques = cudf.Index(list(data)).factorize(sort=sort) + assert_eq(res_codes, exp_codes) + assert_eq(res_uniques, exp_uniques) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_find_label_range.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_find_label_range.py new file mode 100644 index 00000000000..f6666d4213e --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_find_label_range.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import cudf + + +def test_index_find_label_range_rangeindex(): + """Cudf specific""" + # step > 0 + # 3, 8, 13, 18 + ridx = cudf.RangeIndex(3, 20, 5) + assert ridx.find_label_range(slice(3, 8)) == slice(0, 2, 1) + assert ridx.find_label_range(slice(0, 7)) == slice(0, 1, 1) + assert ridx.find_label_range(slice(3, 19)) == slice(0, 4, 1) + assert ridx.find_label_range(slice(2, 21)) == slice(0, 4, 1) + + # step < 0 + # 20, 15, 10, 5 + ridx = cudf.RangeIndex(20, 3, -5) + assert ridx.find_label_range(slice(15, 10)) == slice(1, 3, 1) + assert ridx.find_label_range(slice(10, 15, -1)) == slice(2, 0, -1) + assert ridx.find_label_range(slice(10, 0)) == slice(2, 4, 1) + assert ridx.find_label_range(slice(30, 13)) == slice(0, 2, 1) + assert ridx.find_label_range(slice(30, 0)) == slice(0, 4, 1) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_indexer.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_indexer.py new file mode 100644 index 00000000000..b67aebac4cf --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_indexer.py @@ -0,0 +1,39 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
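+
+# ``get_indexer`` maps each target label to its integer position in the
+# index, returning -1 for labels that are absent; ``method`` selects an
+# inexact-match strategy ("ffill", "bfill" or "nearest") and ``tolerance``
+# bounds how far an inexact match may lie from the requested label.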
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "rng",
+    [
+        range(1, 20, 3),
+        range(20, 35, 3),
+        range(35, 77, 3),
+        range(77, 110, 3),
+    ],
+)
+@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"])
+@pytest.mark.parametrize("tolerance", [None, 0, 1, 13, 20])
+def test_get_indexer_rangeindex(rng, method, tolerance):
+    key = list(rng)
+    pi = pd.RangeIndex(3, 100, 4)
+    gi = cudf.from_pandas(pi)
+
+    expected = pi.get_indexer(
+        key, method=method, tolerance=None if method is None else tolerance
+    )
+    got = gi.get_indexer(
+        key, method=method, tolerance=None if method is None else tolerance
+    )
+
+    assert_eq(expected, got)
+
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gi.get_indexer(
+            key, method=method, tolerance=None if method is None else tolerance
+        )
+    assert_eq(expected, got, check_dtype=True)
diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_loc.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_loc.py
new file mode 100644
index 00000000000..d453f99566c
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_get_loc.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
+
+
+@pytest.mark.parametrize("key", list(range(1, 110, 13)))
+def test_get_loc_rangeindex(key):
+    pi = pd.RangeIndex(3, 100, 4)
+    gi = cudf.from_pandas(pi)
+    if (
+        (key not in pi)
+        # a key before the first element raises KeyError
+        or (key < pi.start)
+        # a key after the last element raises KeyError
+        or (key >= pi.stop)
+    ):
+        assert_exceptions_equal(
+            lfunc=pi.get_loc,
+            rfunc=gi.get_loc,
+            lfunc_args_and_kwargs=([], {"key": key}),
+            rfunc_args_and_kwargs=([], {"key": key}),
+        )
+    else:
+        expected = pi.get_loc(key)
+        got = gi.get_loc(key)
+
+        assert_eq(expected, got)
diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_intersection.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_intersection.py
new file mode 100644
index 00000000000..93ef401a69e
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_intersection.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+
+import cudf
+from cudf.testing import assert_eq
+
+
+def test_rangeindex_intersection_default_user_option(default_integer_bitwidth):
+    # Test that RangeIndex is materialized into 32 bit index under user
+    # configuration for intersection operation.
+    idx1 = cudf.RangeIndex(0, 100)
+    # Intersecting two RangeIndex will _always_ result in a RangeIndex, use
+    # regular index here to force materializing.
+    idx2 = cudf.Index([50, 102])
+
+    expected = cudf.Index([50], dtype=f"int{default_integer_bitwidth}")
+    actual = idx1.intersection(idx2)
+
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_join.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_join.py
new file mode 100644
index 00000000000..a471d86230f
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_join.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+
+import cudf
+from cudf.testing import assert_eq
+
+
+def test_rangeindex_join_user_option(default_integer_bitwidth):
+    # Test that RangeIndex is materialized into 32 bit index under user
+    # configuration for join.
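+    # (The ``default_integer_bitwidth`` fixture is assumed to set the cudf
+    # option of the same name for the duration of the test, roughly
+    # equivalent to ``cudf.set_option("default_integer_bitwidth", 32)``.)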
+    idx1 = cudf.RangeIndex(0, 10, name="a")
+    idx2 = cudf.RangeIndex(5, 15, name="b")
+
+    actual = idx1.join(idx2, how="inner", sort=True)
+    expected = idx1.to_pandas().join(idx2.to_pandas(), how="inner", sort=True)
+    assert actual.dtype == cudf.dtype(f"int{default_integer_bitwidth}")
+    # exact=False to ignore dtype comparison,
+    # because `default_integer_bitwidth` is a cudf-only option
+    assert_eq(expected, actual, exact=False)
diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_nunique.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_nunique.py
index 1683051c9ad..0f996ef7996 100644
--- a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_nunique.py
+++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_nunique.py
@@ -1,8 +1,41 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.
 
+import numpy as np
 import pandas as pd
+import pytest
 
 import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "rangeindex",
+    [
+        range(np.random.default_rng(seed=0).integers(0, 100)),
+        range(9, 12, 2),
+        range(20, 30),
+        range(100, 1000, 10),
+        range(0, 10, -2),
+        range(0, -10, 2),
+        range(0, -10, -2),
+    ],
+)
+@pytest.mark.parametrize(
+    "func",
+    ["nunique", "min", "max", "any", "values"],
+)
+def test_rangeindex_methods(rangeindex, func):
+    gidx = cudf.RangeIndex(rangeindex)
+    pidx = gidx.to_pandas()
+
+    if func == "values":
+        expected = pidx.values
+        actual = gidx.values
+    else:
+        expected = getattr(pidx, func)()
+        actual = getattr(gidx, func)()
+
+    assert_eq(expected, actual)
 
 
 def test_nunique():
diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py
new file mode 100644
index 00000000000..01841d11647
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+import pandas as pd
+
+import cudf
+
+
+def test_rename_shallow_copy():
+    idx = pd.Index([1])
+    result = idx.rename("a")
+    assert idx.to_numpy(copy=False) is result.to_numpy(copy=False)
+
+    idx = cudf.Index([1])
+    result = idx.rename("a")
+    assert idx._column is result._column
diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_repeat.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_repeat.py
new file mode 100644
index 00000000000..ed2b838c65a
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_repeat.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+
+import cudf
+from cudf.testing import assert_eq
+
+
+def test_rangeindex_repeat_user_option(default_integer_bitwidth):
+    # Test that RangeIndex is materialized into 32 bit index under user
+    # configuration for repeat operation.
+    idx = cudf.RangeIndex(0, 3)
+    actual = idx.repeat(3)
+    expected = cudf.Index(
+        [0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=f"int{default_integer_bitwidth}"
+    )
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_searchsorted.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_searchsorted.py
new file mode 100644
index 00000000000..8a2e3eec796
--- /dev/null
+++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_searchsorted.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
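+
+# For a monotonically increasing index, ``searchsorted`` returns the
+# insertion point that keeps it sorted: for a value already present,
+# ``side="left"`` gives the position of the value itself and
+# ``side="right"`` the position just past it, hence the ``i`` / ``i + 1``
+# assertions below.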
+ + +import cudf + + +def test_index_rangeindex_searchsorted(): + # step > 0 + ridx = cudf.RangeIndex(-13, 17, 4) + for i in range(len(ridx)): + assert i == ridx.searchsorted(ridx[i], side="left") + assert i + 1 == ridx.searchsorted(ridx[i], side="right") diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_take.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_take.py new file mode 100644 index 00000000000..7804f71a1ab --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_take.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf +from cudf.testing import assert_eq + + +def test_rangeindex_take_default_user_option(default_integer_bitwidth): + # Test that RangeIndex is materialized into 32 bit index under user + # configuration for take operation. + idx = cudf.RangeIndex(0, 100) + actual = idx.take([0, 3, 7, 62]) + expected = cudf.Index( + [0, 3, 7, 62], dtype=f"int{default_integer_bitwidth}" + ) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_union.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_union.py new file mode 100644 index 00000000000..e38f2ca761f --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_union.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf +from cudf.testing import assert_eq + + +def test_rangeindex_union_default_user_option(default_integer_bitwidth): + # Test that RangeIndex is materialized into 32 bit index under user + # configuration for union operation. + idx1 = cudf.RangeIndex(0, 2) + idx2 = cudf.RangeIndex(5, 6) + + expected = cudf.Index([0, 1, 5], dtype=f"int{default_integer_bitwidth}") + actual = idx1.union(idx2) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_unique.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_unique.py new file mode 100644 index 00000000000..685e7f69ae6 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_unique.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd + +import cudf +from cudf.testing import assert_eq + + +def test_rangeindex_unique_shallow_copy(): + ri_pandas = pd.RangeIndex(1) + result = ri_pandas.unique() + assert result is not ri_pandas + + ri_cudf = cudf.RangeIndex(1) + result = ri_cudf.unique() + assert result is not ri_cudf + assert_eq(result, ri_cudf) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_where.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_where.py new file mode 100644 index 00000000000..3e960875cbc --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_where.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + + +import cudf +from cudf.testing import assert_eq + + +def test_rangeindex_where_user_option(default_integer_bitwidth): + # Test that RangeIndex is materialized into 32 bit index under user + # configuration for where operation. 
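+    # ``where`` keeps each element whose mask entry is True and substitutes
+    # ``other`` (-1 below) elsewhere; the result can no longer be expressed
+    # as a range, so it materializes at the configured bitwidth.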
+ idx = cudf.RangeIndex(0, 10) + mask = [True, False, True, False, True, False, True, False, True, False] + actual = idx.where(mask, -1) + expected = cudf.Index( + [0, -1, 2, -1, 4, -1, 6, -1, 8, -1], + dtype=f"int{default_integer_bitwidth}", + ) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py b/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py index 60ee8b432e6..63ebdd7e472 100644 --- a/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py @@ -4,6 +4,8 @@ import pytest import cudf +from cudf.testing import assert_eq +from cudf.testing._utils import assert_exceptions_equal def test_rangeindex_contains(): @@ -22,3 +24,33 @@ def test_range_index_is_unique_monotonic(start, stop, step): assert index.is_unique == index_pd.is_unique assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing + + +@pytest.mark.parametrize("data", [range(2), [10, 11, 12]]) +def test_index_contains_hashable(data): + gidx = cudf.Index(data) + pidx = gidx.to_pandas() + + assert_exceptions_equal( + lambda: [] in gidx, + lambda: [] in pidx, + lfunc_args_and_kwargs=((),), + rfunc_args_and_kwargs=((),), + ) + + +def test_bool_rangeindex_raises(): + assert_exceptions_equal( + lfunc=bool, + rfunc=bool, + lfunc_args_and_kwargs=[[pd.RangeIndex(0)]], + rfunc_args_and_kwargs=[[cudf.RangeIndex(0)]], + ) + + +def test_from_pandas_rangeindex(): + idx1 = pd.RangeIndex(start=0, stop=4, step=1, name="myindex") + idx2 = cudf.from_pandas(idx1) + + assert_eq(idx1.values, idx2.values) + assert idx1.name == idx2.name diff --git a/python/cudf/cudf/tests/indexes/rangeindex/test_binops.py b/python/cudf/cudf/tests/indexes/rangeindex/test_binops.py new file mode 100644 index 00000000000..af39d17864a --- /dev/null +++ b/python/cudf/cudf/tests/indexes/rangeindex/test_binops.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "op, expected, expected_kind", + [ + (lambda idx: 2**idx, [2, 4, 8, 16], "int"), + (lambda idx: idx**2, [1, 4, 9, 16], "int"), + (lambda idx: idx / 2, [0.5, 1, 1.5, 2], "float"), + (lambda idx: 2 / idx, [2, 1, 2 / 3, 0.5], "float"), + (lambda idx: idx % 3, [1, 2, 0, 1], "int"), + (lambda idx: 3 % idx, [0, 1, 0, 3], "int"), + ], +) +def test_rangeindex_binops_user_option( + op, expected, expected_kind, default_integer_bitwidth +): + # Test that RangeIndex is materialized into 32 bit index under user + # configuration for binary operation. + idx = cudf.RangeIndex(1, 5) + actual = op(idx) + expected = cudf.Index( + expected, dtype=f"{expected_kind}{default_integer_bitwidth}" + ) + assert_eq( + expected, + actual, + ) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/test_constructors.py b/python/cudf/cudf/tests/indexes/rangeindex/test_constructors.py index 9bbe35ed33b..b54a781ff38 100644 --- a/python/cudf/cudf/tests/indexes/rangeindex/test_constructors.py +++ b/python/cudf/cudf/tests/indexes/rangeindex/test_constructors.py @@ -1,18 +1,46 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
import pandas as pd +import pytest import cudf from cudf.testing import assert_eq -def test_from_pandas_rangeindex(): - idx1 = pd.RangeIndex(start=0, stop=4, step=1, name="myindex") - idx2 = cudf.from_pandas(idx1) +def test_rangeindex_arg_validation(): + with pytest.raises(TypeError): + cudf.RangeIndex("1") - # Check index - assert_eq(idx1.values, idx2.values) - assert idx1.name == idx2.name + with pytest.raises(TypeError): + cudf.RangeIndex(1, "2") + + with pytest.raises(TypeError): + cudf.RangeIndex(1, 3, "1") + + with pytest.raises(ValueError): + cudf.RangeIndex(1, dtype="float64") + + with pytest.raises(ValueError): + cudf.RangeIndex(1, dtype="uint64") + + +def test_rangeindex_name_not_hashable(): + with pytest.raises(ValueError): + cudf.RangeIndex(range(2), name=["foo"]) + + with pytest.raises(ValueError): + cudf.RangeIndex(range(2)).copy(name=["foo"]) + + +@pytest.mark.parametrize("klass", [cudf.RangeIndex, pd.RangeIndex]) +@pytest.mark.parametrize("name_inner", [None, "a"]) +@pytest.mark.parametrize("name_outer", [None, "b"]) +def test_rangeindex_accepts_rangeindex(klass, name_inner, name_outer): + result = cudf.RangeIndex(klass(range(1), name=name_inner), name=name_outer) + expected = pd.RangeIndex( + pd.RangeIndex(range(1), name=name_inner), name=name_outer + ) + assert_eq(result, expected) def test_from_pandas_rangeindex_step(): diff --git a/python/cudf/cudf/tests/input_output/test_pickling.py b/python/cudf/cudf/tests/input_output/test_pickling.py index ed3483f296b..25450dab8a8 100644 --- a/python/cudf/cudf/tests/input_output/test_pickling.py +++ b/python/cudf/cudf/tests/input_output/test_pickling.py @@ -1,5 +1,6 @@ # Copyright (c) 2018-2025, NVIDIA CORPORATION. +import io import pickle import numpy as np @@ -132,3 +133,32 @@ def test_pickle_string_column(slices): out = pickle.loads(pickled) assert_eq(Series._from_column(out), Series._from_column(input_col)) + + +@pytest.mark.parametrize( + "names", + [ + ["a", "b", "c"], + [None, None, None], + ["aa", "aa", "aa"], + ["bb", "aa", "aa"], + None, + ], +) +def test_pickle_roundtrip_multiindex(names): + df = DataFrame( + { + "one": [1, 2, 3], + "two": [True, False, True], + "three": ["ab", "cd", "ef"], + "four": [0.2, 0.1, -10.2], + } + ) + expected_df = df.set_index(["one", "two", "three"]) + expected_df.index.names = names + local_file = io.BytesIO() + + pickle.dump(expected_df, local_file) + local_file.seek(0) + actual_df = pickle.load(local_file) + assert_eq(expected_df, actual_df) diff --git a/python/cudf/cudf/tests/series/methods/test_take.py b/python/cudf/cudf/tests/series/methods/test_take.py new file mode 100644 index 00000000000..9bc3e88cb5a --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_take.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
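+
+# ``Series.take`` selects rows by integer position rather than by label, so
+# it behaves identically for the default RangeIndex and for arbitrary string
+# labels; both tests below compare against the pandas result.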
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize("ntake", [0, 1, 123, 122, 200]) +def test_series_take(ntake): + rng = np.random.default_rng(seed=0) + nelem = 123 + + psr = pd.Series(rng.integers(0, 20, nelem)) + gsr = cudf.Series(psr) + + take_indices = rng.integers(0, len(gsr), ntake) + + actual = gsr.take(take_indices) + expected = psr.take(take_indices) + + assert_eq(actual, expected) + + +def test_series_take_positional(): + psr = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) + + gsr = cudf.Series.from_pandas(psr) + + take_indices = [1, 2, 0, 3] + + expect = psr.take(take_indices) + got = gsr.take(take_indices) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index dbb193019b2..c28d65fe45e 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -4,35 +4,15 @@ Test related to Index """ -import datetime import operator -import re -import cupy as cp import numpy as np import pandas as pd -import pyarrow as pa import pytest import cudf -from cudf.api.extensions import no_default -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex +from cudf.core.index import CategoricalIndex from cudf.testing import assert_eq -from cudf.testing._utils import ( - ALL_TYPES, - NUMERIC_TYPES, - OTHER_TYPES, - SERIES_OR_INDEX_NAMES, - assert_column_memory_eq, - assert_column_memory_ne, - assert_exceptions_equal, - expect_warning_if, -) def test_df_set_index_from_series(): @@ -62,97 +42,6 @@ def test_df_set_index_from_name(): assert list(sliced_strided.index.values) == [2, 4, 6] -def test_df_slice_empty_index(): - df = cudf.DataFrame() - assert isinstance(df.index, RangeIndex) - assert isinstance(df.index[:1], RangeIndex) - with pytest.raises(IndexError): - df.index[1] - - -def test_index_find_label_range_genericindex(): - # Monotonic Index - idx = cudf.Index(np.asarray([4, 5, 6, 10])) - assert idx.find_label_range(slice(4, 6)) == slice(0, 3, 1) - assert idx.find_label_range(slice(5, 10)) == slice(1, 4, 1) - assert idx.find_label_range(slice(0, 6)) == slice(0, 3, 1) - assert idx.find_label_range(slice(4, 11)) == slice(0, 4, 1) - - # Non-monotonic Index - idx_nm = cudf.Index(np.asarray([5, 4, 6, 10])) - assert idx_nm.find_label_range(slice(4, 6)) == slice(1, 3, 1) - assert idx_nm.find_label_range(slice(5, 10)) == slice(0, 4, 1) - # Last value not found - with pytest.raises(KeyError) as raises: - idx_nm.find_label_range(slice(0, 6)) - raises.match("not in index") - # Last value not found - with pytest.raises(KeyError) as raises: - idx_nm.find_label_range(slice(4, 11)) - raises.match("not in index") - - -def test_index_find_label_range_rangeindex(): - """Cudf specific""" - # step > 0 - # 3, 8, 13, 18 - ridx = RangeIndex(3, 20, 5) - assert ridx.find_label_range(slice(3, 8)) == slice(0, 2, 1) - assert ridx.find_label_range(slice(0, 7)) == slice(0, 1, 1) - assert ridx.find_label_range(slice(3, 19)) == slice(0, 4, 1) - assert ridx.find_label_range(slice(2, 21)) == slice(0, 4, 1) - - # step < 0 - # 20, 15, 10, 5 - ridx = RangeIndex(20, 3, -5) - assert ridx.find_label_range(slice(15, 10)) == slice(1, 3, 1) - assert ridx.find_label_range(slice(10, 15, -1)) == slice(2, 0, -1) - assert ridx.find_label_range(slice(10, 0)) == slice(2, 4, 1) - assert ridx.find_label_range(slice(30, 
13)) == slice(0, 2, 1) - assert ridx.find_label_range(slice(30, 0)) == slice(0, 4, 1) - - -def test_index_comparision(): - start, stop = 10, 34 - rg = cudf.RangeIndex(start, stop) - gi = cudf.Index(np.arange(start, stop)) - assert rg.equals(gi) - assert gi.equals(rg) - assert not rg[:-1].equals(gi) - assert rg[:-1].equals(gi[:-1]) - - -@pytest.mark.parametrize( - "func", - [ - lambda x: x.min(), - lambda x: x.max(), - lambda x: x.any(), - lambda x: x.all(), - ], -) -def test_reductions(func): - x = np.asarray([4, 5, 6, 10]) - idx = cudf.Index(np.asarray([4, 5, 6, 10])) - - assert func(x) == func(idx) - - -def test_name(): - idx = cudf.Index(np.asarray([4, 5, 6, 10]), name="foo") - assert idx.name == "foo" - - -def test_index_immutable(): - start, stop = 10, 34 - rg = RangeIndex(start, stop) - with pytest.raises(TypeError): - rg[1] = 5 - gi = cudf.Index(np.arange(start, stop)) - with pytest.raises(TypeError): - gi[1] = 5 - - def test_categorical_index(): pdf = pd.DataFrame() pdf["a"] = [1, 2, 3] @@ -183,103 +72,6 @@ def test_categorical_index(): ) -def test_pandas_as_index(): - # Define Pandas Indexes - pdf_int_index = pd.Index([1, 2, 3, 4, 5]) - pdf_uint_index = pd.Index([1, 2, 3, 4, 5]) - pdf_float_index = pd.Index([1.0, 2.0, 3.0, 4.0, 5.0]) - pdf_datetime_index = pd.DatetimeIndex( - [1000000, 2000000, 3000000, 4000000, 5000000] - ) - pdf_category_index = pd.CategoricalIndex(["a", "b", "c", "b", "a"]) - - # Define cudf Indexes - gdf_int_index = Index(pdf_int_index) - gdf_uint_index = Index(pdf_uint_index) - gdf_float_index = Index(pdf_float_index) - gdf_datetime_index = Index(pdf_datetime_index) - gdf_category_index = Index(pdf_category_index) - - # Check instance types - assert isinstance(gdf_int_index, Index) - assert isinstance(gdf_uint_index, Index) - assert isinstance(gdf_float_index, Index) - assert isinstance(gdf_datetime_index, DatetimeIndex) - assert isinstance(gdf_category_index, CategoricalIndex) - - # Check equality - assert_eq(pdf_int_index, gdf_int_index) - assert_eq(pdf_uint_index, gdf_uint_index) - assert_eq(pdf_float_index, gdf_float_index) - assert_eq(pdf_datetime_index, gdf_datetime_index) - assert_eq(pdf_category_index, gdf_category_index) - - assert_eq( - pdf_category_index.codes, - gdf_category_index.codes.astype( - pdf_category_index.codes.dtype - ).to_numpy(), - ) - - -@pytest.mark.parametrize("initial_name", SERIES_OR_INDEX_NAMES) -@pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES) -def test_index_rename(initial_name, name): - pds = pd.Index([1, 2, 3], name=initial_name) - gds = Index(pds) - - assert_eq(pds, gds) - - expect = pds.rename(name) - got = gds.rename(name) - - assert_eq(expect, got) - """ - From here on testing recursive creation - and if name is being handles in recursive creation. 
- """ - pds = pd.Index(expect) - gds = Index(got) - - assert_eq(pds, gds) - - pds = pd.Index(pds, name="abc") - gds = Index(gds, name="abc") - assert_eq(pds, gds) - - -def test_index_rename_inplace(): - pds = pd.Index([1, 2, 3], name="asdf") - gds = Index(pds) - - # inplace=False should yield a shallow copy - gds_renamed_deep = gds.rename("new_name", inplace=False) - - assert gds_renamed_deep._column.data_ptr == gds._column.data_ptr - - # inplace=True returns none - expected_ptr = gds._column.data_ptr - gds.rename("new_name", inplace=True) - - assert expected_ptr == gds._column.data_ptr - - -def test_index_rename_preserves_arg(): - idx1 = cudf.Index([1, 2, 3], name="orig_name") - - # this should be an entirely new object - idx2 = idx1.rename("new_name", inplace=False) - - assert idx2.name == "new_name" - assert idx1.name == "orig_name" - - # a new object but referencing the same data - idx3 = Index(idx1, name="last_name") - - assert idx3.name == "last_name" - assert idx1.name == "orig_name" - - def test_set_index_as_property(): cdf = cudf.DataFrame() col1 = np.arange(10) @@ -306,102 +98,6 @@ def test_set_index_as_property(): assert_eq(head.index, idx[:5]) -@pytest.mark.parametrize( - "data", - [ - range(1, 5), - [1, 2, 3, 4], - pd.DatetimeIndex(["2001", "2002", "2003"]), - ["a", "b", "c"], - pd.CategoricalIndex(["a", "b", "c"]), - ], -) -@pytest.mark.parametrize("deep", [True, False]) -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_index_copy(data, deep, copy_on_write): - name = "x" - cidx = cudf.Index(data) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - with cudf.option_context("copy_on_write", copy_on_write): - if not isinstance(cidx, cudf.RangeIndex): - if ( - isinstance(cidx._column, cudf.core.column.StringColumn) - or not deep - or (copy_on_write and not deep) - ): - # StringColumn is immutable hence, deep copies of a - # Index with string dtype will share the same StringColumn. - - # When `copy_on_write` is turned on, Index objects will - # have unique column object but they all point to same - # data pointers. 
- assert_column_memory_eq(cidx._column, cidx_copy._column) - else: - assert_column_memory_ne(cidx._column, cidx_copy._column) - - -def test_index_isna_notna(): - idx = [1, None, 3, None, 5] - pidx = pd.Index(idx, name="idx") - gidx = cudf.Index(idx, name="idx") - assert_eq(gidx.isna(), pidx.isna()) - assert_eq(gidx.notna(), pidx.notna()) - - -def test_rangeindex_slice_attr_name(): - start, stop = 0, 10 - rg = RangeIndex(start, stop, name="myindex") - sliced_rg = rg[0:9] - assert_eq(rg.name, sliced_rg.name) - - -def test_from_pandas_str(): - idx = ["a", "b", "c"] - pidx = pd.Index(idx, name="idx") - gidx_1 = cudf.Index(idx, name="idx") - gidx_2 = cudf.from_pandas(pidx) - - assert_eq(gidx_1, gidx_2) - - -def test_from_pandas_gen(): - idx = [2, 4, 6] - pidx = pd.Index(idx, name="idx") - gidx_1 = cudf.Index(idx, name="idx") - gidx_2 = cudf.from_pandas(pidx) - - assert_eq(gidx_1, gidx_2) - - -def test_index_names(): - idx = Index([1, 2, 3], name="idx") - assert idx.names == ("idx",) - - -@pytest.mark.parametrize( - "data", - [ - range(0), - range(1), - range(0, 1), - range(0, 5), - range(1, 10), - range(1, 10, 1), - range(1, 10, 3), - range(10, 1, -3), - range(-5, 10), - ], -) -def test_range_index_from_range(data): - assert_eq(pd.Index(data), cudf.Index(data)) - - @pytest.mark.parametrize( "n", [-10, -5, -2, 0, 1, 0, 2, 5, 10], @@ -424,2764 +120,42 @@ def test_empty_df_head_tail_index(n): @pytest.mark.parametrize( - "data,condition,other,error", - [ - (pd.Index(range(5)), pd.Index(range(5)) > 0, None, None), - (pd.Index([1, 2, 3]), pd.Index([1, 2, 3]) != 2, None, None), - (pd.Index(list("abc")), pd.Index(list("abc")) == "c", None, None), - ( - pd.Index(list("abc")), - pd.Index(list("abc")) == "c", - pd.Index(list("xyz")), - None, - ), - (pd.Index(range(5)), pd.Index(range(4)) > 0, None, ValueError), - ( - pd.Index(range(5)), - pd.Index(range(5)) > 1, - 10, - None, - ), - ( - pd.Index(np.arange(10)), - (pd.Index(np.arange(10)) % 3) == 0, - -pd.Index(np.arange(10)), - None, - ), - ( - pd.Index([1, 2, np.nan]), - pd.Index([1, 2, np.nan]) == 4, - None, - None, - ), - ( - pd.Index([1, 2, np.nan]), - pd.Index([1, 2, np.nan]) != 4, - None, - None, - ), - ( - pd.Index([-2, 3, -4, -79]), - [True, True, True], - None, - ValueError, - ), - ( - pd.Index([-2, 3, -4, -79]), - [True, True, True, False], - None, - None, - ), - ( - pd.Index([-2, 3, -4, -79]), - [True, True, True, False], - 17, - None, - ), - (pd.Index(list("abcdgh")), pd.Index(list("abcdgh")) != "g", "3", None), - ( - pd.Index(list("abcdgh")), - pd.Index(list("abcdg")) != "g", - "3", - ValueError, - ), - ( - pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), - pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) != "a", - "a", - None, - ), - ( - pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), - pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) != "a", - "b", - None, - ), - ( - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ], - strict=True, - ) - ) - ), - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ], - strict=True, - ) - ) - ) - != "a", - None, - NotImplementedError, - ), - ], -) -def test_index_where(data, condition, other, error): - ps = data - gs = cudf.from_pandas(data) - - ps_condition = condition - if 
type(condition).__module__.split(".")[0] == "pandas": - gs_condition = cudf.from_pandas(condition) - else: - gs_condition = condition - - ps_other = other - if type(other).__module__.split(".")[0] == "pandas": - gs_other = cudf.from_pandas(other) - else: - gs_other = other - - if error is None: - if hasattr(ps, "dtype") and isinstance(ps.dtype, pd.CategoricalDtype): - expect = ps.where(ps_condition, other=ps_other) - got = gs.where(gs_condition, other=gs_other) - np.testing.assert_array_equal( - expect.codes, - got.codes.astype(expect.codes.dtype).fillna(-1).to_numpy(), - ) - assert_eq(expect.categories, got.categories) - else: - assert_eq( - ps.where(ps_condition, other=ps_other), - gs.where(gs_condition, other=gs_other).to_pandas(), - ) - else: - assert_exceptions_equal( - lfunc=ps.where, - rfunc=gs.where, - lfunc_args_and_kwargs=([ps_condition], {"other": ps_other}), - rfunc_args_and_kwargs=([gs_condition], {"other": gs_other}), - ) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES) -@pytest.mark.parametrize("copy", [True, False]) -def test_index_astype(dtype, copy): - pdi = pd.Index([1, 2, 3]) - gdi = cudf.from_pandas(pdi) - - actual = gdi.astype(dtype=dtype, copy=copy) - expected = pdi.astype(dtype=dtype, copy=copy) - - assert_eq(expected, actual) - assert_eq(pdi, gdi) - - -@pytest.mark.parametrize( - "data", - [ - [1, 10, 2, 100, -10], - ["z", "x", "a", "c", "b"], - [-10.2, 100.1, -100.2, 0.0, 0.23], - ], -) -def test_index_argsort(data): - pdi = pd.Index(data) - gdi = cudf.from_pandas(pdi) - - assert_eq(pdi.argsort(), gdi.argsort()) - - -@pytest.mark.parametrize( - "data", - [ - pd.Index([1, 10, 2, 100, -10], name="abc"), - pd.Index(["z", "x", "a", "c", "b"]), - pd.Index(["z", "x", "a", "c", "b"], dtype="category"), - pd.Index( - [-10.2, 100.1, -100.2, 0.0, 0.23], name="this is a float index" - ), - pd.Index([102, 1001, 1002, 0.0, 23], dtype="datetime64[ns]"), - pd.Index([13240.2, 1001, 100.2, 0.0, 23], dtype="datetime64[ns]"), - pd.RangeIndex(0, 10, 1), - pd.RangeIndex(0, -100, -2), - pd.Index([-10.2, 100.1, -100.2, 0.0, 23], dtype="timedelta64[ns]"), - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("return_indexer", [True, False]) -def test_index_sort_values(data, ascending, return_indexer): - pdi = data - gdi = cudf.from_pandas(pdi) - - expected = pdi.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - actual = gdi.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - - if return_indexer: - expected_indexer = expected[1] - actual_indexer = actual[1] - - assert_eq(expected_indexer, actual_indexer) - - expected = expected[0] - actual = actual[0] - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 10, 2, 100, -10], - ["z", "x", "a", "c", "b"], - [-10.2, 100.1, -100.2, 0.0, 0.23], - ], -) -def test_index_to_series(data): - pdi = pd.Index(data) - gdi = cudf.from_pandas(pdi) - - assert_eq(pdi.to_series(), gdi.to_series()) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [4, 5, 6, 10, 20, 30], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - ["5", "6", "2", "a", "b", "c"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - [1.0, 5.0, 6.0, 0.0, 1.3], - ["ab", "cd", "ef"], - pd.Series(["1", "2", "a", "3", None], dtype="category"), - range(0, 10), - [], - [1, 1, 2, 2], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 5, 6], - [4, 5, 6, 10, 20, 30], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - ["5", "6", "2", "a", 
"b", "c"], - ["ab", "ef", None], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - [1.0, 5.0, 6.0, 0.0, 1.3], - range(2, 4), - pd.Series(["1", "a", "3", None], dtype="category"), - [], - [2], - ], -) -@pytest.mark.parametrize("sort", [None, False, True]) -@pytest.mark.parametrize( - "name_data,name_other", - [("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_index_difference(data, other, sort, name_data, name_other): - pd_data = pd.Index(data, name=name_data) - pd_other = pd.Index(other, name=name_other) - if ( - not PANDAS_GE_220 - and isinstance(pd_data.dtype, pd.CategoricalDtype) - and not isinstance(pd_other.dtype, pd.CategoricalDtype) - and pd_other.isnull().any() - ): - pytest.skip(reason="https://github.com/pandas-dev/pandas/issues/57318") - - if ( - not PANDAS_GE_220 - and len(pd_other) == 0 - and len(pd_data) != len(pd_data.unique()) - ): - pytest.skip(reason="Bug fixed in pandas-2.2+") - - gd_data = cudf.from_pandas(pd_data) - gd_other = cudf.from_pandas(pd_other) - - expected = pd_data.difference(pd_other, sort=sort) - actual = gd_data.difference(gd_other, sort=sort) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("other", ["a", 1, None]) -def test_index_difference_invalid_inputs(other): - pdi = pd.Index([1, 2, 3]) - gdi = cudf.Index([1, 2, 3]) - - assert_exceptions_equal( - pdi.difference, - gdi.difference, - ([other], {}), - ([other], {}), - ) - - -def test_index_difference_sort_error(): - pdi = pd.Index([1, 2, 3]) - gdi = cudf.Index([1, 2, 3]) - - assert_exceptions_equal( - pdi.difference, - gdi.difference, - ([pdi], {"sort": "A"}), - ([gdi], {"sort": "A"}), - ) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - [], - ["b", "c", "d"], - [1], - [2, 3, 4], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -def test_index_equals(data, other): - pd_data = pd.Index(data) - pd_other = pd.Index(other) - - gd_data = Index(data) - gd_other = Index(other) - - expected = pd_data.equals(pd_other) - actual = gd_data.equals(gd_other) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -def test_index_categories_equal(data, other): - pd_data = pd.Index(data).astype("category") - pd_other = pd.Index(other) - - gd_data = Index(data).astype("category") - gd_other = Index(other) - - expected = pd_data.equals(pd_other) - actual = gd_data.equals(gd_other) - assert_eq(expected, actual) - - expected = 
pd_other.equals(pd_data) - actual = gd_other.equals(gd_data) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.parametrize( - "other", + "objs", [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], + [pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)], + [pd.RangeIndex(10, 20), pd.RangeIndex(22, 40), pd.RangeIndex(50, 60)], + [pd.RangeIndex(10, 20, 2), pd.RangeIndex(20, 40, 2)], ], ) -def test_index_equal_misc(data, other): - pd_data = pd.Index(data) - pd_other = other - - gd_data = Index(data) - gd_other = other - - expected = pd_data.equals(pd_other) - actual = gd_data.equals(gd_other) - assert_eq(expected, actual) - - expected = pd_data.equals(np.array(pd_other)) - actual = gd_data.equals(np.array(gd_other)) - assert_eq(expected, actual) +def test_range_index_concat(objs): + cudf_objs = [cudf.from_pandas(obj) for obj in objs] - expected = pd_data.equals(pd.Series(pd_other)) - actual = gd_data.equals(cudf.Series(gd_other)) - assert_eq(expected, actual) + actual = cudf.concat(cudf_objs) - expected = pd_data.astype("category").equals(pd_other) - actual = gd_data.astype("category").equals(gd_other) + expected = objs[0] + for obj in objs[1:]: + expected = expected.append(obj) assert_eq(expected, actual) @pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Does not warn on older versions of pandas", + "op", [operator.add, operator.sub, operator.mul, operator.truediv] ) -def test_index_append(data, other): - pd_data = pd.Index(data) - pd_other = pd.Index(other) - - gd_data = cudf.Index(data) - gd_other = cudf.Index(other) - - if cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other): - gd_data = gd_data.astype("str") - gd_other = gd_other.astype("str") - - with expect_warning_if( - (len(data) == 0 or len(other) == 0) and pd_data.dtype != pd_other.dtype - ): - expected = pd_data.append(pd_other) - with expect_warning_if( - (len(data) == 0 or len(other) == 0) and gd_data.dtype != gd_other.dtype - ): - actual = gd_data.append(gd_other) - if len(data) == 0 and len(other) == 0: - # Pandas default dtype to "object" for empty list - # cudf default dtype to "float" for empty list - assert_eq(expected, actual.astype("str")) - elif actual.dtype == "object": - assert_eq(expected.astype("str"), actual) - else: - assert_eq(expected, actual) - - -def test_index_empty_append_name_conflict(): - empty = cudf.Index([], name="foo") - non_empty = cudf.Index([1], name="bar") - 
expected = cudf.Index([1]) - - with pytest.warns(FutureWarning): - result = non_empty.append(empty) - assert_eq(result, expected) - - with pytest.warns(FutureWarning): - result = empty.append(non_empty) +def test_rangeindex_binop_diff_names_none(op): + idx1 = cudf.RangeIndex(10, 13, name="foo") + idx2 = cudf.RangeIndex(13, 16, name="bar") + result = op(idx1, idx2) + expected = op(idx1.to_pandas(), idx2.to_pandas()) assert_eq(result, expected) + assert result.name is None @pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - [1], - [2, 3, 4], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ], -) -@pytest.mark.parametrize( - "other", - [ - ["1", "2", "3", "4", "5", "6"], - ["a"], - ["b", "c", "d"], - ["abcd", "defgh", "werty", "poiu"], - ], -) -def test_index_append_error(data, other): - gd_data = Index(data) - gd_other = Index(other) - - got_dtype = ( - gd_other.dtype - if gd_data.dtype == np.dtype("object") - else gd_data.dtype - ) - with pytest.raises( - TypeError, - match=re.escape( - f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " - f"either one of them to same dtypes." - ), - ): - gd_data.append(gd_other) - - with pytest.raises( - TypeError, - match=re.escape( - f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " - f"either one of them to same dtypes." - ), - ): - gd_other.append(gd_data) - - sr = gd_other.to_series() - - assert_exceptions_equal( - lfunc=gd_data.to_pandas().append, - rfunc=gd_data.append, - lfunc_args_and_kwargs=([[sr.to_pandas()]],), - rfunc_args_and_kwargs=([[sr]],), - ) - - -@pytest.mark.parametrize( - "data,other", - [ - ( - pd.Index([1, 2, 3, 4, 5, 6]), - [ - pd.Index([1, 2, 3, 4, 5, 6]), - pd.Index([1, 2, 3, 4, 5, 6, 10]), - pd.Index([]), - ], - ), - ( - pd.Index([]), - [ - pd.Index([1, 2, 3, 4, 5, 6]), - pd.Index([1, 2, 3, 4, 5, 6, 10]), - pd.Index([1, 4, 5, 6]), - ], - ), - ( - pd.Index([10, 20, 30, 40, 50, 60]), - [ - pd.Index([10, 20, 30, 40, 50, 60]), - pd.Index([10, 20, 30]), - pd.Index([40, 50, 60]), - pd.Index([10, 60]), - pd.Index([60]), - ], - ), - ( - pd.Index([]), - [ - pd.Index([10, 20, 30, 40, 50, 60]), - pd.Index([10, 20, 30]), - pd.Index([40, 50, 60]), - pd.Index([10, 60]), - pd.Index([60]), - ], - ), - ( - pd.Index(["1", "2", "3", "4", "5", "6"]), - [ - pd.Index(["1", "2", "3", "4", "5", "6"]), - pd.Index(["1", "2", "3"]), - pd.Index(["6"]), - pd.Index(["1", "6"]), - ], - ), - ( - pd.Index([]), - [ - pd.Index(["1", "2", "3", "4", "5", "6"]), - pd.Index(["1", "2", "3"]), - pd.Index(["6"]), - pd.Index(["1", "6"]), - ], - ), - ( - pd.Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), - [ - pd.Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), - pd.Index([1.0, 6.0]), - pd.Index([]), - pd.Index([6.0]), - ], - ), - ( - pd.Index([]), - [ - pd.Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), - pd.Index([1.0, 6.0]), - pd.Index([1.0, 2.0, 6.0]), - pd.Index([6.0]), - ], - ), - ( - pd.Index(["a"]), - [ - pd.Index(["a"]), - pd.Index(["a", "b", "c"]), - pd.Index(["c"]), - pd.Index(["d"]), - pd.Index(["ae", "hello", "world"]), - ], - ), - ( - pd.Index([]), - [ - pd.Index(["a"]), - pd.Index(["a", "b", "c"]), - pd.Index(["c"]), - pd.Index(["d"]), - pd.Index(["ae", "hello", "world"]), - pd.Index([]), - ], - ), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Does not warn on older versions of 
pandas", -) -def test_index_append_list(data, other): - pd_data = data - pd_other = other - - gd_data = cudf.from_pandas(data) - gd_other = [cudf.from_pandas(i) for i in other] - - with expect_warning_if( - (len(data) == 0 or any(len(d) == 0 for d in other)) - and (any(d.dtype != data.dtype for d in other)) - ): - expected = pd_data.append(pd_other) - with expect_warning_if( - (len(data) == 0 or any(len(d) == 0 for d in other)) - and (any(d.dtype != data.dtype for d in other)) - ): - actual = gd_data.append(gd_other) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] -) -@pytest.mark.parametrize("name", [1, "a", None]) -def test_index_basic(data, dtype, name): - pdi = pd.Index(data, dtype=dtype, name=name) - gdi = cudf.Index(data, dtype=dtype, name=name) - - assert_eq(pdi, gdi) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("categories", [[1, 2], None]) -@pytest.mark.parametrize( - "dtype", + "index", [ - pd.CategoricalDtype([1, 2, 3], ordered=True), - pd.CategoricalDtype([1, 2, 3], ordered=False), - None, - ], -) -@pytest.mark.parametrize("ordered", [True, False]) -@pytest.mark.parametrize("name", [1, "a", None]) -def test_categorical_index_basic(data, categories, dtype, ordered, name): - # can't have both dtype and categories/ordered - if dtype is not None: - categories = None - ordered = None - pindex = pd.CategoricalIndex( - data=data, - categories=categories, - dtype=dtype, - ordered=ordered, - name=name, - ) - gindex = CategoricalIndex( - data=data, - categories=categories, - dtype=dtype, - ordered=ordered, - name=name, - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize( - "data", - [ - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], - names=("number", "color"), - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], - names=("number1", "color2"), - ), - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], - ), - ], -) -@pytest.mark.parametrize( - "other", - [ - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], - names=("number", "color"), - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], - names=("number1", "color2"), - ), - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], - ), - ], -) -def test_multiindex_append(data, other): - pdi = data - other_pd = other - - gdi = cudf.from_pandas(data) - other_gd = cudf.from_pandas(other) - - expected = pdi.append(other_pd) - actual = gdi.append(other_gd) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] -) -def test_index_empty(data, dtype): - pdi = pd.Index(data, dtype=dtype) - gdi = cudf.Index(data, dtype=dtype) - - assert_eq(pdi.empty, gdi.empty) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] -) -def test_index_size(data, dtype): - pdi = pd.Index(data, dtype=dtype) - gdi = cudf.Index(data, dtype=dtype) - - assert_eq(pdi.size, gdi.size) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], [], [1], [1, 2, 3]]) -@pytest.mark.parametrize( - "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] -) -def test_index_drop_duplicates(data, 
dtype): - pdi = pd.Index(data, dtype=dtype) - gdi = cudf.Index(data, dtype=dtype) - - assert_eq(pdi.drop_duplicates(), gdi.drop_duplicates()) - - -def test_dropna_bad_how(): - with pytest.raises(ValueError): - cudf.Index([1]).dropna(how="foo") - - -@pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] -) -def test_index_tolist(data, dtype): - gdi = cudf.Index(data, dtype=dtype) - - with pytest.raises( - TypeError, - match=re.escape( - r"cuDF does not support conversion to host memory " - r"via the `tolist()` method. Consider using " - r"`.to_arrow().to_pylist()` to construct a Python list." - ), - ): - gdi.tolist() - - -@pytest.mark.parametrize("data", [[], [1], [1, 2, 3]]) -@pytest.mark.parametrize( - "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] -) -def test_index_iter_error(data, dtype): - gdi = cudf.Index(data, dtype=dtype) - - with pytest.raises( - TypeError, - match=re.escape( - f"{gdi.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." - ), - ): - iter(gdi) - - -@pytest.mark.parametrize("data", [[], [1], [1, 2, 3, 4, 5]]) -@pytest.mark.parametrize( - "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] -) -def test_index_values_host(data, dtype): - gdi = cudf.Index(data, dtype=dtype) - pdi = pd.Index(data, dtype=dtype) - - np.testing.assert_array_equal(gdi.values_host, pdi.values) - - -@pytest.mark.parametrize( - "data,fill_value", - [ - ([1, 2, 3, 1, None, None], 1), - ([None, None, 3.2, 1, None, None], 10.0), - ([None, "a", "3.2", "z", None, None], "helloworld"), - (pd.Series(["a", "b", None], dtype="category"), "b"), - (pd.Series([None, None, 1.0], dtype="category"), 1.0), - ( - np.array([1, 2, 3, None], dtype="datetime64[s]"), - np.datetime64("2005-02-25"), - ), - ( - np.array( - [None, None, 122, 3242234, None, 6237846], - dtype="datetime64[ms]", - ), - np.datetime64("2005-02-25"), - ), - ], -) -def test_index_fillna(data, fill_value): - pdi = pd.Index(data) - gdi = cudf.Index(data) - - assert_eq( - pdi.fillna(fill_value), gdi.fillna(fill_value), exact=False - ) # Int64 v/s Float64 - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 1, None, None], - [None, None, 3.2, 1, None, None], - [None, "a", "3.2", "z", None, None], - pd.Series(["a", "b", None], dtype="category"), - np.array([1, 2, 3, None], dtype="datetime64[s]"), - ], -) -def test_index_to_arrow(data): - pdi = pd.Index(data) - gdi = cudf.Index(data) - - expected_arrow_array = pa.Array.from_pandas(pdi) - got_arrow_array = gdi.to_arrow() - - assert_eq(expected_arrow_array, got_arrow_array) - - -@pytest.mark.parametrize( - "data", - [ - [None, None, 3.2, 1, None, None], - [None, "a", "3.2", "z", None, None], - pd.Series(["a", "b", None], dtype="category"), - np.array([1, 2, 3, None], dtype="datetime64[s]"), - ], -) -def test_index_from_arrow(data): - pdi = pd.Index(data) - - arrow_array = pa.Array.from_pandas(pdi) - expected_index = pd.Index(arrow_array.to_pandas()) - gdi = cudf.Index.from_arrow(arrow_array) - - assert_eq(expected_index, gdi) - - -def test_multiindex_to_arrow(): - pdf = pd.DataFrame( - { - "a": [1, 2, 1, 2, 3], - "b": [1.0, 2.0, 3.0, 4.0, 5.0], - "c": np.array([1, 2, 3, None, 5], dtype="datetime64[s]"), - "d": ["a", "b", "c", "d", "e"], - } - ) - pdf["a"] = pdf["a"].astype("category") - df = cudf.from_pandas(pdf) - gdi = cudf.MultiIndex.from_frame(df) - - expected = 
pa.Table.from_pandas(pdf) - got = gdi.to_arrow() - - assert_eq(expected, got) - - -def test_multiindex_from_arrow(): - pdf = pd.DataFrame( - { - "a": [1, 2, 1, 2, 3], - "b": [1.0, 2.0, 3.0, 4.0, 5.0], - "c": np.array([1, 2, 3, None, 5], dtype="datetime64[s]"), - "d": ["a", "b", "c", "d", "e"], - } - ) - pdf["a"] = pdf["a"].astype("category") - ptb = pa.Table.from_pandas(pdf) - gdi = cudf.MultiIndex.from_arrow(ptb) - pdi = pd.MultiIndex.from_frame(pdf) - - assert_eq(pdi, gdi) - - -def test_index_equals_categories(): - lhs = cudf.CategoricalIndex( - ["a", "b", "c", "b", "a"], categories=["a", "b", "c"] - ) - rhs = cudf.CategoricalIndex( - ["a", "b", "c", "b", "a"], categories=["a", "b", "c", "_"] - ) - - got = lhs.equals(rhs) - expect = lhs.to_pandas().equals(rhs.to_pandas()) - - assert_eq(expect, got) - - -def test_rangeindex_arg_validation(): - with pytest.raises(TypeError): - RangeIndex("1") - - with pytest.raises(TypeError): - RangeIndex(1, "2") - - with pytest.raises(TypeError): - RangeIndex(1, 3, "1") - - with pytest.raises(ValueError): - RangeIndex(1, dtype="float64") - - with pytest.raises(ValueError): - RangeIndex(1, dtype="uint64") - - -def test_rangeindex_name_not_hashable(): - with pytest.raises(ValueError): - RangeIndex(range(2), name=["foo"]) - - with pytest.raises(ValueError): - RangeIndex(range(2)).copy(name=["foo"]) - - -def test_index_rangeindex_searchsorted(): - # step > 0 - ridx = RangeIndex(-13, 17, 4) - for i in range(len(ridx)): - assert i == ridx.searchsorted(ridx[i], side="left") - assert i + 1 == ridx.searchsorted(ridx[i], side="right") - - -@pytest.mark.parametrize( - "rge", - [(1, 10, 1), (1, 10, 3), (10, -17, -1), (10, -17, -3)], -) -def test_index_rangeindex_get_item_basic(rge): - pridx = pd.RangeIndex(*rge) - gridx = cudf.RangeIndex(*rge) - - for i in range(-len(pridx), len(pridx)): - assert pridx[i] == gridx[i] - - -@pytest.mark.parametrize( - "rge", - [(1, 10, 3), (10, 1, -3)], -) -def test_index_rangeindex_get_item_out_of_bounds(rge): - gridx = cudf.RangeIndex(*rge) - with pytest.raises(IndexError): - _ = gridx[4] - - -@pytest.mark.parametrize( - "rge", - [(10, 1, 1), (-17, 10, -3)], -) -def test_index_rangeindex_get_item_null_range(rge): - gridx = cudf.RangeIndex(*rge) - - with pytest.raises(IndexError): - gridx[0] - - -@pytest.mark.parametrize( - "rge", [(-17, 21, 2), (21, -17, -3), (0, 0, 1), (0, 1, -3), (10, 0, 5)] -) -@pytest.mark.parametrize( - "sl", - [ - slice(1, 7, 1), - slice(1, 7, 2), - slice(-1, 7, 1), - slice(-1, 7, 2), - slice(-3, 7, 2), - slice(7, 1, -2), - slice(7, -3, -2), - slice(None, None, 1), - slice(0, None, 2), - slice(0, None, 3), - slice(0, 0, 3), - ], -) -def test_index_rangeindex_get_item_slices(rge, sl): - pridx = pd.RangeIndex(*rge) - gridx = cudf.RangeIndex(*rge) - - assert_eq(pridx[sl], gridx[sl]) - - -@pytest.mark.parametrize( - "idx", - [ - pd.Index([1, 2, 3]), - pd.Index(["abc", "def", "ghi"]), - pd.RangeIndex(0, 10, 1), - pd.Index([0.324, 0.234, 1.3], name="abc"), - ], -) -@pytest.mark.parametrize("names", [None, "a", "new name", ["another name"]]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_index_set_names(idx, names, inplace): - pi = idx.copy() - gi = cudf.from_pandas(idx) - - expected = pi.set_names(names=names, inplace=inplace) - actual = gi.set_names(names=names, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("level", [1, [0], "abc"]) -@pytest.mark.parametrize("names", [None, "a"]) -def test_index_set_names_error(level, 
names): - pi = pd.Index([1, 2, 3], name="abc") - gi = cudf.from_pandas(pi) - - assert_exceptions_equal( - lfunc=pi.set_names, - rfunc=gi.set_names, - lfunc_args_and_kwargs=([], {"names": names, "level": level}), - rfunc_args_and_kwargs=([], {"names": names, "level": level}), - ) - - -@pytest.mark.parametrize( - "data", [[1, 3, 6], [6, 1, 3]], ids=["monotonic", "non-monotonic"] -) -@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) -def test_get_indexer_single_unique_numeric(data, method): - key = list(range(0, 8)) - pi = pd.Index(data) - gi = cudf.from_pandas(pi) - - if ( - # `method` only applicable to monotonic index - not pi.is_monotonic_increasing and method is not None - ): - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - with cudf.option_context("mode.pandas_compatible", True): - got = gi.get_indexer(key, method=method) - assert_eq(expected, got, check_dtype=True) - - -@pytest.mark.parametrize( - "rng", - [ - range(1, 20, 3), - range(20, 35, 3), - range(35, 77, 3), - range(77, 110, 3), - ], -) -@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) -@pytest.mark.parametrize("tolerance", [None, 0, 1, 13, 20]) -def test_get_indexer_rangeindex(rng, method, tolerance): - key = list(rng) - pi = pd.RangeIndex(3, 100, 4) - gi = cudf.from_pandas(pi) - - expected = pi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - got = gi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - - assert_eq(expected, got) - - with cudf.option_context("mode.pandas_compatible", True): - got = gi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - assert_eq(expected, got, check_dtype=True) - - -@pytest.mark.parametrize("key", list(range(1, 110, 3))) -def test_get_loc_rangeindex(key): - pi = pd.RangeIndex(3, 100, 4) - gi = cudf.from_pandas(pi) - if ( - (key not in pi) - # Get key before the first element is KeyError - or (key < pi.start) - # Get key after the last element is KeyError - or (key >= pi.stop) - ): - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", - [ - [1, 3, 3, 6], - [6, 1, 3, 3], - [4, 3, 2, 1, 0], - ], - ids=["monotonic increasing", "non-monotonic", "monotonic decreasing"], -) -@pytest.mark.parametrize("key", [0, 3, 6, 7, 4]) -def test_get_loc_duplicate_numeric(idx, key): - pi = pd.Index(idx) - gi = cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", - [ - [-1, 2, 3, 6], - [6, 1, 3, 4], - ], - ids=["monotonic", "non-monotonic"], -) -@pytest.mark.parametrize("key", [[0, 3, 1], [6, 7]]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) -@pytest.mark.parametrize("tolerance", [None, 1, 2]) -def test_get_indexer_single_duplicate_numeric(idx, key, 
method, tolerance): - pi = pd.Index(idx) - gi = cudf.from_pandas(pi) - - if not pi.is_monotonic_increasing and method is not None: - assert_exceptions_equal( - lfunc=pi.get_indexer, - rfunc=gi.get_indexer, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - got = gi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("idx", [["b", "f", "m", "q"], ["m", "f", "b", "q"]]) -@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) -def test_get_loc_single_unique_string(idx, key): - pi = pd.Index(idx) - gi = cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("idx", [["b", "f", "m", "q"], ["m", "f", "b", "q"]]) -@pytest.mark.parametrize("key", [["a", "f", "n", "z"], ["p", "p", "b"]]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_single_unique_string(idx, key, method): - pi = pd.Index(idx) - gi = cudf.from_pandas(pi) - - if not pi.is_monotonic_increasing and method is not None: - assert_exceptions_equal( - lfunc=pi.get_indexer, - rfunc=gi.get_indexer, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("idx", [["b", "m", "m", "q"], ["m", "f", "m", "q"]]) -@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) -def test_get_loc_single_duplicate_string(idx, key): - pi = pd.Index(idx) - gi = cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("idx", [["b", "m", "m", "q"], ["a", "f", "m", "q"]]) -@pytest.mark.parametrize("key", [["a"], ["f", "n", "z"]]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_single_duplicate_string(idx, key, method): - pi = pd.Index(idx) - gi = cudf.from_pandas(pi) - - if ( - # `method` only applicable to monotonic index - (not pi.is_monotonic_increasing and method is not None) - or not pi.is_unique - ): - assert_exceptions_equal( - lfunc=pi.get_indexer, - rfunc=gi.get_indexer, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - with cudf.option_context("mode.pandas_compatible", True): - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got, check_dtype=True) - - -@pytest.mark.parametrize( - "data", - [ - [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)], - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)], - [(1, 1, 1), (1, 1, 2), (1, 1, 2), (1, 2, 3), (2, 1, 1), (2, 2, 1)], - ], -) -@pytest.mark.parametrize("key", 
[1, (1, 2), (1, 2, 3), (2, 1, 1), (9, 9, 9)]) -def test_get_loc_multi_numeric(data, key): - idx = pd.MultiIndex.from_tuples(data) - pi = idx.sort_values() - gi = cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data", - [ - [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)], - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)], - [(1, 1, 1), (1, 1, 2), (1, 1, 24), (1, 2, 3), (2, 1, 1), (2, 2, 1)], - ], -) -@pytest.mark.parametrize("key", [[(1, 2, 3)], [(9, 9, 9)]]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_multi_numeric(data, key, method): - idx = pd.MultiIndex.from_tuples(data) - pi = idx.sort_values() - gi = cudf.from_pandas(pi) - - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - with cudf.option_context("mode.pandas_compatible", True): - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got, check_dtype=True) - - -@pytest.mark.parametrize( - "key, result", - [ - (1, slice(1, 5, 1)), # deviates - ((1, 2), slice(1, 3, 1)), - ((1, 2, 3), slice(1, 2, None)), - ((2, 1, 1), slice(0, 1, None)), - ((9, 9, 9), None), - ], -) -def test_get_loc_multi_numeric_deviate(key, result): - pi = pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 1), (1, 1, 1), (2, 2, 1)] - ) - gi = cudf.from_pandas(pi) - - with expect_warning_if( - isinstance(key, tuple), pd.errors.PerformanceWarning - ): - key_flag = key not in pi - - if key_flag: - with expect_warning_if( - isinstance(key, tuple), pd.errors.PerformanceWarning - ): - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = result - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "key", - [ - ((1, 2, 3),), - ((2, 1, 1),), - ((9, 9, 9),), - ], -) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_multi_numeric_deviate(key, method): - pi = pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] - ).sort_values() - gi = cudf.from_pandas(pi) - - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("method", ["ffill", "bfill"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_get_indexer_multi_error(method): - pi = pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] - ) - gi = cudf.from_pandas(pi) - - assert_exceptions_equal( - pi.get_indexer, - gi.get_indexer, - lfunc_args_and_kwargs=( - [], - {"target": ((1, 2, 3),), "method": method}, - ), - rfunc_args_and_kwargs=( - [], - {"target": ((1, 2, 3),), "method": method}, - ), - ) - - -@pytest.mark.parametrize( - "data", - [ - [ - ("a", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ], - [ - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ], - [ - ("a", "a", "a"), - ("a", "b", "c"), - ("b", 
"a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("b", "c", "a"), - ], - [ - ("a", "a", "a"), - ("a", "a", "b"), - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ], - [ - ("a", "a", "b"), - ("b", "a", "a"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ], - ], -) -@pytest.mark.parametrize( - "key", ["a", ("a", "a"), ("a", "b", "c"), ("b", "c", "a"), ("z", "z", "z")] -) -def test_get_loc_multi_string(data, key): - idx = pd.MultiIndex.from_tuples(data) - pi = idx.sort_values() - gi = cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - ("a", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ], - [ - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ], - [ - ("a", "a", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("b", "c", "a"), - ], - ], -) -@pytest.mark.parametrize( - "key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]] -) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_multi_string(data, key, method): - idx = pd.MultiIndex.from_tuples(data) - pi = idx.sort_values() - gi = cudf.from_pandas(pi) - - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx1", - [ - lambda: cudf.Index(["a", "b", "c"]), - lambda: cudf.RangeIndex(0, 10), - lambda: cudf.Index([1, 2, 3], dtype="category"), - lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), - lambda: cudf.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - ], -) -@pytest.mark.parametrize( - "idx2", - [ - lambda: cudf.Index(["a", "b", "c"]), - lambda: cudf.RangeIndex(0, 10), - lambda: cudf.Index([1, 2, 3], dtype="category"), - lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), - ], -) -def test_get_indexer_invalid(idx1, idx2): - idx1 = idx1() - idx2 = idx2() - assert_eq( - idx1.get_indexer(idx2), idx1.to_pandas().get_indexer(idx2.to_pandas()) - ) - - -@pytest.mark.parametrize( - "objs", - [ - [pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)], - [pd.RangeIndex(10, 20), pd.RangeIndex(22, 40), pd.RangeIndex(50, 60)], - [pd.RangeIndex(10, 20, 2), pd.RangeIndex(20, 40, 2)], - ], -) -def test_range_index_concat(objs): - cudf_objs = [cudf.from_pandas(obj) for obj in objs] - - actual = cudf.concat(cudf_objs) - - expected = objs[0] - for obj in objs[1:]: - expected = expected.append(obj) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx1, idx2", - [ - (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), - (pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)), - (pd.RangeIndex(0, 10, 2), pd.RangeIndex(1, 5, 3)), - (pd.RangeIndex(1, 5, 3), pd.RangeIndex(0, 10, 2)), - (pd.RangeIndex(1, 10, 3), pd.RangeIndex(1, 5, 2)), - (pd.RangeIndex(1, 5, 2), pd.RangeIndex(1, 10, 3)), - (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 3)), - (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 6)), - (pd.RangeIndex(1, 100, 6), pd.RangeIndex(1, 50, 3)), - (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, 
name="b")), - (pd.Index([0, 1, 2, 30], name="a"), pd.Index([90, 100])), - (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), - (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), - (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), - ( - pd.IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4)]), - pd.IntervalIndex.from_tuples([(0, 2), (2, 4)]), - ), - (pd.RangeIndex(0, 10), pd.Index([8, 1, 2, 4])), - (pd.Index([8, 1, 2, 4], name="a"), pd.Index([8, 1, 2, 4], name="b")), - ( - pd.Index([8, 1, 2, 4], name="a"), - pd.Index([], name="b", dtype="int64"), - ), - (pd.Index([], dtype="int64", name="a"), pd.Index([10, 12], name="b")), - (pd.Index([True, True, True], name="a"), pd.Index([], dtype="bool")), - ( - pd.Index([True, True, True]), - pd.Index([False, True], dtype="bool", name="b"), - ), - ], -) -@pytest.mark.parametrize("sort", [None, False, True]) -def test_union_index(idx1, idx2, sort): - expected = idx1.union(idx2, sort=sort) - - idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 - idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 - - actual = idx1.union(idx2, sort=sort) - - assert_eq(expected, actual) - - -def test_union_bool_with_other(): - idx1 = cudf.Index([True, True, True]) - idx2 = cudf.Index([0, 1], name="b") - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(cudf.errors.MixedTypeError): - idx1.union(idx2) - - -@pytest.mark.parametrize("dtype1", ["int8", "int32", "int32"]) -@pytest.mark.parametrize("dtype2", ["uint32", "uint64"]) -def test_union_unsigned_vs_signed(dtype1, dtype2): - idx1 = cudf.Index([10, 20, 30], dtype=dtype1) - idx2 = cudf.Index([0, 1], dtype=dtype2) - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(cudf.errors.MixedTypeError): - idx1.union(idx2) - - -@pytest.mark.parametrize( - "idx1, idx2", - [ - (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), - (pd.RangeIndex(0, 10), pd.RangeIndex(-10, 20)), - (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), - (pd.Index([0, 1, 2, 30], name=pd.NA), pd.Index([30, 0, 90, 100])), - (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), - (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), - ( - pd.Index(["a", "b", "c", "d", "c"]), - pd.Index(["a", "c", "z"], name="abc"), - ), - ( - pd.Index(["a", "b", "c", "d", "c"]), - pd.Index(["a", "b", "c", "d", "c"]), - ), - (pd.Index([True, False, True, True]), pd.Index([10, 11, 12, 0, 1, 2])), - (pd.Index([True, False, True, True]), pd.Index([True, True])), - (pd.RangeIndex(0, 10, name="a"), pd.Index([5, 6, 7], name="b")), - (pd.Index(["a", "b", "c"], dtype="category"), pd.Index(["a", "b"])), - (pd.Index([0, 1, 2], dtype="category"), pd.RangeIndex(0, 10)), - (pd.Index(["a", "b", "c"], name="abc"), []), - (pd.Index([], name="abc"), pd.RangeIndex(0, 4)), - (pd.Index([1, 2, 3]), pd.Index([1, 2], dtype="category")), - (pd.Index([]), pd.Index([1, 2], dtype="category")), - ], -) -@pytest.mark.parametrize("sort", [None, False, True]) -@pytest.mark.parametrize("pandas_compatible", [True, False]) -def test_intersection_index(idx1, idx2, sort, pandas_compatible): - expected = idx1.intersection(idx2, sort=sort) - - with cudf.option_context("mode.pandas_compatible", pandas_compatible): - idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 - idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 - - actual = idx1.intersection(idx2, sort=sort) - - # TODO: Resolve the bool vs ints mixed issue - # once pandas has a direction on this issue - # 
https://github.com/pandas-dev/pandas/issues/44000 - assert_eq( - expected, - actual, - exact=False - if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b") - or (idx1.dtype.kind != "b" or idx2.dtype.kind == "b") - else True, - ) - - -@pytest.mark.parametrize( - "idx1, idx2", - [ - (pd.Index(["a", "b", "c"], dtype="category"), pd.Index([1, 2, 3])), - ], -) -@pytest.mark.parametrize("sort", [None, False, True]) -@pytest.mark.parametrize("pandas_compatible", [True, False]) -def test_intersection_index_error(idx1, idx2, sort, pandas_compatible): - expected = idx1.intersection(idx2, sort=sort) - - with cudf.option_context("mode.pandas_compatible", pandas_compatible): - idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 - idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 - - if pandas_compatible: - with pytest.raises( - ValueError, - match="Cannot convert numerical column to string column when dtype is an object dtype in pandas compatibility mode.", - ): - idx1.intersection(idx2, sort=sort) - else: - actual = idx1.intersection(idx2, sort=sort) - - assert_eq( - expected, - actual, - ) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3], - ["a", "v", "d"], - [234.243, 2432.3, None], - [True, False, True], - pd.Series(["a", " ", "v"], dtype="category"), - pd.IntervalIndex.from_breaks([0, 1, 2, 3]), - ], -) -@pytest.mark.parametrize( - "func", - [ - "is_numeric", - "is_boolean", - "is_integer", - "is_floating", - "is_object", - "is_categorical", - "is_interval", - ], -) -def test_index_type_methods(data, func): - pidx = pd.Index(data) - gidx = cudf.from_pandas(pidx) - - with pytest.warns(FutureWarning): - expected = getattr(pidx, func)() - with pytest.warns(FutureWarning): - actual = getattr(gidx, func)() - - if gidx.dtype == np.dtype("bool") and func == "is_object": - assert_eq(False, actual) - else: - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] -) -def test_index_datetime_ceil(resolution): - cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) - pidx = cuidx.to_pandas() - - pidx_ceil = pidx.ceil(resolution) - cuidx_ceil = cuidx.ceil(resolution) - - assert_eq(pidx_ceil, cuidx_ceil) - - -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] -) -def test_index_datetime_floor(resolution): - cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) - pidx = cuidx.to_pandas() - - pidx_floor = pidx.floor(resolution) - cuidx_floor = cuidx.floor(resolution) - - assert_eq(pidx_floor, cuidx_floor) - - -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] -) -def test_index_datetime_round(resolution): - cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) - pidx = cuidx.to_pandas() - - pidx_floor = pidx.round(resolution) - cuidx_floor = cuidx.round(resolution) - - assert_eq(pidx_floor, cuidx_floor) - - -@pytest.mark.parametrize( - "data,nan_idx,NA_idx", - [([1, 2, 3, None], None, 3), ([2, 3, np.nan, None], 2, 3)], -) -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): - idx = cudf.Index(data, nan_as_null=nan_as_null) - - if nan_as_null: - if nan_idx is not None: - assert idx[nan_idx] is cudf.NA - else: - if nan_idx is not None: - assert np.isnan(idx[nan_idx]) - - if NA_idx is not None: - assert idx[NA_idx] is cudf.NA - - -@pytest.mark.parametrize( - "index", - [ - pd.Index([]), - pd.Index(["a", "b", "c", "d", 
"e"]), - pd.Index([0, None, 9]), - pd.date_range("2019-01-01", periods=3), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - ["this", "is"], - [0, 19, 13], - ["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02 10:00:00"], - ], -) -def test_isin_index(index, values): - pidx = index - gidx = cudf.Index.from_pandas(pidx) - - is_dt_str = ( - next(iter(values), None) == "2019-01-01 04:00:00" - and len(pidx) - and pidx.dtype.kind == "M" - ) - with expect_warning_if(is_dt_str): - got = gidx.isin(values) - with expect_warning_if(PANDAS_GE_220 and is_dt_str): - expected = pidx.isin(values) - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - pd.MultiIndex.from_arrays( - [[1, 2, 3], ["red", "blue", "green"]], names=("number", "color") - ), - pd.MultiIndex.from_arrays([[], []], names=("number", "color")), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 10, 100], ["red", "blue", "green", "pink", "white"]], - names=("number", "color"), - ), - pd.MultiIndex.from_product( - [[0, 1], ["red", "blue", "green"]], names=("number", "color") - ), - ], -) -@pytest.mark.parametrize( - "values,level,err", - [ - ([(1, "red"), (2, "blue"), (0, "green")], None, None), - (["red", "orange", "yellow"], "color", None), - (["red", "white", "yellow"], "color", None), - ([0, 1, 2, 10, 11, 15], "number", None), - ([0, 1, 2, 10, 11, 15], None, TypeError), - (pd.Series([0, 1, 2, 10, 11, 15]), None, TypeError), - (pd.Index([0, 1, 2, 10, 11, 15]), None, TypeError), - (pd.Index([0, 1, 2, 8, 11, 15]), "number", None), - (pd.Index(["red", "white", "yellow"]), "color", None), - ([(1, "red"), (3, "red")], None, None), - (((1, "red"), (3, "red")), None, None), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3], ["red", "blue", "green"]], - names=("number", "color"), - ), - None, - None, - ), - ( - pd.MultiIndex.from_arrays([[], []], names=("number", "color")), - None, - None, - ), - ( - pd.MultiIndex.from_arrays( - [ - [1, 2, 3, 10, 100], - ["red", "blue", "green", "pink", "white"], - ], - names=("number", "color"), - ), - None, - None, - ), - ], -) -def test_isin_multiindex(data, values, level, err): - pmdx = data - gmdx = cudf.from_pandas(data) - - if err is None: - expected = pmdx.isin(values, level=level) - if isinstance(values, pd.MultiIndex): - values = cudf.from_pandas(values) - got = gmdx.isin(values, level=level) - - assert_eq(got, expected) - else: - assert_exceptions_equal( - lfunc=pmdx.isin, - rfunc=gmdx.isin, - lfunc_args_and_kwargs=([values], {"level": level}), - rfunc_args_and_kwargs=([values], {"level": level}), - check_exception_type=False, - ) - - -@pytest.mark.parametrize( - "rangeindex", - [ - range(np.random.default_rng(seed=0).integers(0, 100)), - range(9, 12, 2), - range(20, 30), - range(100, 1000, 10), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), - ], -) -@pytest.mark.parametrize( - "func", - ["nunique", "min", "max", "any", "values"], -) -def test_rangeindex_methods(rangeindex, func): - gidx = cudf.RangeIndex(rangeindex) - pidx = gidx.to_pandas() - - if func == "values": - expected = pidx.values - actual = gidx.values - else: - expected = getattr(pidx, func)() - actual = getattr(gidx, func)() - - assert_eq(expected, actual) - - -def test_index_constructor_integer(default_integer_bitwidth): - got = cudf.Index([1, 2, 3]) - expect = cudf.Index([1, 2, 3], dtype=f"int{default_integer_bitwidth}") - - assert_eq(expect, got) - - -def test_index_constructor_float(default_float_bitwidth): - got = cudf.Index([1.0, 2.0, 3.0]) - expect = cudf.Index( - [1.0, 2.0, 3.0], 
dtype=f"float{default_float_bitwidth}" - ) - - assert_eq(expect, got) - - -def test_rangeindex_union_default_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for union operation. - idx1 = cudf.RangeIndex(0, 2) - idx2 = cudf.RangeIndex(5, 6) - - expected = cudf.Index([0, 1, 5], dtype=f"int{default_integer_bitwidth}") - actual = idx1.union(idx2) - - assert_eq(expected, actual) - - -def test_rangeindex_intersection_default_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for intersection operation. - idx1 = cudf.RangeIndex(0, 100) - # Intersecting two RangeIndex will _always_ result in a RangeIndex, use - # regular index here to force materializing. - idx2 = cudf.Index([50, 102]) - - expected = cudf.Index([50], dtype=f"int{default_integer_bitwidth}") - actual = idx1.intersection(idx2) - - assert_eq(expected, actual) - - -def test_rangeindex_take_default_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for take operation. - idx = cudf.RangeIndex(0, 100) - actual = idx.take([0, 3, 7, 62]) - expected = cudf.Index( - [0, 3, 7, 62], dtype=f"int{default_integer_bitwidth}" - ) - assert_eq(expected, actual) - - -def test_rangeindex_apply_boolean_mask_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for apply boolean mask operation. - idx = cudf.RangeIndex(0, 8) - mask = [True, True, True, False, False, False, True, False] - actual = idx[mask] - expected = cudf.Index([0, 1, 2, 6], dtype=f"int{default_integer_bitwidth}") - assert_eq(expected, actual) - - -def test_rangeindex_repeat_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for repeat operation. - idx = cudf.RangeIndex(0, 3) - actual = idx.repeat(3) - expected = cudf.Index( - [0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=f"int{default_integer_bitwidth}" - ) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "op, expected, expected_kind", - [ - (lambda idx: 2**idx, [2, 4, 8, 16], "int"), - (lambda idx: idx**2, [1, 4, 9, 16], "int"), - (lambda idx: idx / 2, [0.5, 1, 1.5, 2], "float"), - (lambda idx: 2 / idx, [2, 1, 2 / 3, 0.5], "float"), - (lambda idx: idx % 3, [1, 2, 0, 1], "int"), - (lambda idx: 3 % idx, [0, 1, 0, 3], "int"), - ], -) -def test_rangeindex_binops_user_option( - op, expected, expected_kind, default_integer_bitwidth -): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for binary operation. - idx = cudf.RangeIndex(1, 5) - actual = op(idx) - expected = cudf.Index( - expected, dtype=f"{expected_kind}{default_integer_bitwidth}" - ) - assert_eq( - expected, - actual, - ) - - -@pytest.mark.parametrize( - "op", [operator.add, operator.sub, operator.mul, operator.truediv] -) -def test_rangeindex_binop_diff_names_none(op): - idx1 = cudf.RangeIndex(10, 13, name="foo") - idx2 = cudf.RangeIndex(13, 16, name="bar") - result = op(idx1, idx2) - expected = op(idx1.to_pandas(), idx2.to_pandas()) - assert_eq(result, expected) - assert result.name is None - - -def test_rangeindex_join_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for join. 
-    idx1 = cudf.RangeIndex(0, 10, name="a")
-    idx2 = cudf.RangeIndex(5, 15, name="b")
-
-    actual = idx1.join(idx2, how="inner", sort=True)
-    expected = idx1.to_pandas().join(idx2.to_pandas(), how="inner", sort=True)
-    assert actual.dtype == cudf.dtype(f"int{default_integer_bitwidth}")
-    # exact=False to ignore dtype comparison,
-    # because `default_integer_bitwidth` is cudf only option
-    assert_eq(expected, actual, exact=False)
-
-
-def test_rangeindex_where_user_option(default_integer_bitwidth):
-    # Test that RangeIndex is materialized into 32 bit index under user
-    # configuration for where operation.
-    idx = cudf.RangeIndex(0, 10)
-    mask = [True, False, True, False, True, False, True, False, True, False]
-    actual = idx.where(mask, -1)
-    expected = cudf.Index(
-        [0, -1, 2, -1, 4, -1, 6, -1, 8, -1],
-        dtype=f"int{default_integer_bitwidth}",
-    )
-    assert_eq(expected, actual)
-
-
-def test_rangeindex_append_return_rangeindex():
-    idx = cudf.RangeIndex(0, 10)
-    result = idx.append([])
-    assert_eq(idx, result)
-
-    result = idx.append(cudf.Index([10]))
-    expected = cudf.RangeIndex(0, 11)
-    assert_eq(result, expected)
-
-
-@pytest.mark.parametrize(
-    "index",
-    [
-        range(np.random.default_rng(seed=0).integers(0, 100)),
-        range(0, 10, -2),
-        range(0, -10, 2),
-        range(0, -10, -2),
-        range(0, 1),
-        [1, 2, 3, 1, None, None],
-        [None, None, 3.2, 1, None, None],
-        [None, "a", "3.2", "z", None, None],
-        pd.Series(["a", "b", None], dtype="category"),
-        np.array([1, 2, 3, None], dtype="datetime64[s]"),
-    ],
-)
-@pytest.mark.parametrize(
-    "func",
-    [
-        "to_series",
-        "isna",
-        "notna",
-        "append",
-    ],
-)
-def test_index_methods(index, func):
-    gidx = cudf.Index(index)
-    pidx = gidx.to_pandas()
-
-    if func == "append":
-        expected = pidx.append(other=pidx)
-        actual = gidx.append(other=gidx)
-    else:
-        expected = getattr(pidx, func)()
-        actual = getattr(gidx, func)()
-
-    assert_eq(expected, actual)
-
-
-@pytest.mark.parametrize(
-    "idx, values",
-    [
-        (range(100, 1000, 10), [200, 600, 800]),
-        ([None, "a", "3.2", "z", None, None], ["a", "z"]),
-        (pd.Series(["a", "b", None], dtype="category"), [10, None]),
-    ],
-)
-def test_index_isin_values(idx, values):
-    gidx = cudf.Index(idx)
-    pidx = gidx.to_pandas()
-
-    actual = gidx.isin(values)
-    expected = pidx.isin(values)
-
-    assert_eq(expected, actual)
-
-
-@pytest.mark.parametrize(
-    "idx, scalar",
-    [
-        (range(0, -10, -2), -4),
-        ([None, "a", "3.2", "z", None, None], "x"),
-        (pd.Series(["a", "b", None], dtype="category"), 10),
-    ],
-)
-def test_index_isin_scalar_values(idx, scalar):
-    gidx = cudf.Index(idx)
-
-    with pytest.raises(
-        TypeError,
-        match=re.escape(
-            f"only list-like objects are allowed to be passed "
-            f"to isin(), you passed a {type(scalar).__name__}"
-        ),
-    ):
-        gidx.isin(scalar)
-
-
-def test_index_any():
-    gidx = cudf.Index([1, 2, 3])
-    pidx = gidx.to_pandas()
-
-    assert_eq(pidx.any(), gidx.any())
-
-
-def test_index_values():
-    gidx = cudf.Index([1, 2, 3])
-    pidx = gidx.to_pandas()
-
-    assert_eq(pidx.values, gidx.values)
-
-
-def test_index_null_values():
-    gidx = cudf.Index([1.0, None, 3, 0, None])
-    with pytest.raises(ValueError):
-        gidx.values
-
-
-def test_index_error_list_index():
-    s = cudf.Series([[1, 2], [2], [4]])
-    with pytest.raises(
-        NotImplementedError,
-        match=re.escape(
-            "Unsupported column type passed to create an "
-            "Index: <class 'cudf.core.column.lists.ListColumn'>"
-        ),
-    ):
-        cudf.Index(s)
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [1, 2, 3],
-        pytest.param(
-            [np.nan, 10, 15, 16],
-            marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/49818" - ), - ), - range(0, 10), - [np.nan, None, 10, 20], - ["ab", "zx", "pq"], - ["ab", "zx", None, "pq"], - ], -) -def test_index_hasnans(data): - gs = cudf.Index(data, nan_as_null=False) - if isinstance(gs, cudf.RangeIndex): - with pytest.raises(NotImplementedError): - gs.to_pandas(nullable=True) - else: - ps = gs.to_pandas(nullable=True) - # Check type to avoid mixing Python bool and NumPy bool - assert isinstance(gs.hasnans, bool) - assert gs.hasnans == ps.hasnans - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 1, 1, 3, 2, 3], - [np.nan, 10, 15, 16, np.nan, 10, 16], - range(0, 10), - ["ab", "zx", None, "pq", "ab", None, "zx", None], - ], -) -@pytest.mark.parametrize("keep", ["first", "last", False]) -def test_index_duplicated(data, keep): - gs = cudf.Index(data) - ps = gs.to_pandas() - - expected = ps.duplicated(keep=keep) - actual = gs.duplicated(keep=keep) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,expected_dtype", - [ - ([10, 11, 12], pd.Int64Dtype()), - ([0.1, 10.2, 12.3], pd.Float64Dtype()), - (["abc", None, "def"], pd.StringDtype()), - ], -) -def test_index_to_pandas_nullable(data, expected_dtype): - gi = cudf.Index(data) - pi = gi.to_pandas(nullable=True) - expected = pd.Index(data, dtype=expected_dtype) - - assert_eq(pi, expected) - - -@pytest.mark.parametrize( - "index_values", - [range(1, 10, 2), [1, 2, 3], ["a", "b", "c"], [1.5, 2.5, 3.5]], -) -@pytest.mark.parametrize("i_type", [int, np.int8, np.int32, np.int64]) -def test_scalar_getitem(index_values, i_type): - i = i_type(1) - index = cudf.Index(index_values) - - assert not isinstance(index[i], cudf.Index) - assert index[i] == index_values[i] - assert_eq(index, index.to_pandas()) - - -@pytest.mark.parametrize( - "data", - [ - [ - pd.Timestamp("1970-01-01 00:00:00.000000001"), - pd.Timestamp("1970-01-01 00:00:00.000000002"), - 12, - 20, - ], - [ - pd.Timedelta(10), - pd.Timedelta(20), - 12, - 20, - ], - [1, 2, 3, 4], - ], -) -def test_index_mixed_dtype_error(data): - pi = pd.Index(data, dtype="object") - with pytest.raises(TypeError): - cudf.Index(pi) - - -@pytest.mark.parametrize("cls", [pd.DatetimeIndex, pd.TimedeltaIndex]) -def test_index_date_duration_freq_error(cls): - s = cls([1, 2, 3], freq="infer") - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.Index(s) - - -@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) -def test_index_getitem_time_duration(dtype): - gidx = cudf.Index([1, 2, 3, 4, None], dtype=dtype) - pidx = gidx.to_pandas() - with cudf.option_context("mode.pandas_compatible", True): - for i in range(len(gidx)): - if i == 4: - assert gidx[i] is pidx[i] - else: - assert_eq(gidx[i], pidx[i]) - - -@pytest.mark.parametrize("dtype", ALL_TYPES) -def test_index_empty_from_pandas(dtype): - pidx = pd.Index([], dtype=dtype) - gidx = cudf.from_pandas(pidx) - - assert_eq(pidx, gidx) - - -def test_empty_index_init(): - pidx = pd.Index([]) - gidx = cudf.Index([]) - - assert_eq(pidx, gidx) - - -@pytest.mark.parametrize( - "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] -) -@pytest.mark.parametrize("data_name", [None, 1, "abc"]) -@pytest.mark.parametrize("index", [True, False]) -@pytest.mark.parametrize("name", [None, no_default, 1, "abc"]) -def test_index_to_frame(data, data_name, index, name): - pidx = pd.Index(data, name=data_name) - gidx = cudf.from_pandas(pidx) - - expected = pidx.to_frame(index=index, name=name) - actual = 
gidx.to_frame(index=index, name=name) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [[1, 2, 3], range(0, 10)]) -@pytest.mark.parametrize("dtype", ["str", "int64", "float64"]) -def test_index_with_index_dtype(data, dtype): - pidx = pd.Index(data) - gidx = cudf.Index(data) - - expected = pd.Index(pidx, dtype=dtype) - actual = cudf.Index(gidx, dtype=dtype) - - assert_eq(expected, actual) - - -def test_period_index_error(): - pidx = pd.PeriodIndex(data=[pd.Period("2020-01")]) - with pytest.raises(NotImplementedError): - cudf.from_pandas(pidx) - with pytest.raises(NotImplementedError): - cudf.Index(pidx) - with pytest.raises(NotImplementedError): - cudf.Series(pidx) - with pytest.raises(NotImplementedError): - cudf.Series(pd.Series(pidx)) - with pytest.raises(NotImplementedError): - cudf.Series(pd.array(pidx)) - - -@pytest.mark.parametrize("value", [cudf.DataFrame(range(1)), 11]) -def test_index_from_dataframe_scalar_raises(value): - with pytest.raises(TypeError): - cudf.Index(value) - - -@pytest.mark.parametrize("idx", [0, np.int64(0)]) -def test_index_getitem_from_int(idx): - result = cudf.Index([1, 2])[idx] - assert result == 1 - - -@pytest.mark.parametrize("idx", [1.5, True, "foo"]) -def test_index_getitem_from_nonint_raises(idx): - with pytest.raises(ValueError): - cudf.Index([1, 2])[idx] - - -@pytest.mark.parametrize( - "data", - [ - cp.ones(5, dtype=cp.float16), - np.ones(5, dtype="float16"), - pd.Series([0.1, 1.2, 3.3], dtype="float16"), - pytest.param( - pa.array(np.ones(5, dtype="float16")), - marks=pytest.mark.xfail( - reason="https://issues.apache.org/jira/browse/ARROW-13762" - ), - ), - ], -) -def test_index_raises_float16(data): - with pytest.raises(TypeError): - cudf.Index(data) - - -def test_from_pandas_rangeindex_return_rangeindex(): - pidx = pd.RangeIndex(start=3, stop=9, step=3, name="a") - result = cudf.Index.from_pandas(pidx) - expected = cudf.RangeIndex(start=3, stop=9, step=3, name="a") - assert_eq(result, expected, exact=True) - - -@pytest.mark.parametrize( - "data", - [ - range(1), - np.array([1, 2], dtype="datetime64[ns]"), - np.array([1, 2], dtype="timedelta64[ns]"), - ], -) -def test_index_to_pandas_nullable_notimplemented(data): - idx = cudf.Index(data) - with pytest.raises(NotImplementedError): - idx.to_pandas(nullable=True) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - pd.Interval(1, 2), - ], -) -def test_index_to_pandas_arrow_type_nullable_raises(scalar): - data = [scalar, None] - idx = cudf.Index(data) - with pytest.raises(ValueError): - idx.to_pandas(nullable=True, arrow_type=True) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - ], -) -def test_index_to_pandas_arrow_type(scalar): - pa_array = pa.array([scalar, None]) - idx = cudf.Index(pa_array) - result = idx.to_pandas(arrow_type=True) - expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) - pd.testing.assert_index_equal(result, expected) - - -@pytest.mark.parametrize("data", [range(-3, 3), range(1, 3), range(0)]) -def test_rangeindex_all(data): - result = cudf.RangeIndex(data).all() - expected = cudf.Index(list(data)).all() - assert result == expected - - -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("data", [range(2), range(2, -1, -1)]) -def test_rangeindex_factorize(sort, data): - res_codes, res_uniques = cudf.RangeIndex(data).factorize(sort=sort) - exp_codes, exp_uniques = 
cudf.Index(list(data)).factorize(sort=sort) - assert_eq(res_codes, exp_codes) - assert_eq(res_uniques, exp_uniques) - - -def test_rangeindex_dropna(): - ri = cudf.RangeIndex(range(2)) - result = ri.dropna() - expected = ri.copy() - assert_eq(result, expected) - - -def test_rangeindex_unique_shallow_copy(): - ri_pandas = pd.RangeIndex(1) - result = ri_pandas.unique() - assert result is not ri_pandas - - ri_cudf = cudf.RangeIndex(1) - result = ri_cudf.unique() - assert result is not ri_cudf - assert_eq(result, ri_cudf) - - -def test_rename_shallow_copy(): - idx = pd.Index([1]) - result = idx.rename("a") - assert idx.to_numpy(copy=False) is result.to_numpy(copy=False) - - idx = cudf.Index([1]) - result = idx.rename("a") - assert idx._column is result._column - - -@pytest.mark.parametrize("data", [range(2), [10, 11, 12]]) -def test_index_contains_hashable(data): - gidx = cudf.Index(data) - pidx = gidx.to_pandas() - - assert_exceptions_equal( - lambda: [] in gidx, - lambda: [] in pidx, - lfunc_args_and_kwargs=((),), - rfunc_args_and_kwargs=((),), - ) - - -@pytest.mark.parametrize("data", [[0, 1, 2], [1.1, 2.3, 4.5]]) -@pytest.mark.parametrize("dtype", ["int32", "float32", "float64"]) -@pytest.mark.parametrize("needle", [0, 1, 2.3]) -def test_index_contains_float_int(data, dtype, needle): - gidx = cudf.Index(data=data, dtype=dtype) - pidx = gidx.to_pandas() - - actual = needle in gidx - expected = needle in pidx - - assert_eq(actual, expected) - - -def test_Index_init_with_nans(): - with cudf.option_context("mode.pandas_compatible", True): - gi = cudf.Index([1, 2, 3, np.nan]) - assert gi.dtype == np.dtype("float64") - pi = pd.Index([1, 2, 3, np.nan]) - assert_eq(pi, gi) - - -def test_index_datetime_repeat(): - gidx = cudf.date_range("2021-01-01", periods=3, freq="D") - pidx = gidx.to_pandas() - - actual = gidx.repeat(5) - expected = pidx.repeat(5) - - assert_eq(actual, expected) - - actual = gidx.to_frame().repeat(5) - - assert_eq(actual.index, expected) - - -@pytest.mark.parametrize( - "index", - [ - lambda: cudf.Index([1]), - lambda: cudf.RangeIndex(1), - lambda: cudf.MultiIndex(levels=[[0]], codes=[[0]]), + lambda: cudf.Index([1]), + lambda: cudf.RangeIndex(1), + lambda: cudf.MultiIndex(levels=[[0]], codes=[[0]]), ], ) def test_index_assignment_no_shallow_copy(index): @@ -3189,43 +163,3 @@ def test_index_assignment_no_shallow_copy(index): df = cudf.DataFrame(range(1)) df.index = index assert df.index is index - - -def test_bool_rangeindex_raises(): - assert_exceptions_equal( - lfunc=bool, - rfunc=bool, - lfunc_args_and_kwargs=[[pd.RangeIndex(0)]], - rfunc_args_and_kwargs=[[cudf.RangeIndex(0)]], - ) - - -@pytest.mark.parametrize("ordered", [True, False]) -@pytest.mark.parametrize("name", [None, "test"]) -def test_categoricalindex_from_codes(ordered, name): - codes = [0, 1, 2, 3, 4] - categories = ["a", "b", "c", "d", "e"] - result = cudf.CategoricalIndex.from_codes(codes, categories, ordered, name) - expected = pd.CategoricalIndex( - pd.Categorical.from_codes(codes, categories, ordered=ordered), - name=name, - ) - assert_eq(result, expected) - - -@pytest.mark.parametrize("klass", [cudf.RangeIndex, pd.RangeIndex]) -@pytest.mark.parametrize("name_inner", [None, "a"]) -@pytest.mark.parametrize("name_outer", [None, "b"]) -def test_rangeindex_accepts_rangeindex(klass, name_inner, name_outer): - result = cudf.RangeIndex(klass(range(1), name=name_inner), name=name_outer) - expected = pd.RangeIndex( - pd.RangeIndex(range(1), name=name_inner), name=name_outer - ) - assert_eq(result, expected) - - -def 
test_roundtrip_index_plc_column(): - index = cudf.Index([1]) - expect = cudf.Index(index) - actual = cudf.Index.from_pylibcudf(*expect.to_pylibcudf()) - assert_eq(expect, actual) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 6d373f56b14..c26527f8282 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -360,16 +360,6 @@ def test_dataframe_loc(scalar, step): assert_eq(df.loc[np.array([0])], pdf.loc[np.array([0])]) -def test_dataframe_loc_duplicate_index_scalar(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2]) - gdf = cudf.DataFrame.from_pandas(pdf) - - pdf_sorted = pdf.sort_values(by=list(pdf.columns), axis=0) - gdf_sorted = gdf.sort_values(by=list(gdf.columns), axis=0) - - assert_eq(pdf_sorted, gdf_sorted) - - @pytest.mark.parametrize( "mask", [[True, False, False, False, False], [True, False, True, False, True]], @@ -701,78 +691,6 @@ def test_dataframe_iloc_index_error(): gdf.iloc[nelem * 2] -@pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) -def test_dataframe_take(ntake): - rng = np.random.default_rng(seed=0) - nelem = 123 - df = cudf.DataFrame( - { - "ii": rng.integers(0, 20, nelem), - "ff": rng.random(nelem), - } - ) - - take_indices = rng.integers(0, len(df), ntake) - - actual = df.take(take_indices) - expected = df.to_pandas().take(take_indices) - - assert actual.ii.null_count == 0 - assert actual.ff.null_count == 0 - assert_eq(actual, expected) - - -@pytest.mark.parametrize("ntake", [1, 2, 8, 9]) -def test_dataframe_take_with_multiindex(ntake): - rng = np.random.default_rng(seed=0) - df = cudf.DataFrame( - index=cudf.MultiIndex( - levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], - codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - ) - ) - - nelem = 9 - df["ii"] = rng.integers(0, 20, nelem) - df["ff"] = rng.random(nelem) - - take_indices = rng.integers(0, len(df), ntake) - - actual = df.take(take_indices) - expected = df.to_pandas().take(take_indices) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) -def test_series_take(ntake): - rng = np.random.default_rng(seed=0) - nelem = 123 - - psr = pd.Series(rng.integers(0, 20, nelem)) - gsr = cudf.Series(psr) - - take_indices = rng.integers(0, len(gsr), ntake) - - actual = gsr.take(take_indices) - expected = psr.take(take_indices) - - assert_eq(actual, expected) - - -def test_series_take_positional(): - psr = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) - - gsr = cudf.Series.from_pandas(psr) - - take_indices = [1, 2, 0, 3] - - expect = psr.take(take_indices) - got = gsr.take(take_indices) - - assert_eq(expect, got) - - @pytest.mark.parametrize("nelem", [0, 1, 5, 20, 100]) @pytest.mark.parametrize("slice_start", [None, 0, 1, 3, 10, -10]) @pytest.mark.parametrize("slice_end", [None, 0, 1, 30, 50, -1]) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b612e20a17f..edb7fc3ec9b 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1,10 +1,7 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
-import itertools import operator -import pickle from contextlib import contextmanager -from io import BytesIO import cupy as cp import numpy as np @@ -13,7 +10,7 @@ import cudf from cudf.core.column import as_column -from cudf.testing import assert_eq, assert_neq +from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal, expect_warning_if @@ -172,15 +169,6 @@ def test_series_multiindex(pdfIndex): assert_eq(ps, gs) -def test_multiindex_getitem(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf = pdf.copy(deep=False) - gdf = gdf.copy(deep=False) - pdf.index = pdfIndex - gdf.index = gdfIndex - assert_eq(pdf.index[0], gdf.index[0]) - - @pytest.mark.parametrize( "key_tuple", [ @@ -386,150 +374,6 @@ def test_multiindex_index_and_columns(): assert_eq(pdf, gdf) -def test_multiindex_multiple_groupby(): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame( - { - "a": [4, 17, 4, 9, 5], - "b": [1, 4, 4, 3, 2], - "x": rng.normal(size=5), - } - ) - gdf = cudf.DataFrame.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"], sort=True).sum() - gdg = gdf.groupby(["a", "b"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["a", "b"], sort=True).x.sum() - gdg = gdf.groupby(["a", "b"], sort=True).x.sum() - assert_eq(pdg, gdg) - - -@pytest.mark.parametrize( - "func", - [ - lambda df: df.groupby(["x", "y"], sort=True).z.sum(), - lambda df: df.groupby(["x", "y"], sort=True).sum(), - ], -) -def test_multi_column(func): - rng = np.random.default_rng(seed=0) - pdf = pd.DataFrame( - { - "x": rng.integers(0, 5, size=1000), - "y": rng.integers(0, 10, size=1000), - "z": rng.normal(size=1000), - } - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - a = func(pdf) - b = func(gdf) - - assert_eq(a, b) - - -def test_multiindex_equality(): - # mi made from groupby - # mi made manually to be identical - # are they equal? - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} - ) - mi1 = gdf.groupby(["x", "y"], sort=True).mean().index - mi2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1, mi2) - - # mi made from two groupbys, are they equal? - mi2 = gdf.groupby(["x", "y"], sort=True).max().index - assert_eq(mi1, mi2) - - # mi made manually twice are they equal? - mi1 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - mi2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1, mi2) - - # mi made from different groupbys are they not equal? - mi1 = gdf.groupby(["x", "y"]).mean().index - mi2 = gdf.groupby(["x", "z"]).mean().index - assert_neq(mi1, mi2) - - # mi made from different manuals are they not equal? - mi1 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - mi2 = cudf.MultiIndex( - levels=[[0, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_neq(mi1, mi2) - - -def test_multiindex_equals(): - # mi made from groupby - # mi made manually to be identical - # are they equal? 
- gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} - ) - mi1 = gdf.groupby(["x", "y"], sort=True).mean().index - mi2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1.equals(mi2), True) - - # mi made from two groupbys, are they equal? - mi2 = gdf.groupby(["x", "y"], sort=True).max().index - assert_eq(mi1.equals(mi2), True) - - # mi made manually twice are they equal? - mi1 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - mi2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1.equals(mi2), True) - - # mi made from different groupbys are they not equal? - mi1 = gdf.groupby(["x", "y"], sort=True).mean().index - mi2 = gdf.groupby(["x", "z"], sort=True).mean().index - assert_eq(mi1.equals(mi2), False) - - # mi made from different manuals are they not equal? - mi1 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - mi2 = cudf.MultiIndex( - levels=[[0, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1.equals(mi2), False) - - @pytest.mark.parametrize( "iloc_rows", [ @@ -649,52 +493,6 @@ def test_multicolumn_item(): assert_eq(gdgT[(0, 0)], pdgT[(0, 0)]) -def test_multiindex_reset_index(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf = pdf.copy(deep=False) - gdf = gdf.copy(deep=False) - pdf.index = pdfIndex - gdf.index = gdfIndex - assert_eq(pdf.reset_index(), gdf.reset_index()) - - -def test_multiindex_groupby_reset_index(): - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} - ) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"], sort=True).sum() - pdg = pdf.groupby(["x", "y"], sort=True).sum() - assert_eq(pdg.reset_index(), gdg.reset_index()) - - -def test_multicolumn_reset_index(): - gdf = cudf.DataFrame({"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5]}) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["x"], sort=True).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x"], sort=True).agg({"y": ["count", "mean"]}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x"], sort=True).agg({"y": ["count"]}) - pdg = pdf.groupby(["x"], sort=True).agg({"y": ["count"]}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x"], sort=True).agg({"y": "count"}) - pdg = pdf.groupby(["x"], sort=True).agg({"y": "count"}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - - -def test_multiindex_multicolumn_reset_index(): - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [1, 2, 3, 4, 5]} - ) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"], sort=True).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x", "y"], sort=True).agg({"y": ["count", "mean"]}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x", "z"], sort=True).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x", "z"], sort=True).agg({"y": ["count", "mean"]}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - - def test_groupby_multiindex_columns_from_pandas(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) pdf = pdf.copy(deep=False) @@ -780,48 +578,6 @@ def 
test_multicolumn_set_item(pdf, pdfIndex): assert_eq(pdf, gdf) -@pytest.mark.parametrize( - "key", - [0, 1, [], [0, 1], slice(None), slice(0, 0), slice(0, 1), slice(0, 2)], -) -def test_multiindex_indexing(key): - gi = cudf.MultiIndex.from_frame( - cudf.DataFrame({"a": [1, 2, 3], "b": [True, False, False]}) - ) - pi = gi.to_pandas() - - assert_eq(gi[key], pi[key], exact=False) - - -@pytest.mark.parametrize( - "names", - [ - ["a", "b", "c"], - [None, None, None], - ["aa", "aa", "aa"], - ["bb", "aa", "aa"], - None, - ], -) -def test_pickle_roundtrip_multiindex(names): - df = cudf.DataFrame( - { - "one": [1, 2, 3], - "two": [True, False, True], - "three": ["ab", "cd", "ef"], - "four": [0.2, 0.1, -10.2], - } - ) - expected_df = df.set_index(["one", "two", "three"]) - expected_df.index.names = names - local_file = BytesIO() - - pickle.dump(expected_df, local_file) - local_file.seek(0) - actual_df = pickle.load(local_file) - assert_eq(expected_df, actual_df) - - def test_multiindex_index_single_row(): arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] tuples = list(zip(*arrays, strict=True)) @@ -834,29 +590,6 @@ def test_multiindex_index_single_row(): assert_eq(pdf.loc[("b", 3)], gdf.loc[("b", 3)]) -@pytest.mark.parametrize( - "levels", - itertools.chain.from_iterable( - itertools.permutations(range(3), n) for n in range(1, 4) - ), - ids=str, -) -def test_multiindex_sort_index_partial(levels): - df = pd.DataFrame( - { - "a": [3, 3, 3, 1, 1, 1, 2, 2], - "b": [4, 2, 7, -1, 11, -2, 7, 7], - "c": [4, 4, 2, 3, 3, 3, 1, 1], - "val": [1, 2, 3, 4, 5, 6, 7, 8], - } - ).set_index(["a", "b", "c"]) - cdf = cudf.from_pandas(df) - - expect = df.sort_index(level=levels, sort_remaining=True) - got = cdf.sort_index(level=levels, sort_remaining=True) - assert_eq(expect, got) - - @pytest.mark.parametrize("idx_get", [(0, 0), (0, 1), (1, 0), (1, 1)]) @pytest.mark.parametrize("cols_get", [0, 1, [0, 1], [1, 0], [1], [0]]) def test_multiindex_loc_scalar(idx_get, cols_get): From 92626499193a8d63710d1de0da0bd02fd2bf10b6 Mon Sep 17 00:00:00 2001 From: Peixin Date: Thu, 4 Sep 2025 12:08:38 +0800 Subject: [PATCH 255/366] Update boost version to 1.79 for JNI dockerfile (#19883) Fix https://github.com/rapidsai/cudf/issues/19879 The default boost-devel (1.66.x) is incompatible with the recent arrow update. This change, which is ported from JNI repo, has been verified internally. Authors: - Peixin (https://github.com/pxLi) Approvers: - Tim Liu (https://github.com/NvTimLiu) URL: https://github.com/rapidsai/cudf/pull/19883 --- java/ci/Dockerfile.rocky | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/java/ci/Dockerfile.rocky b/java/ci/Dockerfile.rocky index d6eff8125fa..fbb687ce908 100644 --- a/java/ci/Dockerfile.rocky +++ b/java/ci/Dockerfile.rocky @@ -30,7 +30,7 @@ ARG TOOLSET_VERSION=14 ARG CMAKE_VERSION=3.30.7 ARG CCACHE_VERSION=4.11.2 ### Install basic requirements -RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-11 gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build boost-devel +RUN dnf --enablerepo=powertools install -y scl-utils gcc-toolset-11 gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins RUN mkdir /usr/local/rapids /rapids && chmod 777 /usr/local/rapids /rapids @@ -58,5 +58,15 @@ RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v cd ../.. 
&& \
    rm -rf ccache-${CCACHE_VERSION}

+## install a version of boost that is needed for arrow/parquet to work
+RUN cd /usr/local && wget --quiet https://archives.boost.io/release/1.79.0/source/boost_1_79_0.tar.gz && \
+    tar -xzf boost_1_79_0.tar.gz && \
+    rm boost_1_79_0.tar.gz && \
+    cd boost_1_79_0 && \
+    ./bootstrap.sh --prefix=/usr/local && \
+    ./b2 install --prefix=/usr/local --with-filesystem --with-system && \
+    cd /usr/local && \
+    rm -rf boost_1_79_0
+
 # disable cuda container constraints to allow running w/ elder drivers on data-center GPUs
 ENV NVIDIA_DISABLE_REQUIRE="true"

From 9f2fe17b49a0e4e31421575f23382b8825e0ab36 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu, 4 Sep 2025 05:47:07 -0500
Subject: [PATCH 256/366] Skip flaky stats tests pending follow up (#19881)

Skips a few flaky tests while we investigate; xref
https://github.com/rapidsai/cudf/issues/19880

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19881
---
 python/cudf/cudf/tests/private_objects/test_nrt_stats.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf/cudf/tests/private_objects/test_nrt_stats.py b/python/cudf/cudf/tests/private_objects/test_nrt_stats.py
index e951374f5a0..e65aa85911e 100644
--- a/python/cudf/cudf/tests/private_objects/test_nrt_stats.py
+++ b/python/cudf/cudf/tests/private_objects/test_nrt_stats.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.
+import pytest
 from numba import config
 from numba.cuda.memory_management.nrt import rtsys

@@ -14,6 +15,7 @@
 from cudf.utils._numba import _CUDFNumbaConfig


+@pytest.mark.skip(reason="https://github.com/rapidsai/cudf/issues/19880")
 def test_string_udf_basic(monkeypatch):
     monkeypatch.setattr(config, "CUDA_NRT_STATS", True)

@@ -34,6 +36,7 @@ def double(st):
     assert stats.alloc - stats.free == 0


+@pytest.mark.skip(reason="https://github.com/rapidsai/cudf/issues/19880")
 def test_string_udf_conditional_allocations(monkeypatch):
     monkeypatch.setattr(config, "CUDA_NRT_STATS", True)

@@ -54,6 +57,7 @@ def double(st):
     assert after_stats.alloc - before_stats.free == 1


+@pytest.mark.skip(reason="https://github.com/rapidsai/cudf/issues/19880")
 def test_string_udf_free_kernel(monkeypatch):
     monkeypatch.setattr(config, "CUDA_NRT_STATS", True)

From 99aed37ece502444e3573b385cc539482089a5ef Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 4 Sep 2025 11:04:45 -0400
Subject: [PATCH 257/366] Fix strings::find_instance warp parallel logic (#19845)

Fixes the `find_instance_warp_parallel_fn` kernel to ensure that all
threads in the warp participate in the cooperative-group functions
appropriately.
The failure was found only on a CUDA 13 build running on an sm_75
system. All `compute-sanitizer` tool types passed on this kernel.
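
For background: cooperative-groups collectives such as `cg::reduce` and
`cg::inclusive_scan` must be reached by every lane of the warp. The old loop
bound (`itr + d_target.size_bytes() <= end`) depended on each lane's own
iterator, so higher lanes could exit the loop while lower lanes were still
calling the collectives. Below is a minimal sketch of the safe pattern the fix
uses, with a warp-uniform loop bound and a neutral contribution from
out-of-range lanes (the kernel and names here are illustrative, not the actual
cudf code):

```cpp
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace cg = cooperative_groups;

// Count occurrences of `target` using a single warp. The loop bound is
// uniform across all 32 lanes, so every lane reaches cg::reduce on every
// iteration; lanes past the end of the data contribute 0 instead of
// exiting the loop early.
__global__ void warp_count(char const* data, int size, char target, int* out)
{
  auto const warp = cg::tiled_partition<32>(cg::this_thread_block());
  int total       = 0;
  for (int base = 0; base < size; base += 32) {  // warp-uniform trip count
    int const i     = base + warp.thread_rank();
    int const found = (i < size && data[i] == target) ? 1 : 0;
    total += cg::reduce(warp, found, cg::plus<int>());  // all lanes participate
  }
  if (warp.thread_rank() == 0) { *out = total; }
}
```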
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/19845 --- cpp/src/strings/search/find_instance.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/src/strings/search/find_instance.cu b/cpp/src/strings/search/find_instance.cu index 79dd54ad2aa..012059b1cb5 100644 --- a/cpp/src/strings/search/find_instance.cu +++ b/cpp/src/strings/search/find_instance.cu @@ -63,23 +63,27 @@ CUDF_KERNEL void find_instance_warp_parallel_fn(column_device_view const d_strin auto const max_pos = d_str.size_bytes(); size_type char_pos = max_pos; size_type char_count = 0; - size_type count = 0; - for (auto itr = begin + lane_idx; itr + d_target.size_bytes() <= end; - itr += cudf::detail::warp_size) { - size_type const is_char = !is_utf8_continuation_char(*itr); - size_type const found = is_char && (d_target.compare(itr, d_target.size_bytes()) == 0); + size_type byte_count = 0; + size_type offset = 0; + auto itr = begin + lane_idx; + while (byte_count + d_target.size_bytes() <= d_str.size_bytes()) { + size_type const is_char = + (itr + d_target.size_bytes() <= end) && !is_utf8_continuation_char(*itr); + size_type const found = is_char && (d_target.compare(itr, d_target.size_bytes()) == 0); // count of threads that matched in this warp and produce an offset in each thread auto const found_count = cg::reduce(warp, found, cg::plus()); auto const found_scan = cg::inclusive_scan(warp, found); // handy character counter for threads in this warp auto const chars_scan = cg::exclusive_scan(warp, is_char); // activate the thread where we hit the desired find instance - auto const found_pos = (found_scan + count) == (instance + 1) ? chars_scan : char_pos; + auto const found_pos = (found_scan + offset) == (instance + 1) ? chars_scan : char_pos; // copy the position value for that thread into all warp threads char_pos = cg::reduce(warp, found_pos, cg::less()); if (char_pos < max_pos) { break; } // all threads will stop - count += found_count; // otherwise continue with the next set + offset += found_count; // otherwise continue with the next set char_count += cg::reduce(warp, is_char, cg::plus()); + itr += cudf::detail::warp_size; + byte_count += cudf::detail::warp_size; } // output the position if an instance match has been found From 9d7afc74e0ecb009ab5396b62839faf5c4aa7d15 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 4 Sep 2025 11:47:19 -0700 Subject: [PATCH 258/366] Move prefetching out of experimental and simplify the API (#19875) We have been using prefetching in cudf.pandas for almost a year now, and for a shorter time in cudf-polars, both without issue. We have not modified the places where we prefetch or how we use the APIs in a long time. This PR therefore moves the libcudf prefetching logic out of an experimental namespace. It also simplifies the APIs for enabling prefetching, removing per-key control and instead making it an all-or-nothing choice. I've marked the PR as breaking, but since we are changing experimental features we do not need to provide any sort of compatibility shim. 
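
For reference, the simplified C++ surface after this change is four free
functions in `cudf/utilities/prefetch.hpp`. A minimal usage sketch (the
wrapper function here is illustrative; prefetching only has an effect when
allocations come from a managed memory resource):

```cpp
#include <cudf/utilities/prefetch.hpp>

void configure_prefetching(bool verbose)
{
  cudf::prefetch::enable();  // all-or-nothing; per-key control is gone
  if (verbose) {
    cudf::prefetch::enable_debugging();  // log each prefetch to stderr
  }
  // ... run libcudf work backed by managed memory ...
  cudf::prefetch::disable_debugging();
  cudf::prefetch::disable();
}
```

The Python bindings mirror this in `pylibcudf.prefetch` (`enable`, `disable`,
`enable_debugging`, `disable_debugging`), which is what `cudf.pandas` and
`cudf-polars` now call.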
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/19875 --- cpp/include/cudf/column/column_view.hpp | 1 - cpp/include/cudf/strings/detail/gather.cuh | 5 +- .../cudf/strings/detail/strings_children.cuh | 2 +- cpp/include/cudf/utilities/prefetch.hpp | 95 +++++-------------- cpp/src/column/column_view.cpp | 10 +- cpp/src/join/hash_join.cu | 5 +- cpp/src/utilities/prefetch.cpp | 84 +++++++--------- python/cudf/cudf/pandas/__init__.py | 47 ++++----- python/cudf_polars/cudf_polars/callback.py | 11 +-- python/pylibcudf/pylibcudf/CMakeLists.txt | 2 +- python/pylibcudf/pylibcudf/__init__.pxd | 4 +- python/pylibcudf/pylibcudf/__init__.py | 4 +- python/pylibcudf/pylibcudf/experimental.pxd | 10 -- python/pylibcudf/pylibcudf/experimental.pyi | 5 - python/pylibcudf/pylibcudf/experimental.pyx | 44 --------- .../pylibcudf/libcudf/experimental.pxd | 16 ---- .../pylibcudf/pylibcudf/libcudf/prefetch.pxd | 8 ++ python/pylibcudf/pylibcudf/prefetch.pxd | 9 ++ python/pylibcudf/pylibcudf/prefetch.pyi | 6 ++ python/pylibcudf/pylibcudf/prefetch.pyx | 25 +++++ 20 files changed, 141 insertions(+), 252 deletions(-) delete mode 100644 python/pylibcudf/pylibcudf/experimental.pxd delete mode 100644 python/pylibcudf/pylibcudf/experimental.pyi delete mode 100644 python/pylibcudf/pylibcudf/experimental.pyx delete mode 100644 python/pylibcudf/pylibcudf/libcudf/experimental.pxd create mode 100644 python/pylibcudf/pylibcudf/libcudf/prefetch.pxd create mode 100644 python/pylibcudf/pylibcudf/prefetch.pxd create mode 100644 python/pylibcudf/pylibcudf/prefetch.pyi create mode 100644 python/pylibcudf/pylibcudf/prefetch.pyx diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 11180e8c339..2c6a14739ea 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 73b53dbe9be..288c0e0a45c 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -233,7 +233,7 @@ rmm::device_uvector gather_chars(StringIterator strings_begin, if (output_count == 0) return rmm::device_uvector(0, stream, mr); auto chars_data = rmm::device_uvector(chars_bytes, stream, mr); - cudf::experimental::prefetch::detail::prefetch("gather", chars_data, stream); + cudf::prefetch::detail::prefetch(chars_data, stream); auto d_chars = chars_data.data(); constexpr int warps_per_threadblock = 4; @@ -316,8 +316,7 @@ std::unique_ptr gather(strings_column_view const& strings, // build chars column auto const offsets_view = cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view()); - cudf::experimental::prefetch::detail::prefetch( - "gather", strings.chars_begin(stream), strings.chars_size(stream), stream); + cudf::prefetch::detail::prefetch(strings.chars_begin(stream), strings.chars_size(stream), stream); auto out_chars_data = gather_chars( d_strings->begin(), begin, end, offsets_view, total_bytes, stream, mr); diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 3402abd9d36..c378ba97309 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ 
b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -260,7 +260,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, // Now build the chars column rmm::device_uvector chars(bytes, stream, mr); - cudf::experimental::prefetch::detail::prefetch("gather", chars, stream); + cudf::prefetch::detail::prefetch(chars, stream); size_and_exec_fn.d_chars = chars.data(); // Execute the function fn again to fill in the chars data. diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp index 3384181fc37..ce7892b4460 100644 --- a/cpp/include/cudf/utilities/prefetch.hpp +++ b/cpp/include/cudf/utilities/prefetch.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,73 +20,26 @@ #include -#include -#include -#include -#include +#include namespace CUDF_EXPORT cudf { -namespace experimental::prefetch { +namespace prefetch { namespace detail { -/** - * @brief A singleton class that manages the prefetching configuration. - */ -class prefetch_config { - public: - prefetch_config& operator=(const prefetch_config&) = delete; - prefetch_config(const prefetch_config&) = delete; - - /** - * @brief Get the singleton instance of the prefetching configuration. - * - * @return The singleton instance of the prefetching configuration. - */ - static prefetch_config& instance(); - - /** - * @brief Get the value of a configuration key. - * - * If the key does not exist, a `false` value will be returned. - * - * @param key The configuration key. - * @return The value of the configuration key. - */ - bool get(std::string_view key); - /** - * @brief Set the value of a configuration key. - * - * This is a thread-safe operation. - * - * @param key The configuration key. - * @param value The value to set. - */ - void set(std::string_view key, bool value); - /** - * @brief Enable or disable debug mode. - * - * In debug mode, the pointers being prefetched are printed to stderr. - */ - bool debug{false}; - - private: - prefetch_config() = default; //< Private constructor to enforce singleton pattern - std::map config_values; //< Map of configuration keys to values - std::shared_mutex config_mtx; //< Mutex for thread-safe config access -}; +std::atomic_bool& enabled(); + +std::atomic_bool& debug(); /** * @brief Enable prefetching for a particular structure or algorithm. * - * @param key The key to enable prefetching for. * @param ptr The pointer to prefetch. * @param size The size of the memory region to prefetch. * @param stream The stream to prefetch on. * @param device_id The device to prefetch on. */ -void prefetch(std::string_view key, - void const* ptr, +void prefetch(void const* ptr, std::size_t size, rmm::cuda_stream_view stream, rmm::cuda_device_id device_id = rmm::get_current_cuda_device()); @@ -100,14 +53,12 @@ void prefetch(std::string_view key, * removed once an method for stream-ordered data pointer access is added to * those data structures. * - * @param key The key to enable prefetching for. * @param ptr The pointer to prefetch. * @param size The size of the memory region to prefetch. * @param stream The stream to prefetch on. * @param device_id The device to prefetch on. 
 */
 cudaError_t prefetch_noexcept(
-  std::string_view key,
   void const* ptr,
   std::size_t size,
   rmm::cuda_stream_view stream,
@@ -119,45 +70,47 @@ cudaError_t prefetch_noexcept(
  * @note At present this function does not support stream-ordered execution. Prefetching always
  * occurs on the default stream.
  *
- * @param key The key to enable prefetching for.
  * @param v The device_uvector to prefetch.
  * @param stream The stream to prefetch on.
  * @param device_id The device to prefetch on.
  */
 template <typename T>
-void prefetch(std::string_view key,
-              rmm::device_uvector<T> const& v,
+void prefetch(rmm::device_uvector<T> const& v,
               rmm::cuda_stream_view stream,
               rmm::cuda_device_id device_id = rmm::get_current_cuda_device())
 {
   if (v.is_empty()) { return; }
-  prefetch(key, v.data(), v.size(), stream, device_id);
+  prefetch(v.data(), v.size(), stream, device_id);
 }

 }  // namespace detail

 /**
- * @brief Enable prefetching for a particular structure or algorithm.
+ * @brief Enable prefetching.
  *
- * @param key The key to enable prefetching for.
+ * Prefetching of managed memory in cudf currently always synchronizes on the
+ * default stream and is not compatible with multi-stream applications.
  */
-void enable_prefetching(std::string_view key);
+void enable() noexcept;

 /**
- * @brief Disable prefetching for a particular structure or algorithm.
- *
- * @param key The key to disable prefetching for.
+ * @brief Disable prefetching.
  */
-void disable_prefetching(std::string_view key);
+void disable() noexcept;

 /**
- * @brief Enable or disable debug mode.
+ * @brief Enable debug mode for prefetching.
  *
  * In debug mode, the pointers being prefetched are printed to stderr.
+ */
+void enable_debugging() noexcept;
+
+/**
+ * @brief Disable debug mode for prefetching.
  *
- * @param enable Whether to enable or disable debug mode.
+ * In debug mode, the pointers being prefetched are printed to stderr.
  */
-void prefetch_debugging(bool enable);
+void disable_debugging() noexcept;

-}  // namespace experimental::prefetch
+}  // namespace prefetch
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp
index b54689441be..f87fcce76fd 100644
--- a/cpp/src/column/column_view.cpp
+++ b/cpp/src/column/column_view.cpp
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include

 #include

@@ -37,21 +38,20 @@ namespace {
 template <typename ColumnView>
 void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view key) noexcept
 {
-  if (cudf::experimental::prefetch::detail::prefetch_config::instance().get(key)) {
+  if (cudf::prefetch::detail::enabled()) {
     if (col.type().id() == cudf::type_id::EMPTY) {
       // Skip prefetching for empty columns
       return;
     } else if (cudf::is_fixed_width(col.type())) {
-      cudf::experimental::prefetch::detail::prefetch_noexcept(
-        key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream());
+      cudf::prefetch::detail::prefetch_noexcept(
+        data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream());
     } else if (col.type().id() == type_id::STRING) {
       strings_column_view const scv{col};
       if (data_ptr == nullptr) {
         // Do not call chars_size if the data_ptr is nullptr.
return; } - cudf::experimental::prefetch::detail::prefetch_noexcept( - key, + cudf::prefetch::detail::prefetch_noexcept( data_ptr, scv.chars_size(cudf::get_default_stream()) * sizeof(char), cudf::get_default_stream()); diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 5a211b91466..8cf99c5218d 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -303,8 +304,8 @@ probe_join_hash_table( auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); - cudf::experimental::prefetch::detail::prefetch("hash_join", *left_indices, stream); - cudf::experimental::prefetch::detail::prefetch("hash_join", *right_indices, stream); + cudf::prefetch::detail::prefetch(*left_indices, stream); + cudf::prefetch::detail::prefetch(*right_indices, stream); auto const probe_table_num_rows = probe_table.num_rows(); auto const out_probe_begin = diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp index be28d54214e..a2aec9abda0 100644 --- a/cpp/src/utilities/prefetch.cpp +++ b/cpp/src/utilities/prefetch.cpp @@ -19,80 +19,66 @@ #include +#include #include -namespace cudf::experimental::prefetch { +namespace cudf::prefetch { namespace detail { -prefetch_config& prefetch_config::instance() +std::atomic_bool& enabled() { - static prefetch_config instance; - return instance; + static std::atomic_bool value; + return value; } -bool prefetch_config::get(std::string_view key) +std::atomic_bool& debug() { - std::shared_lock const lock(config_mtx); - auto const it = config_values.find(key.data()); - return it == config_values.end() ? false : it->second; // default to not prefetching + static std::atomic_bool value; + return value; } -void prefetch_config::set(std::string_view key, bool value) -{ - std::lock_guard const lock(config_mtx); - config_values[key.data()] = value; -} - -cudaError_t prefetch_noexcept(std::string_view key, - void const* ptr, +cudaError_t prefetch_noexcept(void const* ptr, std::size_t size, rmm::cuda_stream_view stream, rmm::cuda_device_id device_id) noexcept { + if (!detail::enabled()) { return cudaSuccess; } + // Don't try to prefetch nullptrs or empty data. Sometimes libcudf has column // views that use nullptrs with a nonzero size as an optimization. if (ptr == nullptr) { - if (prefetch_config::instance().debug) { - std::cerr << "Skipping prefetch of nullptr" << std::endl; - } + if (detail::debug()) { std::cerr << "Skipping prefetch of nullptr" << std::endl; } return cudaSuccess; } if (size == 0) { - if (prefetch_config::instance().debug) { - std::cerr << "Skipping prefetch of size 0" << std::endl; - } + if (detail::debug()) { std::cerr << "Skipping prefetch of size 0" << std::endl; } return cudaSuccess; } - if (prefetch_config::instance().get(key)) { - if (prefetch_config::instance().debug) { - std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr - << std::endl; - } + if (detail::debug()) { + std::cerr << "Prefetching " << size << " bytes at location " << ptr << std::endl; + } #if defined(CUDART_VERSION) && CUDART_VERSION >= 13000 - cudaMemLocation location{ - (device_id.value() == cudaCpuDeviceId) ? cudaMemLocationTypeHost : cudaMemLocationTypeDevice, - device_id.value()}; - constexpr int flags = 0; - auto result = cudaMemPrefetchAsync(ptr, size, location, flags, stream.value()); + cudaMemLocation location{ + (device_id.value() == cudaCpuDeviceId) ? 
cudaMemLocationTypeHost : cudaMemLocationTypeDevice, + device_id.value()}; + constexpr int flags = 0; + auto result = cudaMemPrefetchAsync(ptr, size, location, flags, stream.value()); #else - auto result = cudaMemPrefetchAsync(ptr, size, device_id.value(), stream.value()); + auto result = cudaMemPrefetchAsync(ptr, size, device_id.value(), stream.value()); #endif - // Need to flush the CUDA error so that the context is not corrupted. - if (result == cudaErrorInvalidValue) { cudaGetLastError(); } - return result; - } - return cudaSuccess; + // Need to flush the CUDA error so that the context is not corrupted. + if (result == cudaErrorInvalidValue) { cudaGetLastError(); } + return result; } -void prefetch(std::string_view key, - void const* ptr, +void prefetch(void const* ptr, std::size_t size, rmm::cuda_stream_view stream, rmm::cuda_device_id device_id) { - auto result = prefetch_noexcept(key, ptr, size, stream, device_id); + auto result = prefetch_noexcept(ptr, size, stream, device_id); // Ignore cudaErrorInvalidValue because that will be raised if prefetching is // attempted on unmanaged memory. if ((result != cudaErrorInvalidValue) && (result != cudaSuccess)) { @@ -103,15 +89,11 @@ void prefetch(std::string_view key, } // namespace detail -void enable_prefetching(std::string_view key) -{ - detail::prefetch_config::instance().set(key, true); -} +void enable() noexcept { detail::enabled() = true; } -void disable_prefetching(std::string_view key) -{ - detail::prefetch_config::instance().set(key, false); -} +void disable() noexcept { detail::enabled() = false; } + +void enable_debugging() noexcept { detail::debug() = true; } -void prefetch_debugging(bool enable) { detail::prefetch_config::instance().debug = enable; } -} // namespace cudf::experimental::prefetch +void disable_debugging() noexcept { detail::debug() = false; } +} // namespace cudf::prefetch diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index e71564cbcc3..70cbbb19830 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -28,19 +28,6 @@ LOADED = False -_SUPPORTED_PREFETCHES = { - "column_view::get_data", - "mutable_column_view::get_data", - "gather", - "hash_join", -} - - -def _enable_managed_prefetching(rmm_mode, managed_memory_is_supported): - if managed_memory_is_supported and "managed" in rmm_mode: - for key in _SUPPORTED_PREFETCHES: - pylibcudf.experimental.enable_prefetching(key) - def install(): """Enable Pandas Accelerator Mode.""" @@ -69,11 +56,6 @@ def install(): if rmm_mode is None: rmm_mode = "managed_pool" if managed_memory_is_supported else "pool" - if "managed" in rmm_mode and not managed_memory_is_supported: - raise ValueError( - f"Managed memory is not supported on this system, so the requested {rmm_mode=} is invalid." 
- ) - # Check if a non-default memory resource is set current_mr = rmm.mr.get_current_device_resource() if not isinstance(current_mr, rmm.mr.CudaMemoryResource): @@ -97,22 +79,31 @@ def install(): ) elif rmm_mode == "async": new_mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory) - elif rmm_mode == "managed": - new_mr = rmm.mr.PrefetchResourceAdaptor(rmm.mr.ManagedMemoryResource()) - elif rmm_mode == "managed_pool": - new_mr = rmm.mr.PrefetchResourceAdaptor( - rmm.mr.PoolMemoryResource( - rmm.mr.ManagedMemoryResource(), - initial_pool_size=free_memory, + elif "managed" in rmm_mode: + if not managed_memory_is_supported: + raise ValueError( + "Managed memory is not supported on this system, so the " + f"requested {rmm_mode=} is invalid." ) - ) + if rmm_mode == "managed": + new_mr = rmm.mr.PrefetchResourceAdaptor( + rmm.mr.ManagedMemoryResource() + ) + elif rmm_mode == "managed_pool": + new_mr = rmm.mr.PrefetchResourceAdaptor( + rmm.mr.PoolMemoryResource( + rmm.mr.ManagedMemoryResource(), + initial_pool_size=free_memory, + ) + ) + else: + raise ValueError(f"Unsupported {rmm_mode=}") + pylibcudf.prefetch.enable() elif rmm_mode != "cuda": raise ValueError(f"Unsupported {rmm_mode=}") rmm.mr.set_current_device_resource(new_mr) - _enable_managed_prefetching(rmm_mode, managed_memory_is_supported) - def pytest_load_initial_conftests(early_config, parser, args): # We need to install ourselves before conftest.py import (which diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 45518cd3cf2..a3418311630 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -40,14 +40,6 @@ __all__: list[str] = ["execute_with_cudf"] -_SUPPORTED_PREFETCHES = { - "column_view::get_data", - "mutable_column_view::get_data", - "gather", - "hash_join", -} - - @cache def default_memory_resource( device: int, @@ -80,8 +72,7 @@ def default_memory_resource( # Leaving a 20% headroom to avoid OOM errors. free_memory, _ = rmm.mr.available_device_memory() free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) - for key in _SUPPORTED_PREFETCHES: - pylibcudf.experimental.enable_prefetching(key) + pylibcudf.prefetch.enable() mr = rmm.mr.PrefetchResourceAdaptor( rmm.mr.PoolMemoryResource( rmm.mr.ManagedMemoryResource(), diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index f4f707b45e4..efd989496d6 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -21,7 +21,6 @@ set(cython_sources concatenate.pyx copying.pyx datetime.pyx - experimental.pyx expressions.pyx filling.pyx gpumemoryview.pyx @@ -36,6 +35,7 @@ set(cython_sources merge.pyx null_mask.pyx partitioning.pyx + prefetch.pyx quantiles.pyx reduce.pyx replace.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index decbe9282ec..2739e14d4a3 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -9,7 +9,6 @@ from . cimport ( contiguous_split, copying, datetime, - experimental, expressions, filling, groupby, @@ -23,6 +22,7 @@ from . 
cimport ( null_mask, nvtext, partitioning, + prefetch, quantiles, reduce, replace, @@ -60,7 +60,6 @@ __all__ = [ "concatenate", "copying", "datetime", - "experimental", "expressions", "filling", "gpumemoryview", @@ -72,6 +71,7 @@ __all__ = [ "lists", "merge", "null_mask", + "prefetch", "partitioning", "quantiles", "reduce", diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index b819c0abae2..09dcfacca8c 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -18,7 +18,6 @@ contiguous_split, copying, datetime, - experimental, expressions, filling, groupby, @@ -33,6 +32,7 @@ null_mask, nvtext, partitioning, + prefetch, quantiles, reduce, replace, @@ -70,7 +70,6 @@ "contiguous_split", "copying", "datetime", - "experimental", "expressions", "filling", "gpumemoryview", @@ -86,6 +85,7 @@ "null_mask", "nvtext", "partitioning", + "prefetch", "quantiles", "reduce", "replace", diff --git a/python/pylibcudf/pylibcudf/experimental.pxd b/python/pylibcudf/pylibcudf/experimental.pxd deleted file mode 100644 index 107c91c8365..00000000000 --- a/python/pylibcudf/pylibcudf/experimental.pxd +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from libcpp cimport bool - - -cpdef enable_prefetching(str key) - -cpdef disable_prefetching(str key) - -cpdef prefetch_debugging(bool enable) diff --git a/python/pylibcudf/pylibcudf/experimental.pyi b/python/pylibcudf/pylibcudf/experimental.pyi deleted file mode 100644 index bbfb86b0ff6..00000000000 --- a/python/pylibcudf/pylibcudf/experimental.pyi +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -def enable_prefetching(key: str) -> None: ... -def disable_prefetching(key: str) -> None: ... -def prefetch_debugging(enable: bool) -> None: ... diff --git a/python/pylibcudf/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx deleted file mode 100644 index d94d6d087ac..00000000000 --- a/python/pylibcudf/pylibcudf/experimental.pyx +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from libcpp cimport bool -from libcpp.string cimport string -from pylibcudf.libcudf cimport experimental as cpp_experimental - - -__all__ = ["disable_prefetching", "enable_prefetching", "prefetch_debugging"] - -cpdef enable_prefetching(str key): - """Turn on prefetch instructions for the given key. - - Parameters - ---------- - key : str - The key to enable prefetching for. - """ - cdef string c_key = key.encode("utf-8") - cpp_experimental.enable_prefetching(c_key) - - -cpdef disable_prefetching(str key): - """Turn off prefetch instructions for the given key. - - Parameters - ---------- - key : str - The key to disable prefetching for. - """ - cdef string c_key = key.encode("utf-8") - cpp_experimental.disable_prefetching(c_key) - - -cpdef prefetch_debugging(bool enable): - """Enable or disable prefetch debugging. - - When enabled, any prefetch instructions will be logged to the console. - - Parameters - ---------- - enable : bool - Whether to enable or disable prefetch debugging. - """ - cpp_experimental.prefetch_debugging(enable) diff --git a/python/pylibcudf/pylibcudf/libcudf/experimental.pxd b/python/pylibcudf/pylibcudf/libcudf/experimental.pxd deleted file mode 100644 index 764815fba36..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/experimental.pxd +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-from libcpp cimport bool
-from libcpp.string cimport string
-from pylibcudf.exception_handler cimport libcudf_exception_handler
-
-
-cdef extern from "cudf/utilities/prefetch.hpp" \
-    namespace "cudf::experimental::prefetch" nogil:
-    # Not technically the right signature, but it's good enough to let Cython
-    # generate valid C++ code. It just means we'll be copying a host string
-    # extra, but that's OK. If we care we could generate string_view bindings,
-    # but there's no real rush so if we go that route we might as well
-    # contribute them upstream to Cython itself.
-    void enable_prefetching(string key)
-    void disable_prefetching(string key)
-    void prefetch_debugging(bool enable)
diff --git a/python/pylibcudf/pylibcudf/libcudf/prefetch.pxd b/python/pylibcudf/pylibcudf/libcudf/prefetch.pxd
new file mode 100644
index 00000000000..1b7dc36444a
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/libcudf/prefetch.pxd
@@ -0,0 +1,8 @@
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
+
+
+cdef extern from "cudf/utilities/prefetch.hpp" namespace "cudf::prefetch" nogil:
+    void enable() noexcept
+    void disable() noexcept
+    void enable_debugging() noexcept
+    void disable_debugging() noexcept
diff --git a/python/pylibcudf/pylibcudf/prefetch.pxd b/python/pylibcudf/pylibcudf/prefetch.pxd
new file mode 100644
index 00000000000..36632a4ed02
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/prefetch.pxd
@@ -0,0 +1,9 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+
+cpdef enable()
+
+cpdef disable()
+
+cpdef enable_debugging()
+
+cpdef disable_debugging()
diff --git a/python/pylibcudf/pylibcudf/prefetch.pyi b/python/pylibcudf/pylibcudf/prefetch.pyi
new file mode 100644
index 00000000000..d949e001d71
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/prefetch.pyi
@@ -0,0 +1,6 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+def enable() -> None: ...
+def disable() -> None: ...
+def enable_debugging() -> None: ...
+def disable_debugging() -> None: ...
diff --git a/python/pylibcudf/pylibcudf/prefetch.pyx b/python/pylibcudf/pylibcudf/prefetch.pyx
new file mode 100644
index 00000000000..8d251d8c96d
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/prefetch.pyx
@@ -0,0 +1,25 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+
+from pylibcudf.libcudf cimport prefetch as cpp_prefetch
+
+
+__all__ = ["disable", "disable_debugging", "enable", "enable_debugging"]
+
+cpdef enable():
+    """Turn on prefetching of managed memory."""
+    cpp_prefetch.enable()
+
+
+cpdef disable():
+    """Turn off prefetching of managed memory."""
+    cpp_prefetch.disable()
+
+
+cpdef enable_debugging():
+    """Enable prefetch debugging."""
+    cpp_prefetch.enable_debugging()
+
+
+cpdef disable_debugging():
+    """Disable prefetch debugging."""
+    cpp_prefetch.disable_debugging()

From cd173ff120789c2206a0409ba5375533540a9b63 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Thu, 4 Sep 2025 12:55:04 -0700
Subject: [PATCH 259/366] Fix a decompression parameter in the chunked ORC reader (#19882)

The chunked ORC reader passes zero for the maximum uncompressed block size
to the `decompress` call; the `max_uncompressed_block_size` in the
`compinfo` array elements is never set.
In the non-chunked case, `parse_compressed_stripe_data` is called in
`decompress_stripe_data`, so this information is correct. In the chunked
case, the `compinfo` array is re-created from `compinfo_map`, which does
not have `max_uncompressed_block_size`.
This PR fixes this by including `max_uncompressed_block_size` in `compinfo_map` and propagating it from `load_next_stripe_data` to `decompress_stripe_data` via this map. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/19882 --- cpp/src/io/comp/nvcomp_adapter.cpp | 10 ++++++++++ cpp/src/io/orc/reader_impl_chunking.cu | 3 ++- cpp/src/io/orc/reader_impl_chunking.hpp | 1 + cpp/src/io/orc/reader_impl_decode.cu | 10 +++++----- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 5dd78515bcf..ae7eda0711e 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -715,6 +715,12 @@ void batched_decompress(compression_type compression, size_t max_total_uncomp_size, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(inputs.size() > 0, "inputs must be non-empty"); + CUDF_EXPECTS(inputs.size() == outputs.size(), "inputs and outputs must have the same size"); + CUDF_EXPECTS(inputs.size() == results.size(), "inputs and results must have the same size"); + CUDF_EXPECTS(max_total_uncomp_size > 0, "max_total_uncomp_size must be greater than 0"); + CUDF_EXPECTS(max_uncomp_chunk_size > 0, "max_uncomp_chunk_size must be greater than 0"); + auto const num_chunks = inputs.size(); // cuDF inflate inputs converted to nvcomp inputs @@ -837,6 +843,10 @@ void batched_compress(compression_type compression, device_span results, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(inputs.size() > 0, "inputs must be non-empty"); + CUDF_EXPECTS(inputs.size() == outputs.size(), "inputs and outputs must have the same size"); + CUDF_EXPECTS(inputs.size() == results.size(), "inputs and results must have the same size"); + auto const num_chunks = inputs.size(); auto nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 092fa33c95c..d0e3c862656 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -690,7 +690,8 @@ void reader_impl::load_next_stripe_data(read_mode mode) // Cache these parsed numbers so they can be reused in the decompression/decoding step. 
compinfo_map[info.source] = {stream_compinfo.num_compressed_blocks, stream_compinfo.num_uncompressed_blocks, - stream_compinfo.max_uncompressed_size}; + stream_compinfo.max_uncompressed_size, + stream_compinfo.max_uncompressed_block_size}; stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes += stream_compinfo.max_uncompressed_size; } diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp index 3aa0344bf21..54ed6e041da 100644 --- a/cpp/src/io/orc/reader_impl_chunking.hpp +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -74,6 +74,7 @@ struct stripe_level_comp_info { std::size_t num_compressed_blocks{0}; std::size_t num_uncompressed_blocks{0}; std::size_t total_decomp_size{0}; + std::size_t max_uncompressed_block_size{0}; }; /** diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 6909a84e903..cfd68fa9d04 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -106,12 +106,12 @@ rmm::device_buffer decompress_stripe_data( stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) + info.dst_pos, info.length); - if (compinfo_ready) { - auto const& cached_comp_info = compinfo_map.at(info.source); - stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; - stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; - stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; + auto const& cached_comp_info = compinfo_map.at(info.source); + stream_comp_info.num_compressed_blocks = cached_comp_info.num_compressed_blocks; + stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks; + stream_comp_info.max_uncompressed_size = cached_comp_info.total_decomp_size; + stream_comp_info.max_uncompressed_block_size = cached_comp_info.max_uncompressed_block_size; num_compressed_blocks += cached_comp_info.num_compressed_blocks; num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks; From 5e59566b4a22e5d5df3e88bf8d3d2ae81254ce84 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 4 Sep 2025 16:59:26 -0700 Subject: [PATCH 260/366] Move row operators to detail and deprecate legacy (#19849) Related to #12593 and #19191 This PR deprecates the legacy row operator and moves the experimental and primitive row operators into the `detail` namespace. To avoid breaking changes, the code is temporarily duplicated: primitive operators are copied to `detail/row_operator/primitive_row_operators.cuh`, and experimental ones to `detail/row_operator/row_operators.cuh`. An attempt to wrap the detail APIs instead of duplicating code was not feasible due to nested namespaces and static functions. There are no functional modifications; only namespace reorganizations and header inclusion updates. 
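
For callers inside libcudf the change is purely a namespace rename; a minimal
sketch of the before/after (the `preprocessed_table::create` factory shown
here is assumed unchanged from the existing row-operator API, and the wrapper
function is illustrative):

```cpp
#include <cudf/detail/row_operator/row_operators.cuh>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>

// Before this series: cudf::experimental::row::equality::preprocessed_table
auto preprocess_for_equality(cudf::table_view const& input, rmm::cuda_stream_view stream)
{
  // Same type and factory, now under cudf::detail::row
  return cudf::detail::row::equality::preprocessed_table::create(input, stream);
}
```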
Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/19849
---
 cpp/CMakeLists.txt                            |    2 +
 .../cudf/detail/join/distinct_hash_join.cuh   |    8 +-
 cpp/include/cudf/detail/join/hash_join.cuh    |    4 +-
 .../cudf/detail/row_operator/common_utils.cuh |  151 ++
 .../row_operator/primitive_row_operators.cuh  |  287 +++
 .../detail/row_operator/row_operators.cuh     | 2064 +++++++++++++++++
 .../cudf/table/experimental/row_operators.cuh |   66 +
 .../cudf/table/primitive_row_operators.cuh    |   27 +
 cpp/include/cudf/table/row_operators.cuh      |  187 +-
 cpp/src/binaryop/compiled/binary_ops.cu       |   10 +-
 .../binaryop/compiled/struct_binary_ops.cuh   |   33 +-
 cpp/src/groupby/hash/groupby.cu               |    8 +-
 cpp/src/groupby/hash/helpers.cuh              |   15 +-
 cpp/src/groupby/sort/group_nunique.cu         |    8 +-
 cpp/src/groupby/sort/group_rank_scan.cu       |    5 +-
 cpp/src/groupby/sort/sort_helper.cu           |    4 +-
 cpp/src/hash/murmurhash3_x86_32.cu            |    6 +-
 cpp/src/io/orc/dict_enc.cu                    |    4 +-
 cpp/src/io/parquet/chunk_dict.cu              |    5 +-
 cpp/src/join/distinct_hash_join.cu            |   52 +-
 cpp/src/join/hash_join.cu                     |  105 +-
 cpp/src/join/join_common_utils.cuh            |   14 +-
 cpp/src/join/mixed_join.cu                    |   26 +-
 cpp/src/join/mixed_join_common_utils.cuh      |   23 +-
 cpp/src/join/mixed_join_semi.cu               |   19 +-
 cpp/src/join/sort_merge_join.cu               |   10 +-
 cpp/src/lists/contains.cu                     |    9 +-
 cpp/src/merge/merge.cu                        |    6 +-
 cpp/src/partitioning/partitioning.cu          |    4 +-
 cpp/src/reductions/histogram.cu               |   20 +-
 .../reductions/nested_type_minmax_util.cuh    |   12 +-
 cpp/src/reductions/scan/rank_scan.cu          |    6 +-
 cpp/src/reductions/segmented/nunique.cu       |    9 +-
 .../row_operator/primitive_row_operators.cu   |   29 +
 cpp/src/row_operator/row_operators.cu         |  879 +++++++
 cpp/src/search/contains_scalar.cu             |   10 +-
 cpp/src/search/contains_table.cu              |   31 +-
 cpp/src/search/contains_table_impl.cu         |   29 +-
 cpp/src/search/contains_table_impl.cuh        |   35 +-
 cpp/src/search/contains_table_impl_nested.cu  |   27 +-
 .../search/contains_table_impl_primitive.cu   |   10 +-
 cpp/src/search/search_ordered.cu              |   10 +-
 cpp/src/sort/is_sorted.cu                     |    6 +-
 cpp/src/sort/rank.cu                          |    4 +-
 cpp/src/sort/sort_column_impl.cuh             |    2 +-
 cpp/src/sort/sort_impl.cuh                    |    4 +-
 cpp/src/stream_compaction/distinct.cu         |   22 +-
 cpp/src/stream_compaction/distinct_count.cu   |   11 +-
 cpp/src/stream_compaction/distinct_helpers.cu |   16 +-
 .../stream_compaction/distinct_helpers.hpp    |   10 +-
 cpp/src/stream_compaction/unique.cu           |    4 +-
 cpp/src/stream_compaction/unique_count.cu     |    6 +-
 .../stream_compaction/unique_count_column.cu  |    2 +-
 cpp/src/transform/one_hot_encode.cu           |   15 +-
 .../table/experimental_row_operator_tests.cu  |   71 +-
 .../table/row_operator_tests_utilities.cu     |   25 +-
 .../table/row_operator_tests_utilities.hpp    |   15 +-
 .../table/row_operator_tests_utilities2.cu    |    4 +-
 cpp/tests/utilities/column_utilities.cu       |   12 +-
 59 files changed, 3977 insertions(+), 521 deletions(-)
 create mode 100644 cpp/include/cudf/detail/row_operator/common_utils.cuh
 create mode 100644 cpp/include/cudf/detail/row_operator/primitive_row_operators.cuh
 create mode 100644 cpp/include/cudf/detail/row_operator/row_operators.cuh
 create mode 100644 cpp/src/row_operator/primitive_row_operators.cu
 create mode 100644 cpp/src/row_operator/row_operators.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5eb59323caa..28b35d4cbb6 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -683,6 +683,8 @@ add_library(
   src/rolling/range_window_bounds.cpp
   src/rolling/rolling.cpp
   src/round/round.cu
+  src/row_operator/primitive_row_operators.cu
+  src/row_operator/row_operators.cu
   src/runtime/context.cpp
   src/scalar/scalar.cpp
   src/scalar/scalar_factories.cpp
diff --git a/cpp/include/cudf/detail/join/distinct_hash_join.cuh b/cpp/include/cudf/detail/join/distinct_hash_join.cuh
index 3da903dc415..a50675c156e 100644
--- a/cpp/include/cudf/detail/join/distinct_hash_join.cuh
+++ b/cpp/include/cudf/detail/join/distinct_hash_join.cuh
@@ -15,8 +15,8 @@
  */
 #pragma once

+#include <cudf/detail/row_operator/row_operators.cuh>
 #include
-#include <cudf/table/experimental/row_operators.cuh>
 #include
 #include
@@ -32,8 +32,8 @@
 namespace cudf::detail {

-using cudf::experimental::row::lhs_index_type;
-using cudf::experimental::row::rhs_index_type;
+using cudf::detail::row::lhs_index_type;
+using cudf::detail::row::rhs_index_type;

 /**
  * @brief A custom comparator used for the build table insertion
@@ -170,7 +170,7 @@ class distinct_hash_join {
   bool _has_nested_columns;  ///< True if nested columns are present in build and probe tables
   cudf::null_equality _nulls_equal;  ///< Whether to consider nulls as equal
   cudf::table_view _build;           ///< Input table to build the hash map
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table>
+  std::shared_ptr<cudf::detail::row::equality::preprocessed_table>
     _preprocessed_build;  ///< Input table preprocessed for row operators
   hash_table_type _hash_table;  ///< Hash table built on `_build`
 };
diff --git a/cpp/include/cudf/detail/join/hash_join.cuh b/cpp/include/cudf/detail/join/hash_join.cuh
index 4bff2d0b7ed..99f11126999 100644
--- a/cpp/include/cudf/detail/join/hash_join.cuh
+++ b/cpp/include/cudf/detail/join/hash_join.cuh
@@ -36,7 +36,7 @@
 #include

 // Forward declaration
-namespace cudf::experimental::row::equality {
+namespace cudf::detail::row::equality {
 class preprocessed_table;
 }
@@ -108,7 +108,7 @@ struct hash_join {
   bool const _has_nulls;  ///< true if nulls are present in either build table or any probe table
   cudf::null_equality const _nulls_equal;  ///< whether to consider nulls as equal
   cudf::table_view _build;                 ///< input table to build the hash map
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table>
+  std::shared_ptr<cudf::detail::row::equality::preprocessed_table>
     _preprocessed_build;  ///< input table preprocessed for row operators

   hash_table_t _hash_table;  ///< hash table built on `_build`
diff --git a/cpp/include/cudf/detail/row_operator/common_utils.cuh b/cpp/include/cudf/detail/row_operator/common_utils.cuh
new file mode 100644
index 00000000000..794b2300bf7
--- /dev/null
+++ b/cpp/include/cudf/detail/row_operator/common_utils.cuh
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/types.hpp>
+
+#include <cuda/std/type_traits>
+
+namespace cudf::detail {
+
+/**
+ * @brief Result type of comparison operations.
+ *
+ * Indicates how two elements `a` and `b` compare with one another.
+ *
+ * Equivalence is defined as `not (a<b) and not (b<a)`. Elements that are
+ * EQUIVALENT may not necessarily be *equal*.
+ */
+enum class weak_ordering {
+  LESS,        ///< Indicates `a` is less than (ordered before) `b`
+  EQUIVALENT,  ///< Indicates `a` is ordered neither before nor after `b`
+  GREATER      ///< Indicates `a` is greater than (ordered after) `b`
+};
+
+/**
+ * @brief Compare the elements ordering with respect to `lhs`.
+ *
+ * @param lhs The first element
+ * @param rhs The second element
+ * @return Indicates the relationship between the elements
+ */
+template <typename Element>
+__device__ weak_ordering compare_elements(Element lhs, Element rhs)
+{
+  if (lhs < rhs) {
+    return weak_ordering::LESS;
+  } else if (rhs < lhs) {
+    return weak_ordering::GREATER;
+  }
+  return weak_ordering::EQUIVALENT;
+}
+
+/**
+ * @brief A specialization for floating-point `Element` type relational comparison
+ * to derive the order of the elements with respect to `lhs`.
+ *
+ * This specialization handles `nan` in the following order:
+ * `[-Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN, null] (for null_order::AFTER)`
+ * `[null, -Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN] (for null_order::BEFORE)`
+ *
+ * @param lhs The first element
+ * @param rhs The second element
+ * @return Indicates the relationship between the elements
+ */
+template <typename Element>
+__device__ weak_ordering relational_compare(Element lhs, Element rhs)
+  requires(cuda::std::is_floating_point_v<Element>)
+{
+  if (isnan(lhs) and isnan(rhs)) {
+    return weak_ordering::EQUIVALENT;
+  } else if (isnan(rhs)) {
+    return weak_ordering::LESS;
+  } else if (isnan(lhs)) {
+    return weak_ordering::GREATER;
+  }
+
+  return detail::compare_elements(lhs, rhs);
+}
+
+/**
+ * @brief Compare the nulls according to null order.
+ *
+ * @param lhs_is_null boolean representing if lhs is null
+ * @param rhs_is_null boolean representing if rhs is null
+ * @param null_precedence null order
+ * @return Indicates the relationship between null in lhs and rhs columns.
+ */
+inline __device__ auto null_compare(bool lhs_is_null, bool rhs_is_null, null_order null_precedence)
+{
+  if (lhs_is_null and rhs_is_null) {  // null <? null
+    return weak_ordering::EQUIVALENT;
+  } else if (lhs_is_null) {  // null <? x
+    return (null_precedence == null_order::BEFORE) ? weak_ordering::LESS : weak_ordering::GREATER;
+  } else if (rhs_is_null) {  // x <? null
+    return (null_precedence == null_order::AFTER) ? weak_ordering::LESS : weak_ordering::GREATER;
+  }
+  return weak_ordering::EQUIVALENT;
+}
+
+/**
+ * @brief A specialization for non-floating-point `Element` type relational comparison
+ * to derive the order of the elements with respect to `lhs`.
+ *
+ * @param lhs The first element
+ * @param rhs The second element
+ * @return Indicates the relationship between the elements
+ */
+template <typename Element>
+__device__ weak_ordering relational_compare(Element lhs, Element rhs)
+  requires(not cuda::std::is_floating_point_v<Element>)
+{
+  return detail::compare_elements(lhs, rhs);
+}
+
+/**
+ * @brief A specialization for floating-point `Element` type to check if
+ * `lhs` is equivalent to `rhs`. `nan == nan`.
+ *
+ * @param lhs first element
+ * @param rhs second element
+ * @return `true` if `lhs` == `rhs` else `false`.
+ */
+template <typename Element>
+__device__ bool equality_compare(Element lhs, Element rhs)
+  requires(cuda::std::is_floating_point_v<Element>)
+{
+  if (isnan(lhs) and isnan(rhs)) { return true; }
+  return lhs == rhs;
+}
+
+/**
+ * @brief A specialization for non-floating-point `Element` type to check if
+ * `lhs` is equivalent to `rhs`.
+ *
+ * @param lhs first element
+ * @param rhs second element
+ * @return `true` if `lhs` == `rhs` else `false`.
+ */
+template <typename Element>
+__device__ bool equality_compare(Element const lhs, Element const rhs)
+  requires(not cuda::std::is_floating_point_v<Element>)
+{
+  return lhs == rhs;
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/row_operator/primitive_row_operators.cuh b/cpp/include/cudf/detail/row_operator/primitive_row_operators.cuh
new file mode 100644
index 00000000000..4743e9fc645
--- /dev/null
+++ b/cpp/include/cudf/detail/row_operator/primitive_row_operators.cuh
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+namespace CUDF_EXPORT cudf {
+
+namespace detail {
+
+/**
+ * @brief Checks if a table is compatible with primitive row operations
+ *
+ * A table is compatible with primitive row operations if it contains exactly one column
+ * and that column contains only numeric data types.
+ *
+ * @param table The table to check for compatibility
+ * @return Boolean indicating if the table is compatible with primitive row operations
+ */
+bool is_primitive_row_op_compatible(cudf::table_view const& table);
+
+namespace row::primitive {
+
+/**
+ * @brief Returns `void` if it's not a primitive type
+ */
+template <typename T>
+using primitive_type_t = cuda::std::conditional_t<cudf::is_numeric<T>(), T, void>;
+
+/**
+ * @brief Custom dispatcher for primitive types
+ */
+template <cudf::type_id Id>
+struct dispatch_primitive_type {
+  using type = primitive_type_t<id_to_type<Id>>;  ///< The underlying type
+};
+
+/**
+ * @brief Performs an equality comparison between two elements in two columns.
+ */
+class element_equality_comparator {
+ public:
+  /**
+   * @brief Compares the specified elements for equality.
+   *
+   * @param lhs The first column
+   * @param rhs The second column
+   * @param lhs_element_index The index of the first element
+   * @param rhs_element_index The index of the second element
+   * @return True if lhs and rhs element are equal
+   */
+  template <typename Element, CUDF_ENABLE_IF(cudf::is_equality_comparable<Element, Element>())>
+  __device__ bool operator()(column_device_view const& lhs,
+                             column_device_view const& rhs,
+                             size_type lhs_element_index,
+                             size_type rhs_element_index) const
+  {
+    return cudf::detail::equality_compare(lhs.element<Element>(lhs_element_index),
+                                          rhs.element<Element>(rhs_element_index));
+  }
+
+  // @cond
+  template <typename Element, CUDF_ENABLE_IF(not cudf::is_equality_comparable<Element, Element>())>
+  __device__ bool operator()(column_device_view const&,
+                             column_device_view const&,
+                             size_type,
+                             size_type) const
+  {
+    CUDF_UNREACHABLE("Attempted to compare elements of uncomparable types.");
+  }
+  // @endcond
+};
+
+/**
+ * @brief Performs an equality comparison between two rows in two tables.
+ */
+class row_equality_comparator {
+ public:
+  /**
+   * @brief Construct a new row equality comparator object
+   *
+   * @param has_nulls Indicates if either input column contains nulls
+   * @param lhs Preprocessed table containing the first element
+   * @param rhs Preprocessed table containing the second element (may be the same as lhs)
+   * @param nulls_are_equal Indicates if two null elements are treated as equivalent
+   */
+  row_equality_comparator(cudf::nullate::DYNAMIC const& has_nulls,
+                          std::shared_ptr<cudf::detail::row::equality::preprocessed_table> lhs,
+                          std::shared_ptr<cudf::detail::row::equality::preprocessed_table> rhs,
+                          null_equality nulls_are_equal)
+    : _has_nulls{has_nulls}, _lhs{*lhs}, _rhs{*rhs}, _nulls_are_equal{nulls_are_equal}
+  {
+    CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns.");
+  }
+
+  /**
+   * @brief Compares the specified rows for equality.
+   *
+   * @param lhs_row_index The index of the first row to compare (in the lhs table)
+   * @param rhs_row_index The index of the second row to compare (in the rhs table)
+   * @return true if both rows are equal, otherwise false
+   */
+  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const
+  {
+    auto equal_elements = [this, lhs_row_index, rhs_row_index](column_device_view const& l,
+                                                               column_device_view const& r) {
+      // Handle null comparison for each element
+      if (_has_nulls) {
+        bool const lhs_is_null{l.is_null(lhs_row_index)};
+        bool const rhs_is_null{r.is_null(rhs_row_index)};
+        if (lhs_is_null and rhs_is_null) {
+          return _nulls_are_equal == null_equality::EQUAL;
+        } else if (lhs_is_null != rhs_is_null) {
+          return false;
+        }
+      }
+
+      // Both elements are non-null, compare their values
+      element_equality_comparator comparator;
+      return cudf::type_dispatcher<dispatch_primitive_type>(
+        l.type(), comparator, l, r, lhs_row_index, rhs_row_index);
+    };
+
+    return thrust::equal(thrust::seq, _lhs.begin(), _lhs.end(), _rhs.begin(), equal_elements);
+  }
+
+  /**
+   * @brief Compares the specified rows for equality.
+   *
+   * @param lhs_index The index of the first row to compare (in the lhs table)
+   * @param rhs_index The index of the second row to compare (in the rhs table)
+   * @return Boolean indicating if both rows are equal
+   */
+  __device__ bool operator()(cudf::detail::row::lhs_index_type lhs_index,
+                             cudf::detail::row::rhs_index_type rhs_index) const
+  {
+    return (*this)(static_cast<size_type>(lhs_index), static_cast<size_type>(rhs_index));
+  }
+
+ private:
+  cudf::nullate::DYNAMIC _has_nulls;
+  table_device_view _lhs;
+  table_device_view _rhs;
+  null_equality _nulls_are_equal;
+};
+
+/**
+ * @brief Function object for computing the hash value of a row in a column.
+ *
+ * @tparam Hash Hash functor to use for hashing elements
+ */
+template